summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c10
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S17
-rw-r--r--arch/x86/entry/vdso/Makefile4
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/events/intel/core.c35
-rw-r--r--arch/x86/events/intel/uncore_snbep.c8
-rw-r--r--arch/x86/hyperv/hv_init.c15
-rw-r--r--arch/x86/include/asm/compat.h1
-rw-r--r--arch/x86/include/asm/dma-mapping.h8
-rw-r--r--arch/x86/include/asm/fixmap.h6
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h8
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/pgtable.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h13
-rw-r--r--arch/x86/include/asm/pvclock.h19
-rw-r--r--arch/x86/include/asm/string_32.h9
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/asm/xen/cpuid.h42
-rw-r--r--arch/x86/include/asm/xen/page.h11
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/acpi/apei.c5
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c74
-rw-r--r--arch/x86/kernel/cpu/cpu.h3
-rw-r--r--arch/x86/kernel/cpu/intel.c15
-rw-r--r--arch/x86/kernel/cpu/proc.c6
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/kvmclock.c7
-rw-r--r--arch/x86/kernel/pvclock.c14
-rw-r--r--arch/x86/kernel/traps.c5
-rw-r--r--arch/x86/kvm/emulate.c9
-rw-r--r--arch/x86/kvm/lapic.c91
-rw-r--r--arch/x86/kvm/mmu.c115
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h18
-rw-r--r--arch/x86/kvm/svm.c248
-rw-r--r--arch/x86/kvm/vmx.c369
-rw-r--r--arch/x86/kvm/x86.c94
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c6
-rw-r--r--arch/x86/mm/init.c8
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c227
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c658
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c70
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c173
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h18
-rw-r--r--arch/x86/mm/pageattr.c10
-rw-r--r--arch/x86/mm/pgtable.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c2
-rw-r--r--arch/x86/pci/fixup.c85
-rw-r--r--arch/x86/pci/intel_mid_pci.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c2
-rw-r--r--arch/x86/xen/grant-table.c60
-rw-r--r--arch/x86/xen/mmu.c14
-rw-r--r--arch/x86/xen/mmu_pv.c4
-rw-r--r--arch/x86/xen/suspend.c4
-rw-r--r--arch/x86/xen/time.c99
-rw-r--r--arch/x86/xen/xen-ops.h2
72 files changed, 1118 insertions, 1878 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 09dcc94c4484..8eed3f94bfc7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -112,7 +112,6 @@ config X86
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if X86_64
select HAVE_ARCH_KGDB
- select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
@@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT
config X86_DIRECT_GBPAGES
def_bool y
- depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+ depends on X86_64 && !DEBUG_PAGEALLOC
---help---
Certain kernel features effectively disable kernel
linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a20eacd9c7e9..3e73bc255e4e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32
endif
export CONFIG_X86_X32_ABI
-# Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
- KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-endif
-
#
# If the function graph tracer is used with mcount instead of fentry,
# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 5c15d6b57329..3bf3dcf29825 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -28,6 +28,7 @@
#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <crypto/b128ops.h>
+#include <crypto/gcm.h>
#include <crypto/xts.h>
#include <asm/cpu_device_id.h>
#include <asm/fpu/api.h>
@@ -1067,9 +1068,10 @@ static struct skcipher_alg aesni_skciphers[] = {
}
};
+static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
-struct {
+static struct {
const char *algname;
const char *drvname;
const char *basename;
@@ -1131,7 +1133,7 @@ static struct aead_alg aesni_aead_algs[] = { {
.setauthsize = common_rfc4106_set_authsize,
.encrypt = helper_rfc4106_encrypt,
.decrypt = helper_rfc4106_decrypt,
- .ivsize = 8,
+ .ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "__gcm-aes-aesni",
@@ -1149,7 +1151,7 @@ static struct aead_alg aesni_aead_algs[] = { {
.setauthsize = rfc4106_set_authsize,
.encrypt = rfc4106_encrypt,
.decrypt = rfc4106_decrypt,
- .ivsize = 8,
+ .ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "rfc4106(gcm(aes))",
@@ -1165,7 +1167,7 @@ static struct aead_alg aesni_aead_algs[] = { {
.setauthsize = generic_gcmaes_set_authsize,
.encrypt = generic_gcmaes_encrypt,
.decrypt = generic_gcmaes_decrypt,
- .ivsize = 12,
+ .ivsize = GCM_AES_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "gcm(aes)",
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index f247304299a2..1c099dc08cc3 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -41,6 +41,7 @@
#include <asm/inst.h>
+.section .rodata
.align 16
/*
* [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
@@ -111,19 +112,13 @@ ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
pxor CONSTANT, %xmm1
sub $0x40, LEN
add $0x40, BUF
-#ifndef __x86_64__
- /* This is for position independent code(-fPIC) support for 32bit */
- call delta
-delta:
- pop %ecx
-#endif
cmp $0x40, LEN
jb less_64
#ifdef __x86_64__
movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
- movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
+ movdqa .Lconstant_R2R1, CONSTANT
#endif
loop_64:/* 64 bytes Full cache line folding */
@@ -172,7 +167,7 @@ less_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
movdqa .Lconstant_R4R3(%rip), CONSTANT
#else
- movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT
+ movdqa .Lconstant_R4R3, CONSTANT
#endif
prefetchnta (BUF)
@@ -220,8 +215,8 @@ fold_64:
movdqa .Lconstant_R5(%rip), CONSTANT
movdqa .Lconstant_mask32(%rip), %xmm3
#else
- movdqa .Lconstant_R5 - delta(%ecx), CONSTANT
- movdqa .Lconstant_mask32 - delta(%ecx), %xmm3
+ movdqa .Lconstant_R5, CONSTANT
+ movdqa .Lconstant_mask32, %xmm3
#endif
psrldq $0x04, %xmm2
pand %xmm3, %xmm1
@@ -232,7 +227,7 @@ fold_64:
#ifdef __x86_64__
movdqa .Lconstant_RUpoly(%rip), CONSTANT
#else
- movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT
+ movdqa .Lconstant_RUpoly, CONSTANT
#endif
movdqa %xmm1, %xmm2
pand %xmm3, %xmm1
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index c366c0adeb40..1943aebadede 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -130,10 +130,6 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
-# This makes sure the $(obj) subdirectory exists even though vdso32/
-# is not a kbuild sub-make subdirectory.
-override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
-
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
targets += vdso32/vclock_gettime.o
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index d63053142b16..5b8b556dbb12 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -112,7 +112,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
- pvclock_pvti_cpu0_va();
+ pvclock_get_pvti_cpu0_va();
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
ret = vm_insert_pfn_prot(
vma,
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 43445da30cea..09c26a4f139c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3734,6 +3734,19 @@ EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
static struct attribute *hsw_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_hsw),
+ EVENT_PTR(td_slots_issued),
+ EVENT_PTR(td_slots_retired),
+ EVENT_PTR(td_fetch_bubbles),
+ EVENT_PTR(td_total_slots),
+ EVENT_PTR(td_total_slots_scale),
+ EVENT_PTR(td_recovery_bubbles),
+ EVENT_PTR(td_recovery_bubbles_scale),
+ NULL
+};
+
+static struct attribute *hsw_tsx_events_attrs[] = {
EVENT_PTR(tx_start),
EVENT_PTR(tx_commit),
EVENT_PTR(tx_abort),
@@ -3746,18 +3759,16 @@ static struct attribute *hsw_events_attrs[] = {
EVENT_PTR(el_conflict),
EVENT_PTR(cycles_t),
EVENT_PTR(cycles_ct),
- EVENT_PTR(mem_ld_hsw),
- EVENT_PTR(mem_st_hsw),
- EVENT_PTR(td_slots_issued),
- EVENT_PTR(td_slots_retired),
- EVENT_PTR(td_fetch_bubbles),
- EVENT_PTR(td_total_slots),
- EVENT_PTR(td_total_slots_scale),
- EVENT_PTR(td_recovery_bubbles),
- EVENT_PTR(td_recovery_bubbles_scale),
NULL
};
+static __init struct attribute **get_hsw_events_attrs(void)
+{
+ return boot_cpu_has(X86_FEATURE_RTM) ?
+ merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) :
+ hsw_events_attrs;
+}
+
static ssize_t freeze_on_smi_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
@@ -4186,7 +4197,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.cpu_events = get_hsw_events_attrs();
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
@@ -4225,7 +4236,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.cpu_events = get_hsw_events_attrs();
x86_pmu.limit_period = bdw_limit_period;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
@@ -4283,7 +4294,7 @@ __init int intel_pmu_init(void)
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
extra_attr = merge_attr(extra_attr, skl_format_attr);
- x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.cpu_events = get_hsw_events_attrs();
intel_pmu_pebs_data_source_skl(
boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
pr_cont("Skylake events, ");
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index de8f8625213c..6d8044ab1060 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3035,11 +3035,19 @@ static struct intel_uncore_type *bdx_msr_uncores[] = {
NULL,
};
+/* Bit 7 'Use Occupancy' is not available for counter 0 on BDX */
+static struct event_constraint bdx_uncore_pcu_constraints[] = {
+ EVENT_CONSTRAINT(0x80, 0xe, 0x80),
+ EVENT_CONSTRAINT_END
+};
+
void bdx_uncore_cpu_init(void)
{
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
+
+ hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
}
static struct intel_uncore_type bdx_uncore_ha = {
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index a0b86cf486e0..189a398290db 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -210,9 +210,10 @@ void hyperv_cleanup(void)
}
EXPORT_SYMBOL_GPL(hyperv_cleanup);
-void hyperv_report_panic(struct pt_regs *regs)
+void hyperv_report_panic(struct pt_regs *regs, long err)
{
static bool panic_reported;
+ u64 guest_id;
/*
* We prefer to report panic on 'die' chain as we have proper
@@ -223,11 +224,13 @@ void hyperv_report_panic(struct pt_regs *regs)
return;
panic_reported = true;
- wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip);
- wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax);
- wrmsrl(HV_X64_MSR_CRASH_P2, regs->bx);
- wrmsrl(HV_X64_MSR_CRASH_P3, regs->cx);
- wrmsrl(HV_X64_MSR_CRASH_P4, regs->dx);
+ rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ wrmsrl(HV_X64_MSR_CRASH_P0, err);
+ wrmsrl(HV_X64_MSR_CRASH_P1, guest_id);
+ wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip);
+ wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax);
+ wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp);
/*
* Let Hyper-V know there is crash data available
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index a600a6cda9ec..2cbd75dd2fd3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -210,7 +210,6 @@ typedef struct compat_siginfo {
} compat_siginfo_t;
#define COMPAT_OFF_T_MAX 0x7fffffff
-#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL
struct compat_ipc64_perm {
compat_key_t key;
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 836ca1178a6a..0350d99bb8fd 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -7,7 +7,6 @@
* Documentation/DMA-API.txt for documentation.
*/
-#include <linux/kmemcheck.h>
#include <linux/scatterlist.h>
#include <linux/dma-debug.h>
#include <asm/io.h>
@@ -68,13 +67,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
}
#endif /* CONFIG_X86_DMA_REMAP */
-static inline void
-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
- enum dma_data_direction dir)
-{
- flush_write_buffers();
-}
-
static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
gfp_t gfp)
{
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dcd9fb55e679..b0c505fe9a95 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -104,6 +104,12 @@ enum fixed_addresses {
FIX_GDT_REMAP_BEGIN,
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+#ifdef CONFIG_ACPI_APEI_GHES
+ /* Used for GHES mapping from assorted contexts */
+ FIX_APEI_GHES_IRQ,
+ FIX_APEI_GHES_NMI,
+#endif
+
__end_of_permanent_fixed_addresses,
/*
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
index 945a0337fbcf..ea32a7d3cf1b 100644
--- a/arch/x86/include/asm/kmemcheck.h
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -1,43 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_X86_KMEMCHECK_H
-#define ASM_X86_KMEMCHECK_H
-
-#include <linux/types.h>
-#include <asm/ptrace.h>
-
-#ifdef CONFIG_KMEMCHECK
-bool kmemcheck_active(struct pt_regs *regs);
-
-void kmemcheck_show(struct pt_regs *regs);
-void kmemcheck_hide(struct pt_regs *regs);
-
-bool kmemcheck_fault(struct pt_regs *regs,
- unsigned long address, unsigned long error_code);
-bool kmemcheck_trap(struct pt_regs *regs);
-#else
-static inline bool kmemcheck_active(struct pt_regs *regs)
-{
- return false;
-}
-
-static inline void kmemcheck_show(struct pt_regs *regs)
-{
-}
-
-static inline void kmemcheck_hide(struct pt_regs *regs)
-{
-}
-
-static inline bool kmemcheck_fault(struct pt_regs *regs,
- unsigned long address, unsigned long error_code)
-{
- return false;
-}
-
-static inline bool kmemcheck_trap(struct pt_regs *regs)
-{
- return false;
-}
-#endif /* CONFIG_KMEMCHECK */
-
-#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index ee23a43386a2..034caa1a084e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -226,6 +226,8 @@ struct x86_emulate_ops {
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+ int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d7d856b2d89..1bfb99770c34 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1061,6 +1061,11 @@ struct kvm_x86_ops {
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+ int (*smi_allowed)(struct kvm_vcpu *vcpu);
+ int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+ int (*enable_smi_window)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1426,4 +1431,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#endif
}
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 581bb54dd464..5400add2885b 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -311,7 +311,7 @@ static inline int hv_cpu_number_to_vp_number(int cpu_number)
void hyperv_init(void);
void hyperv_setup_mmu_ops(void);
void hyper_alloc_mmu(void);
-void hyperv_report_panic(struct pt_regs *regs);
+void hyperv_report_panic(struct pt_regs *regs, long err);
bool hv_is_hypercall_page_setup(void);
void hyperv_cleanup(void);
#else /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 09c06b0fb964..d32175e30259 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -89,10 +89,8 @@ extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_CARDBUS_IO 0x4000
extern int pcibios_enabled;
-void pcibios_config_init(void);
void pcibios_scan_root(int bus);
-void pcibios_set_master(struct pci_dev *dev);
struct irq_routing_table *pcibios_get_irq_routing_table(void);
int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f735c3016325..09f9e1e00e3b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -667,11 +667,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
return false;
}
-static inline int pte_hidden(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_HIDDEN;
-}
-
static inline int pmd_present(pmd_t pmd)
{
/*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 9e9b05fc4860..3696398a9475 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -32,7 +32,6 @@
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
@@ -79,18 +78,6 @@
#define _PAGE_KNL_ERRATUM_MASK 0
#endif
-#ifdef CONFIG_KMEMCHECK
-#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
-#else
-#define _PAGE_HIDDEN (_AT(pteval_t, 0))
-#endif
-
-/*
- * The same hidden bit is used by kmemcheck, but since kmemcheck
- * works on kernel pages while soft-dirty engine on user space,
- * they do not conflict with each other.
- */
-
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 3e4ed8fb5f91..a7471dcd2205 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -5,15 +5,6 @@
#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
-#ifdef CONFIG_KVM_GUEST
-extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
-#else
-static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
-{
- return NULL;
-}
-#endif
-
/* some helper functions for xen and kvm pv clock sources */
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
@@ -102,4 +93,14 @@ struct pvclock_vsyscall_time_info {
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+#ifdef CONFIG_PARAVIRT_CLOCK
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti);
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return NULL;
+}
+#endif
+
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 076502241eae..55d392c6bd29 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
* No 3D Now!
*/
-#ifndef CONFIG_KMEMCHECK
-
#if (__GNUC__ >= 4)
#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
#else
@@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
? __constant_memcpy((t), (f), (n)) \
: __memcpy((t), (f), (n)))
#endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(t, f, n) __memcpy((t), (f), (n))
-#endif
#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 0b1b4445f4c5..533f74c300c2 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len);
extern void *__memcpy(void *to, const void *from, size_t len);
#ifndef CONFIG_FORTIFY_SOURCE
-#ifndef CONFIG_KMEMCHECK
#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
#define memcpy(dst, src, len) \
({ \
@@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len);
__ret; \
})
#endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
-#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
#define __HAVE_ARCH_MEMSET
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index caec8417539f..8b6780751132 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -70,11 +70,11 @@
#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
-#define SECONDARY_EXEC_RDRAND 0x00000800
+#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
-#define SECONDARY_EXEC_RDSEED 0x00010000
+#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
index 3bdd10d71223..a9630104f1c4 100644
--- a/arch/x86/include/asm/xen/cpuid.h
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -74,21 +74,43 @@
#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0)
/*
+ * Leaf 4 (0x40000x03)
+ * Sub-leaf 0: EAX: bit 0: emulated tsc
+ * bit 1: host tsc is known to be reliable
+ * bit 2: RDTSCP instruction available
+ * EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate,
+ * 2=no emulation, 3=no emulation + TSC_AUX support
+ * ECX: guest tsc frequency in kHz
+ * EDX: guest tsc incarnation (migration count)
+ * Sub-leaf 1: EAX: tsc offset low part
+ * EBX: tsc offset high part
+ * ECX: multiplicator for tsc->ns conversion
+ * EDX: shift amount for tsc->ns conversion
+ * Sub-leaf 2: EAX: host tsc frequency in kHz
+ */
+
+/*
* Leaf 5 (0x40000x04)
* HVM-specific features
- * EAX: Features
- * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ * Sub-leaf 0: EAX: Features
+ * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
*/
-
-/* Virtualized APIC registers */
-#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0)
-/* Virtualized x2APIC accesses */
-#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1)
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
+#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */
/* Memory mapped from other domains has valid IOMMU entries */
#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2)
-/* vcpu id is present in EBX */
-#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3)
+#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
+
+/*
+ * Leaf 6 (0x40000x05)
+ * PV-specific parameters
+ * Sub-leaf 0: EAX: max available sub-leaf
+ * Sub-leaf 0: EBX: bits 0-7: max machine address width
+ */
+
+/* Max. address width in bits taking memory hotplug into account. */
+#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0)
-#define XEN_CPUID_MAX_NUM_LEAVES 4
+#define XEN_CPUID_MAX_NUM_LEAVES 5
#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c6b84245e5ab..123e669bf363 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -27,6 +27,15 @@ typedef struct xpaddr {
phys_addr_t paddr;
} xpaddr_t;
+#ifdef CONFIG_X86_64
+#define XEN_PHYSICAL_MASK __sme_clr((1UL << 52) - 1)
+#else
+#define XEN_PHYSICAL_MASK __PHYSICAL_MASK
+#endif
+
+#define XEN_PTE_MFN_MASK ((pteval_t)(((signed long)PAGE_MASK) & \
+ XEN_PHYSICAL_MASK))
+
#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
@@ -278,7 +287,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
static inline unsigned long pte_mfn(pte_t pte)
{
- return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+ return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
}
static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 1f5c5161ead6..45c8605467f1 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,7 +1,4 @@
-#ifdef CONFIG_KMEMCHECK
-/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
-# include <asm-generic/xor.h>
-#elif !defined(_ASM_X86_XOR_H)
+#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H
/*
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index ea3046e0b0cf..bb8d300fecbd 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -52,8 +52,3 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
apei_mce_report_mem_error(sev, mem_err);
#endif
}
-
-void arch_apei_flush_tlb_one(unsigned long addr)
-{
- __flush_tlb_one(addr);
-}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 90cb82dbba57..570e8bb1f386 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -22,7 +22,7 @@ obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
-obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
+obj-y += aperfmperf.o
obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 957813e0180d..7eba34df54c3 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -14,6 +14,8 @@
#include <linux/percpu.h>
#include <linux/smp.h>
+#include "cpu.h"
+
struct aperfmperf_sample {
unsigned int khz;
ktime_t time;
@@ -24,7 +26,7 @@ struct aperfmperf_sample {
static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
#define APERFMPERF_CACHE_THRESHOLD_MS 10
-#define APERFMPERF_REFRESH_DELAY_MS 20
+#define APERFMPERF_REFRESH_DELAY_MS 10
#define APERFMPERF_STALE_THRESHOLD_MS 1000
/*
@@ -38,8 +40,6 @@ static void aperfmperf_snapshot_khz(void *dummy)
u64 aperf, aperf_delta;
u64 mperf, mperf_delta;
struct aperfmperf_sample *s = this_cpu_ptr(&samples);
- ktime_t now = ktime_get();
- s64 time_delta = ktime_ms_delta(now, s->time);
unsigned long flags;
local_irq_save(flags);
@@ -57,38 +57,68 @@ static void aperfmperf_snapshot_khz(void *dummy)
if (mperf_delta == 0)
return;
- s->time = now;
+ s->time = ktime_get();
s->aperf = aperf;
s->mperf = mperf;
-
- /* If the previous iteration was too long ago, discard it. */
- if (time_delta > APERFMPERF_STALE_THRESHOLD_MS)
- s->khz = 0;
- else
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
}
-unsigned int arch_freq_get_on_cpu(int cpu)
+static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
- s64 time_delta;
- unsigned int khz;
+ s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
+
+ /* Don't bother re-computing within the cache threshold time. */
+ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
+ return true;
+
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+
+ /* Return false if the previous iteration was too long ago. */
+ return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
+}
+unsigned int aperfmperf_get_khz(int cpu)
+{
if (!cpu_khz)
return 0;
if (!static_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
- /* Don't bother re-computing within the cache threshold time. */
- time_delta = ktime_ms_delta(ktime_get(), per_cpu(samples.time, cpu));
- khz = per_cpu(samples.khz, cpu);
- if (khz && time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
- return khz;
+ aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
+ return per_cpu(samples.khz, cpu);
+}
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
- khz = per_cpu(samples.khz, cpu);
- if (khz)
- return khz;
+void arch_freq_prepare_all(void)
+{
+ ktime_t now = ktime_get();
+ bool wait = false;
+ int cpu;
+
+ if (!cpu_khz)
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return;
+
+ for_each_online_cpu(cpu)
+ if (!aperfmperf_snapshot_cpu(cpu, now, false))
+ wait = true;
+
+ if (wait)
+ msleep(APERFMPERF_REFRESH_DELAY_MS);
+}
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+ if (!cpu_khz)
+ return 0;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
+ return per_cpu(samples.khz, cpu);
msleep(APERFMPERF_REFRESH_DELAY_MS);
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f52a370b6c00..e806b11a99af 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -47,4 +47,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+
+unsigned int aperfmperf_get_khz(int cpu);
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b720dacac051..b1af22073e28 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model < 15)
clear_cpu_cap(c, X86_FEATURE_PAT);
-#ifdef CONFIG_KMEMCHECK
- /*
- * P4s have a "fast strings" feature which causes single-
- * stepping REP instructions to only generate a #DB on
- * cache-line boundaries.
- *
- * Ingo Molnar reported a Pentium D (model 6) and a Xeon
- * (model 2) with the same problem.
- */
- if (c->x86 == 15)
- if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
- MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
- pr_info("kmemcheck: Disabling fast string operations\n");
-#endif
-
/*
* If fast string is not enabled in IA32_MISC_ENABLE for any reason,
* clear the fast string and enhanced fast string CPU capabilities.
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 6b7e17bf0b71..e7ecedafa1c8 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -5,6 +5,8 @@
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include "cpu.h"
+
/*
* Get CPU information for use by the procfs.
*/
@@ -78,9 +80,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get(cpu);
+ unsigned int freq = aperfmperf_get_khz(cpu);
if (!freq)
+ freq = cpufreq_quick_get(cpu);
+ if (!freq)
freq = cpu_khz;
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
freq / 1000, (freq % 1000));
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 7d7715dde901..e5ec3cafa72e 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,7 +57,7 @@
# error "Need more virtual address space for the ESPFIX hack"
#endif
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
/* This contains the *bottom* address of the espfix stack */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 77b492c2d658..8b26c9e01cc4 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -48,12 +48,6 @@ early_param("no-kvmclock", parse_no_kvmclock);
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock *wall_clock;
-struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
-{
- return hv_clock;
-}
-EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
-
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -377,6 +371,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
return 1;
}
+ pvclock_set_pvti_cpu0_va(hv_clock);
put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 5c3f6d6a5078..761f6af6efa5 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -25,8 +25,10 @@
#include <asm/fixmap.h>
#include <asm/pvclock.h>
+#include <asm/vgtod.h>
static u8 valid_flags __read_mostly = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
void pvclock_set_flags(u8 flags)
{
@@ -144,3 +146,15 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+{
+ WARN_ON(vclock_was_used(VCLOCK_PVCLOCK));
+ pvti_cpu0_va = pvti;
+}
+
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return pvti_cpu0_va;
+}
+EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7b0f74a2150..989514c94a55 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -42,7 +42,6 @@
#include <linux/edac.h>
#endif
-#include <asm/kmemcheck.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
@@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
if (!dr6 && user_mode(regs))
user_icebp = 1;
- /* Catch kmemcheck conditions! */
- if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
- goto exit;
-
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d90cdc77e077..8079d141792a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2591,6 +2591,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
smbase = ctxt->ops->get_smbase(ctxt);
+
+ /*
+ * Give pre_leave_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ return X86EMUL_UNHANDLEABLE;
+
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, smbase + 0x8000);
else
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36c90d631096..943acbf00c69 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1301,14 +1301,42 @@ static void update_divide_count(struct kvm_lapic *apic)
apic->divide_count);
}
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask;
if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
apic->lapic_timer.timer_mode = timer_mode;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ limit_periodic_timer_frequency(apic);
}
}
@@ -1430,6 +1458,30 @@ static void start_sw_period(struct kvm_lapic *apic)
HRTIMER_MODE_ABS_PINNED);
}
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
static bool set_target_expiration(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1439,27 +1491,13 @@ static bool set_target_expiration(struct kvm_lapic *apic)
apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
- if (!apic->lapic_timer.period)
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
return false;
-
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- s64 min_period = min_timer_period_us * 1000LL;
-
- if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
- "lapic timer period limited to %lld ns\n",
- apic->vcpu->vcpu_id,
- apic->lapic_timer.period, min_period);
- apic->lapic_timer.period = min_period;
- }
}
+ limit_periodic_timer_frequency(apic);
+
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
@@ -1515,6 +1553,9 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
return false;
+ if (!ktimer->tscdeadline)
+ return false;
+
r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
if (r < 0)
return false;
@@ -1738,13 +1779,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
start_apic_timer(apic);
break;
- case APIC_TDCR:
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
if (val & 4)
apic_debug("KVM_WRITE:TDCR %x\n", val);
kvm_lapic_set_reg(apic, APIC_TDCR, val);
update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
break;
-
+ }
case APIC_ESR:
if (apic_x2apic_mode(apic) && val != 0) {
apic_debug("KVM_WRITE:ESR not zero %x\n", val);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a119b361b8b7..e5e66e5c6640 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -150,6 +150,20 @@ module_param(dbg, bool, 0644);
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -2424,7 +2438,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
- return __shadow_walk_next(iterator, *iterator->sptep);
+ __shadow_walk_next(iterator, *iterator->sptep);
}
static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2794,13 +2808,13 @@ done:
return ret;
}
-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- bool emulate = false;
+ int ret = RET_PF_RETRY;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- emulate = true;
+ ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep)))
- emulate = true;
+ ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
kvm_release_pfn_clean(pfn);
- return emulate;
+ return ret;
}
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
- * Return 1 to tell kvm to emulate it.
*/
if (pfn == KVM_PFN_ERR_RO_FAULT)
- return 1;
+ return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
- return 0;
+ return RET_PF_RETRY;
}
return -EFAULT;
@@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
}
if (fast_page_fault(vcpu, v, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
@@ -3659,54 +3672,38 @@ exit:
return reserved;
}
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
-};
-
static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
if (mmio_info_in_cache(vcpu, addr, direct))
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
- return RET_MMIO_PF_BUG;
+ return -EINVAL;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
- return RET_MMIO_PF_INVALID;
+ return RET_PF_INVALID;
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
}
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return RET_MMIO_PF_RETRY;
+ return RET_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
@@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3820,8 +3817,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect)
+ u64 fault_address, char *insn, int insn_len)
{
int r = 1;
@@ -3829,7 +3825,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
default:
trace_kvm_page_fault(fault_address, error_code);
- if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+ if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
@@ -3876,7 +3872,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3893,13 +3889,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
}
if (fast_page_fault(vcpu, gpa, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3919,7 +3915,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4918,25 +4914,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
vcpu->arch.gpa_val = cr2;
}
+ r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_MMIO_PF_EMULATE) {
+ if (r == RET_PF_EMULATE) {
emulation_type = 0;
goto emulate;
}
- if (r == RET_MMIO_PF_RETRY)
- return 1;
- if (r < 0)
- return r;
- /* Must be RET_MMIO_PF_INVALID. */
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
- false);
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+ false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
if (r < 0)
return r;
- if (!r)
- return 1;
/*
* Before emulating the instruction, check if the error code
@@ -4993,8 +4989,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu.pae_root);
- if (vcpu->arch.mmu.lm_root != NULL)
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5464,10 +5459,8 @@ static struct shrinker mmu_shrinker = {
static void mmu_destroy_caches(void)
{
- if (pte_list_desc_cache)
- kmem_cache_destroy(pte_list_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
@@ -5476,13 +5469,13 @@ int kvm_mmu_module_init(void)
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!pte_list_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!mmu_page_header_cache)
goto nomem;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index efc857615d8e..5b408c0ad612 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -66,8 +66,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect);
+ u64 fault_address, char *insn, int insn_len);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f18d1f8d332b..5abae72266b7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -593,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate;
+ int top_level, ret;
direct_access = gw->pte_access;
@@ -659,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
clear_sp_write_flooding_count(it.sptep);
- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
- return emulate;
+ return ret;
out_gpte_changed:
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
/*
@@ -762,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- return 0;
+ return RET_PF_RETRY;
}
if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
shadow_page_table_clear_flood(vcpu, addr);
- return 1;
+ return RET_PF_EMULATE;
}
vcpu->arch.write_fault_to_shadow_pgtable = false;
@@ -789,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
return r;
@@ -834,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e68f0b3cbf7..59e13a79c2e3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1034,15 +1034,12 @@ static int avic_ga_log_notifier(u32 ga_tag)
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
- if (!vcpu)
- return 0;
-
/* Note:
* At this point, the IOMMU should have already set the pending
* bit in the vAPIC backing page. So, we just need to schedule
* in the vcpu.
*/
- if (vcpu->mode == OUTSIDE_GUEST_MODE)
+ if (vcpu)
kvm_vcpu_wake_up(vcpu);
return 0;
@@ -2144,7 +2141,18 @@ static int pf_interception(struct vcpu_svm *svm)
return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
svm->vmcb->control.insn_bytes,
- svm->vmcb->control.insn_len, !npt_enabled);
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
}
static int db_interception(struct vcpu_svm *svm)
@@ -2916,70 +2924,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
return true;
}
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
+static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct page *page)
{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
- u64 vmcb_gpa;
-
- vmcb_gpa = svm->vmcb->save.rax;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return false;
-
- if (!nested_vmcb_checks(nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
-
- nested_svm_unmap(page);
-
- return false;
- }
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
-
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(hsave, vmcb);
-
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
@@ -3072,6 +3019,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
enable_gif(svm);
mark_all_dirty(svm->vmcb);
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return false;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
return true;
}
@@ -3173,7 +3187,7 @@ static int stgi_interception(struct vcpu_svm *svm)
/*
* If VGIF is enabled, the STGI intercept is only added to
- * detect the opening of the NMI window; remove it now.
+ * detect the opening of the SMI/NMI window; remove it now.
*/
if (vgif_enabled(svm))
clr_intercept(svm, INTERCEPT_STGI);
@@ -3657,6 +3671,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+ case MSR_IA32_CR_PAT:
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm->vmcb->save.g_pat = data;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
@@ -4131,7 +4152,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_NPF] = pf_interception,
+ [SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = emulate_on_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -5393,6 +5414,88 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
}
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return 0;
+
+ if (is_guest_mode(&svm->vcpu) &&
+ svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+ /* TODO: Might need to set exit_info_1 and exit_info_2 here */
+ svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+ svm->nested.exit_required = true;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ struct {
+ u64 guest;
+ u64 vmcb;
+ } svm_state_save;
+ int ret;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
+ sizeof(svm_state_save));
+ if (ret)
+ return ret;
+
+ if (svm_state_save.guest) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
+ if (nested_vmcb)
+ enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
+ else
+ ret = 1;
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ }
+ return ret;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ return 1;
+ }
+ return 0;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -5503,6 +5606,11 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.deliver_posted_interrupt = svm_deliver_avic_intr,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
+
+ .smi_allowed = svm_smi_allowed,
+ .pre_enter_smm = svm_pre_enter_smm,
+ .pre_leave_smm = svm_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a6f4f095f8f4..714a0673ec3c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,6 +70,9 @@ MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
+static bool __read_mostly enable_vnmi = 1;
+module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+
static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
@@ -202,6 +205,10 @@ struct loaded_vmcs {
bool nmi_known_unmasked;
unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
struct list_head loaded_vmcss_on_cpu_link;
};
@@ -486,6 +493,14 @@ struct nested_vmx {
u64 nested_vmx_cr4_fixed1;
u64 nested_vmx_vmcs_enum;
u64 nested_vmx_vmfunc_controls;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
};
#define POSTED_INTR_ON 0
@@ -900,16 +915,13 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static int alloc_identity_pagetable(struct kvm *kvm);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -1286,6 +1298,11 @@ static inline bool cpu_has_vmx_invpcid(void)
SECONDARY_EXEC_ENABLE_INVPCID;
}
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
static inline bool cpu_has_vmx_wbinvd_exit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1343,11 +1360,6 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
}
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
-{
- return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
-}
-
static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
{
return vmcs12->pin_based_vm_exec_control &
@@ -1598,18 +1610,15 @@ static inline void vpid_sync_context(int vpid)
static inline void ept_sync_global(void)
{
- if (cpu_has_vmx_invept_global())
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}
static inline void ept_sync_context(u64 eptp)
{
- if (enable_ept) {
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
- }
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
}
static __always_inline void vmcs_check16(unsigned long field)
@@ -2831,8 +2840,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_PML;
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
}
- } else
- vmx->nested.nested_vmx_ept_caps = 0;
+ }
if (cpu_has_vmx_vmfunc()) {
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -2841,8 +2849,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* Advertise EPTP switching unconditionally
* since we emulate it
*/
- vmx->nested.nested_vmx_vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (enable_ept)
+ vmx->nested.nested_vmx_vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
}
/*
@@ -2856,8 +2865,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_VPID;
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
- } else
- vmx->nested.nested_vmx_vpid_caps = 0;
+ }
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -3544,7 +3552,8 @@ static int hardware_enable(void)
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
+ if (enable_ept)
+ ept_sync_global();
return 0;
}
@@ -3657,8 +3666,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED |
- SECONDARY_EXEC_RDRAND |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_VMFUNC;
@@ -3679,14 +3688,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_capability.ept, &vmx_capability.vpid);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- vmx_capability.ept, vmx_capability.vpid);
+ } else if (vmx_capability.ept) {
+ vmx_capability.ept = 0;
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_capability.vpid) {
+ vmx_capability.vpid = 0;
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@ -3699,9 +3719,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_vmexit_control) < 0)
return -EIO;
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
- PIN_BASED_VIRTUAL_NMIS;
- opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
+ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
@@ -4781,18 +4801,18 @@ static int init_rmode_identity_map(struct kvm *kvm)
kvm_pfn_t identity_map_pfn;
u32 tmp;
- if (!enable_ept)
- return 0;
-
/* Protect kvm->arch.ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
if (likely(kvm->arch.ept_identity_pagetable_done))
goto out2;
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
- r = alloc_identity_pagetable(kvm);
+ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm->arch.ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
goto out2;
@@ -4864,20 +4884,6 @@ out:
return r;
}
-static int alloc_identity_pagetable(struct kvm *kvm)
-{
- /* Called with kvm->slots_lock held. */
-
- int r = 0;
-
- BUG_ON(kvm->arch.ept_identity_pagetable_done);
-
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm->arch.ept_identity_map_addr, PAGE_SIZE);
-
- return r;
-}
-
static int allocate_vpid(void)
{
int vpid;
@@ -5233,6 +5239,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
if (!kvm_vcpu_apicv_active(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+ if (!enable_vnmi)
+ pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
+
/* Enable the preemption timer dynamically */
pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
return pin_based_exec_ctrl;
@@ -5282,13 +5292,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
static bool vmx_rdrand_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
}
static bool vmx_rdseed_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
}
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
@@ -5382,30 +5392,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (vmx_rdrand_supported()) {
bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND;
+ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
if (nested) {
if (rdrand_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND;
+ ~SECONDARY_EXEC_RDRAND_EXITING;
}
}
if (vmx_rdseed_supported()) {
bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED;
+ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
if (nested) {
if (rdseed_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED;
+ ~SECONDARY_EXEC_RDSEED_EXITING;
}
}
@@ -5426,7 +5436,7 @@ static void ept_set_mmio_spte_mask(void)
/*
* Sets up the vmcs for emulated real mode.
*/
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
unsigned long a;
@@ -5539,8 +5549,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
-
- return 0;
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -5604,6 +5612,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
setup_msrs(vmx);
@@ -5667,7 +5677,8 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
- if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ if (!enable_vnmi ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
enable_irq_window(vcpu);
return;
}
@@ -5707,6 +5718,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (!enable_vnmi) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+
++vcpu->stat.nmi_injections;
vmx->loaded_vmcs->nmi_known_unmasked = false;
@@ -5725,6 +5749,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool masked;
+ if (!enable_vnmi)
+ return vmx->loaded_vmcs->soft_vnmi_blocked;
if (vmx->loaded_vmcs->nmi_known_unmasked)
return false;
masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
@@ -5736,13 +5762,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->loaded_vmcs->nmi_known_unmasked = !masked;
- if (masked)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ if (!enable_vnmi) {
+ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+ } else {
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -5750,6 +5783,10 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
if (to_vmx(vcpu)->nested.nested_run_pending)
return 0;
+ if (!enable_vnmi &&
+ to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return 0;
+
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
| GUEST_INTR_STATE_NMI));
@@ -5912,8 +5949,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
/* EPT won't cause page fault directly */
WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
- true);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -6478,6 +6514,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* AAK134, BY25.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
@@ -6537,6 +6574,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
+ WARN_ON_ONCE(!enable_vnmi);
vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
CPU_BASED_VIRTUAL_NMI_PENDING);
++vcpu->stat.nmi_window_exits;
@@ -6747,21 +6785,22 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ept() ||
!cpu_has_vmx_ept_4levels() ||
- !cpu_has_vmx_ept_mt_wb()) {
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
enable_ept = 0;
- enable_unrestricted_guest = 0;
- enable_ept_ad_bits = 0;
- }
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
- if (!cpu_has_vmx_unrestricted_guest())
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
+ if (!cpu_has_virtual_nmis())
+ enable_vnmi = 0;
+
/*
* set_apic_access_page_addr() is used to reload apic access
* page upon invalidation. No need to do anything if not
@@ -6776,8 +6815,13 @@ static __init int hardware_setup(void)
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
- if (!cpu_has_vmx_ple())
+ if (!cpu_has_vmx_ple()) {
ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
@@ -6961,7 +7005,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
}
/* Create a new VMCS */
- item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+ item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
if (!item)
return NULL;
item->vmcs02.vmcs = alloc_vmcs();
@@ -7978,6 +8022,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
* "blocked by NMI" bit has to be set before next VM entry.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -8415,9 +8460,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_RDPMC:
return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
case EXIT_REASON_RDRAND:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
case EXIT_REASON_RDSEED:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -8822,6 +8867,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
return 0;
}
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked)) {
+ if (vmx_interrupt_allowed(vcpu)) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ }
+ }
+
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
@@ -9104,33 +9168,38 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
- if (vmx->loaded_vmcs->nmi_known_unmasked)
- return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- * SDM 3: 23.2.2 (September 2008)
- * Bit 12 is undefined in any of the following cases:
- * If the VM exit sets the valid bit in the IDT-vectoring
- * information field.
- * If the VM exit is due to a double fault.
- */
- if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
- vector != DF_VECTOR && !idtv_info_valid)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmx->loaded_vmcs->nmi_known_unmasked =
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
- & GUEST_INTR_STATE_NMI);
+ if (enable_vnmi) {
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(),
+ vmx->loaded_vmcs->entry_time));
}
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@ -9247,6 +9316,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long debugctlmsr, cr3, cr4;
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
if (vmx->emulation_required)
@@ -9475,7 +9549,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx->loaded_vmcs = vmcs;
vmx_vcpu_put(vcpu);
vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
put_cpu();
}
@@ -9556,11 +9629,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
- err = vmx_vcpu_setup(vmx);
+ vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
- if (err)
- goto free_vmcs;
if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
@@ -9568,9 +9639,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
}
if (enable_ept) {
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr =
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
@@ -11325,6 +11393,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@ -11421,8 +11491,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
if (likely(!vmx->fail)) {
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
+ if (exit_reason == -1)
+ sync_vmcs12(vcpu, vmcs12);
+ else
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
@@ -11486,7 +11559,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (enable_shadow_vmcs)
+ if (enable_shadow_vmcs && exit_reason != -1)
vmx->nested.sync_shadow_vmcs = true;
/* in case we halted in L2 */
@@ -11510,12 +11583,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
- trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
- vmcs12->exit_qualification,
- vmcs12->idt_vectoring_info_field,
- vmcs12->vm_exit_intr_info,
- vmcs12->vm_exit_intr_error_code,
- KVM_ISA_VMX);
+ if (exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
load_vmcs12_host_state(vcpu, vmcs12);
@@ -11938,6 +12012,54 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEATURE_CONTROL_LMCE;
}
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ return 1;
+}
+
+static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ return 0;
+}
+
+static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ ret = enter_vmx_non_root_mode(vcpu, false);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ if (ret)
+ return ret;
+
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -12063,6 +12185,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
#endif
.setup_mce = vmx_setup_mce,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb7fcd6..34c85aa2e2d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2006,10 +2006,12 @@ static void kvmclock_sync_fn(struct work_struct *work)
KVMCLOCK_SYNC_PERIOD);
}
-static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
switch (msr) {
case MSR_IA32_MCG_STATUS:
@@ -2034,6 +2036,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if ((offset & 0x3) == 0 &&
data != 0 && (data | (1 << 10)) != ~(u64)0)
return -1;
+ if (!msr_info->host_initiated &&
+ (offset & 0x3) == 1 && data != 0)
+ return -1;
vcpu->arch.mce_banks[offset] = data;
break;
}
@@ -2283,7 +2288,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return set_msr_mce(vcpu, msr, data);
+ return set_msr_mce(vcpu, msr_info);
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -4034,10 +4039,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_IDENTITY_MAP_ADDR: {
u64 ident_addr;
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
r = -EFAULT;
if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
- goto out;
+ goto set_identity_unlock;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_SET_NR_MMU_PAGES:
@@ -5275,6 +5286,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
}
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5316,6 +5332,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
+ .pre_leave_smm = emulator_pre_leave_smm,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6426,7 +6443,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
kvm_x86_ops->queue_exception(vcpu);
- } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
enter_smm(vcpu);
} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
@@ -6473,9 +6490,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-#define put_smstate(type, buf, offset, val) \
- *(type *)((buf) + (offset) - 0x7e00) = val
-
static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
@@ -6641,13 +6655,20 @@ static void enter_smm(struct kvm_vcpu *vcpu)
u32 cr0;
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
- vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
enter_smm_save_state_32(vcpu, buf);
+ /*
+ * Give pre_enter_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. leave guest mode) after we've saved the state into
+ * the SMM state-save area.
+ */
+ kvm_x86_ops->pre_enter_smm(vcpu, buf);
+
+ vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (kvm_x86_ops->get_nmi_mask(vcpu))
@@ -6876,17 +6897,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
else {
- /* Enable NMI/IRQ window open exits if needed.
+ /* Enable SMI/NMI/IRQ window open exits if needed.
*
- * SMIs have two cases: 1) they can be nested, and
- * then there is nothing to do here because RSM will
- * cause a vmexit anyway; 2) or the SMI can be pending
- * because inject_pending_event has completed the
- * injection of an IRQ or NMI from the previous vmexit,
- * and then we request an immediate exit to inject the SMI.
+ * SMIs have three cases:
+ * 1) They can be nested, and then there is nothing to
+ * do here because RSM will cause a vmexit anyway.
+ * 2) There is an ISA-specific reason why SMI cannot be
+ * injected, and the moment when this changes can be
+ * intercepted.
+ * 3) Or the SMI can be pending because
+ * inject_pending_event has completed the injection
+ * of an IRQ or NMI from the previous vmexit, and
+ * then we request an immediate exit to inject the
+ * SMI.
*/
if (vcpu->arch.smi_pending && !is_smm(vcpu))
- req_immediate_exit = true;
+ if (!kvm_x86_ops->enable_smi_window(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -7798,18 +7825,40 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
+ if (kvm_mpx_supported()) {
+ void *mpx_state_buffer;
+
+ /*
+ * To avoid have the INIT path from kvm_apic_has_events() that be
+ * called with loaded FPU and does not let userspace fix the state.
+ */
+ kvm_put_guest_fpu(vcpu);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDREGS);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDCSR);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+ }
+
if (!init_event) {
kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
vcpu->arch.msr_misc_features_enables = 0;
+
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
}
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
+ vcpu->arch.ia32_xss = 0;
+
kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
@@ -7974,16 +8023,11 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
- struct kvm *kvm;
int r;
- BUG_ON(vcpu->kvm == NULL);
- kvm = vcpu->kvm;
-
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -8001,7 +8045,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (r < 0)
goto fail_free_pio_data;
- if (irqchip_in_kernel(kvm)) {
+ if (irqchip_in_kernel(vcpu->kvm)) {
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -8023,10 +8067,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
fx_init(vcpu);
- vcpu->arch.ia32_tsc_adjust_msr = 0x0;
- vcpu->arch.pv_time_enabled = false;
-
- vcpu->arch.guest_supported_xcr0 = 0;
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7ba7f3d7f477..8e13b8cc6bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck/
-
KASAN_SANITIZE_kasan_init_$(BITS).o := n
obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3109ba6c6ede..78ca9a8ee454 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,7 +20,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
-#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
@@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
- if (kmemcheck_active(regs))
- kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
@@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
-
- if (kmemcheck_fault(regs, address, error_code))
- return;
}
/* Can handle a stale RO->RW TLB: */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..6fdf91ef130a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned int order;
order = get_order((unsigned long)num << PAGE_SHIFT);
- return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
- __GFP_ZERO, order);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
@@ -164,12 +163,11 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
/*
- * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
- * use small pages.
+ * For pagealloc debugging, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
page_size_mask |= 1 << PG_LEVEL_2M;
else
direct_gbpages = 0;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index adcea90a2046..4a837289f2ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
void *ptr;
if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -1173,12 +1173,18 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
- register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
+ /*
+ * Must be done after boot memory is put on freelist, because here we
+ * might set fields in deferred struct pages that have not yet been
+ * initialized, and free_all_bootmem() initializes all the reserved
+ * deferred pages for us.
+ */
+ register_page_bootmem_info();
+
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
PAGE_SIZE, KCORE_OTHER);
@@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
}
- pr_warn_once("vmemmap: falling back to regular page backing\n");
if (vmemmap_populate_basepages(addr, next, node))
return -ENOMEM;
}
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 872ec4159a68..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,228 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/interrupt.h>
-#include <linux/kdebug.h>
-#include <linux/kmemcheck.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-
-#include "error.h"
-#include "shadow.h"
-
-enum kmemcheck_error_type {
- KMEMCHECK_ERROR_INVALID_ACCESS,
- KMEMCHECK_ERROR_BUG,
-};
-
-#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
-
-struct kmemcheck_error {
- enum kmemcheck_error_type type;
-
- union {
- /* KMEMCHECK_ERROR_INVALID_ACCESS */
- struct {
- /* Kind of access that caused the error */
- enum kmemcheck_shadow state;
- /* Address and size of the erroneous read */
- unsigned long address;
- unsigned int size;
- };
- };
-
- struct pt_regs regs;
- struct stack_trace trace;
- unsigned long trace_entries[32];
-
- /* We compress it to a char. */
- unsigned char shadow_copy[SHADOW_COPY_SIZE];
- unsigned char memory_copy[SHADOW_COPY_SIZE];
-};
-
-/*
- * Create a ring queue of errors to output. We can't call printk() directly
- * from the kmemcheck traps, since this may call the console drivers and
- * result in a recursive fault.
- */
-static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
-static unsigned int error_count;
-static unsigned int error_rd;
-static unsigned int error_wr;
-static unsigned int error_missed_count;
-
-static struct kmemcheck_error *error_next_wr(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == ARRAY_SIZE(error_fifo)) {
- ++error_missed_count;
- return NULL;
- }
-
- e = &error_fifo[error_wr];
- if (++error_wr == ARRAY_SIZE(error_fifo))
- error_wr = 0;
- ++error_count;
- return e;
-}
-
-static struct kmemcheck_error *error_next_rd(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == 0)
- return NULL;
-
- e = &error_fifo[error_rd];
- if (++error_rd == ARRAY_SIZE(error_fifo))
- error_rd = 0;
- --error_count;
- return e;
-}
-
-void kmemcheck_error_recall(void)
-{
- static const char *desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
- [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
- [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
- [KMEMCHECK_SHADOW_FREED] = "freed",
- };
-
- static const char short_desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
- [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
- [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
- [KMEMCHECK_SHADOW_FREED] = 'f',
- };
-
- struct kmemcheck_error *e;
- unsigned int i;
-
- e = error_next_rd();
- if (!e)
- return;
-
- switch (e->type) {
- case KMEMCHECK_ERROR_INVALID_ACCESS:
- printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
- 8 * e->size, e->state < ARRAY_SIZE(desc) ?
- desc[e->state] : "(invalid shadow state)",
- (void *) e->address);
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i)
- printk(KERN_CONT "%02x", e->memory_copy[i]);
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
- if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
- printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
- else
- printk(KERN_CONT " ?");
- }
- printk(KERN_CONT "\n");
- printk(KERN_WARNING "%*c\n", 2 + 2
- * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
- break;
- case KMEMCHECK_ERROR_BUG:
- printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
- break;
- }
-
- __show_regs(&e->regs, 1);
- print_stack_trace(&e->trace, 0);
-}
-
-static void do_wakeup(unsigned long data)
-{
- while (error_count > 0)
- kmemcheck_error_recall();
-
- if (error_missed_count > 0) {
- printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
- "the queue was too small\n", error_missed_count);
- error_missed_count = 0;
- }
-}
-
-static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
-
-/*
- * Save the context of an error report.
- */
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs)
-{
- static unsigned long prev_ip;
-
- struct kmemcheck_error *e;
- void *shadow_copy;
- void *memory_copy;
-
- /* Don't report several adjacent errors from the same EIP. */
- if (regs->ip == prev_ip)
- return;
- prev_ip = regs->ip;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
-
- e->state = state;
- e->address = address;
- e->size = size;
-
- /* Save regs */
- memcpy(&e->regs, regs, sizeof(*regs));
-
- /* Save stack trace */
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 0;
- save_stack_trace_regs(regs, &e->trace);
-
- /* Round address down to nearest 16 bytes */
- shadow_copy = kmemcheck_shadow_lookup(address
- & ~(SHADOW_COPY_SIZE - 1));
- BUG_ON(!shadow_copy);
-
- memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
-
- kmemcheck_show_addr(address);
- memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
- memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
- kmemcheck_hide_addr(address);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
-
-/*
- * Save the context of a kmemcheck bug.
- */
-void kmemcheck_error_save_bug(struct pt_regs *regs)
-{
- struct kmemcheck_error *e;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_BUG;
-
- memcpy(&e->regs, regs, sizeof(*regs));
-
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 1;
- save_stack_trace(&e->trace);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 39f80d7a874d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,16 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
-#define ARCH__X86__MM__KMEMCHECK__ERROR_H
-
-#include <linux/ptrace.h>
-
-#include "shadow.h"
-
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs);
-
-void kmemcheck_error_save_bug(struct pt_regs *regs);
-
-void kmemcheck_error_recall(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/**
- * kmemcheck - a heavyweight memory checker for the linux kernel
- * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
- * (With a lot of help from Ingo Molnar and Pekka Enberg.)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2) as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/cacheflush.h>
-#include <asm/kmemcheck.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-#include "error.h"
-#include "opcode.h"
-#include "pte.h"
-#include "selftest.h"
-#include "shadow.h"
-
-
-#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 0
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 1
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
-# define KMEMCHECK_ENABLED 2
-#endif
-
-int kmemcheck_enabled = KMEMCHECK_ENABLED;
-
-int __init kmemcheck_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * Limit SMP to use a single CPU. We rely on the fact that this code
- * runs before SMP is set up.
- */
- if (setup_max_cpus > 1) {
- printk(KERN_INFO
- "kmemcheck: Limiting number of CPUs to 1.\n");
- setup_max_cpus = 1;
- }
-#endif
-
- if (!kmemcheck_selftest()) {
- printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
- kmemcheck_enabled = 0;
- return -EINVAL;
- }
-
- printk(KERN_INFO "kmemcheck: Initialized\n");
- return 0;
-}
-
-early_initcall(kmemcheck_init);
-
-/*
- * We need to parse the kmemcheck= option before any memory is allocated.
- */
-static int __init param_kmemcheck(char *str)
-{
- int val;
- int ret;
-
- if (!str)
- return -EINVAL;
-
- ret = kstrtoint(str, 0, &val);
- if (ret)
- return ret;
- kmemcheck_enabled = val;
- return 0;
-}
-
-early_param("kmemcheck", param_kmemcheck);
-
-int kmemcheck_show_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-int kmemcheck_hide_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-struct kmemcheck_context {
- bool busy;
- int balance;
-
- /*
- * There can be at most two memory operands to an instruction, but
- * each address can cross a page boundary -- so we may need up to
- * four addresses that must be hidden/revealed for each fault.
- */
- unsigned long addr[4];
- unsigned long n_addrs;
- unsigned long flags;
-
- /* Data size of the instruction that caused a fault. */
- unsigned int size;
-};
-
-static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
-
-bool kmemcheck_active(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- return data->balance > 0;
-}
-
-/* Save an address that needs to be shown/hidden */
-static void kmemcheck_save_addr(unsigned long addr)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
- data->addr[data->n_addrs++] = addr;
-}
-
-static unsigned int kmemcheck_show_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_show_addr(data->addr[i]);
-
- return n;
-}
-
-static unsigned int kmemcheck_hide_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_hide_addr(data->addr[i]);
-
- return n;
-}
-
-/*
- * Called from the #PF handler.
- */
-void kmemcheck_show(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 0)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->balance = 0;
- return;
- }
-
- /*
- * None of the addresses actually belonged to kmemcheck. Note that
- * this is not an error.
- */
- if (kmemcheck_show_all() == 0)
- return;
-
- ++data->balance;
-
- /*
- * The IF needs to be cleared as well, so that the faulting
- * instruction can run "uninterrupted". Otherwise, we might take
- * an interrupt and start executing that before we've had a chance
- * to hide the page again.
- *
- * NOTE: In the rare case of multiple faults, we must not override
- * the original flags:
- */
- if (!(regs->flags & X86_EFLAGS_TF))
- data->flags = regs->flags;
-
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
-}
-
-/*
- * Called from the #DB handler.
- */
-void kmemcheck_hide(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- int n;
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 1)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->n_addrs = 0;
- data->balance = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
- return;
- }
-
- if (kmemcheck_enabled)
- n = kmemcheck_hide_all();
- else
- n = kmemcheck_show_all();
-
- if (n == 0)
- return;
-
- --data->balance;
-
- data->n_addrs = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
-}
-
-void kmemcheck_show_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-bool kmemcheck_page_is_tracked(struct page *p)
-{
- /* This will also check the "hidden" flag of the PTE. */
- return kmemcheck_pte_lookup((unsigned long) page_address(p));
-}
-
-void kmemcheck_hide_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-/* Access may NOT cross page boundary */
-static void kmemcheck_read_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
- enum kmemcheck_shadow status;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-
- /* Don't warn about it again. */
- kmemcheck_shadow_set(shadow, size);
-}
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
- enum kmemcheck_shadow status;
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return true;
-
- status = kmemcheck_shadow_test_all(shadow, size);
-
- return status == KMEMCHECK_SHADOW_INITIALIZED;
-}
-
-/* Access may cross page boundary */
-static void kmemcheck_read(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_read_strict(regs, addr, size);
- return;
- }
-
- /*
- * What we do is basically to split the access across the
- * two pages and handle each part separately. Yes, this means
- * that we may now see reads that are 3 + 5 bytes, for
- * example (and if both are uninitialized, there will be two
- * reports), but it makes the code a lot simpler.
- */
- kmemcheck_read_strict(regs, addr, next_page - addr);
- kmemcheck_read_strict(regs, next_page, next_addr - next_page);
-}
-
-static void kmemcheck_write_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- kmemcheck_shadow_set(shadow, size);
-}
-
-static void kmemcheck_write(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_write_strict(regs, addr, size);
- return;
- }
-
- /* See comment in kmemcheck_read(). */
- kmemcheck_write_strict(regs, addr, next_page - addr);
- kmemcheck_write_strict(regs, next_page, next_addr - next_page);
-}
-
-/*
- * Copying is hard. We have two addresses, each of which may be split across
- * a page (and each page will have different shadow addresses).
- */
-static void kmemcheck_copy(struct pt_regs *regs,
- unsigned long src_addr, unsigned long dst_addr, unsigned int size)
-{
- uint8_t shadow[8];
- enum kmemcheck_shadow status;
-
- unsigned long page;
- unsigned long next_addr;
- unsigned long next_page;
-
- uint8_t *x;
- unsigned int i;
- unsigned int n;
-
- BUG_ON(size > sizeof(shadow));
-
- page = src_addr & PAGE_MASK;
- next_addr = src_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < size; ++i)
- shadow[i] = x[i];
- } else {
- for (i = 0; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- } else {
- n = next_page - src_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < n; ++i)
- shadow[i] = x[i];
- } else {
- /* Not tracked */
- for (i = 0; i < n; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i)
- shadow[i] = x[i - n];
- } else {
- /* Not tracked */
- for (i = n; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- page = dst_addr & PAGE_MASK;
- next_addr = dst_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < size; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- } else {
- n = next_page - dst_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < n; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i) {
- x[i - n] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- }
-
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, src_addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-}
-
-enum kmemcheck_method {
- KMEMCHECK_READ,
- KMEMCHECK_WRITE,
-};
-
-static void kmemcheck_access(struct pt_regs *regs,
- unsigned long fallback_address, enum kmemcheck_method fallback_method)
-{
- const uint8_t *insn;
- const uint8_t *insn_primary;
- unsigned int size;
-
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- /* Recursive fault -- ouch. */
- if (data->busy) {
- kmemcheck_show_addr(fallback_address);
- kmemcheck_error_save_bug(regs);
- return;
- }
-
- data->busy = true;
-
- insn = (const uint8_t *) regs->ip;
- insn_primary = kmemcheck_opcode_get_primary(insn);
-
- kmemcheck_opcode_decode(insn, &size);
-
- switch (insn_primary[0]) {
-#ifdef CONFIG_KMEMCHECK_BITOPS_OK
- /* AND, OR, XOR */
- /*
- * Unfortunately, these instructions have to be excluded from
- * our regular checking since they access only some (and not
- * all) bits. This clears out "bogus" bitfield-access warnings.
- */
- case 0x80:
- case 0x81:
- case 0x82:
- case 0x83:
- switch ((insn_primary[1] >> 3) & 7) {
- /* OR */
- case 1:
- /* AND */
- case 4:
- /* XOR */
- case 6:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
-
- /* ADD */
- case 0:
- /* ADC */
- case 2:
- /* SBB */
- case 3:
- /* SUB */
- case 5:
- /* CMP */
- case 7:
- break;
- }
- break;
-#endif
-
- /* MOVS, MOVSB, MOVSW, MOVSD */
- case 0xa4:
- case 0xa5:
- /*
- * These instructions are special because they take two
- * addresses, but we only get one page fault.
- */
- kmemcheck_copy(regs, regs->si, regs->di, size);
- goto out;
-
- /* CMPS, CMPSB, CMPSW, CMPSD */
- case 0xa6:
- case 0xa7:
- kmemcheck_read(regs, regs->si, size);
- kmemcheck_read(regs, regs->di, size);
- goto out;
- }
-
- /*
- * If the opcode isn't special in any way, we use the data from the
- * page fault handler to determine the address and type of memory
- * access.
- */
- switch (fallback_method) {
- case KMEMCHECK_READ:
- kmemcheck_read(regs, fallback_address, size);
- goto out;
- case KMEMCHECK_WRITE:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
- }
-
-out:
- data->busy = false;
-}
-
-bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
-{
- pte_t *pte;
-
- /*
- * XXX: Is it safe to assume that memory accesses from virtual 86
- * mode or non-kernel code segments will _never_ access kernel
- * memory (e.g. tracked pages)? For now, we need this to avoid
- * invoking kmemcheck for PnP BIOS calls.
- */
- if (regs->flags & X86_VM_MASK)
- return false;
- if (regs->cs != __KERNEL_CS)
- return false;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return false;
-
- WARN_ON_ONCE(in_nmi());
-
- if (error_code & 2)
- kmemcheck_access(regs, address, KMEMCHECK_WRITE);
- else
- kmemcheck_access(regs, address, KMEMCHECK_READ);
-
- kmemcheck_show(regs);
- return true;
-}
-
-bool kmemcheck_trap(struct pt_regs *regs)
-{
- if (!kmemcheck_active(regs))
- return false;
-
- /* We're done. */
- kmemcheck_hide(regs);
- return true;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index df8109ddf7fe..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,107 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-
-#include "opcode.h"
-
-static bool opcode_is_prefix(uint8_t b)
-{
- return
- /* Group 1 */
- b == 0xf0 || b == 0xf2 || b == 0xf3
- /* Group 2 */
- || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
- || b == 0x64 || b == 0x65
- /* Group 3 */
- || b == 0x66
- /* Group 4 */
- || b == 0x67;
-}
-
-#ifdef CONFIG_X86_64
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return (b & 0xf0) == 0x40;
-}
-#else
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return false;
-}
-#endif
-
-#define REX_W (1 << 3)
-
-/*
- * This is a VERY crude opcode decoder. We only need to find the size of the
- * load/store that caused our #PF and this should work for all the opcodes
- * that we care about. Moreover, the ones who invented this instruction set
- * should be shot.
- */
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
-{
- /* Default operand size */
- int operand_size_override = 4;
-
- /* prefixes */
- for (; opcode_is_prefix(*op); ++op) {
- if (*op == 0x66)
- operand_size_override = 2;
- }
-
- /* REX prefix */
- if (opcode_is_rex_prefix(*op)) {
- uint8_t rex = *op;
-
- ++op;
- if (rex & REX_W) {
- switch (*op) {
- case 0x63:
- *size = 4;
- return;
- case 0x0f:
- ++op;
-
- switch (*op) {
- case 0xb6:
- case 0xbe:
- *size = 1;
- return;
- case 0xb7:
- case 0xbf:
- *size = 2;
- return;
- }
-
- break;
- }
-
- *size = 8;
- return;
- }
- }
-
- /* escape opcode */
- if (*op == 0x0f) {
- ++op;
-
- /*
- * This is move with zero-extend and sign-extend, respectively;
- * we don't have to think about 0xb6/0xbe, because this is
- * already handled in the conditional below.
- */
- if (*op == 0xb7 || *op == 0xbf)
- operand_size_override = 2;
- }
-
- *size = (*op & 1) ? operand_size_override : 1;
-}
-
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
-{
- /* skip prefixes */
- while (opcode_is_prefix(*op))
- ++op;
- if (opcode_is_rex_prefix(*op))
- ++op;
- return op;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 51a1ce94c24a..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,10 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
-#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
-
-#include <linux/types.h>
-
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 8a03be90272a..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,23 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-#include "pte.h"
-
-pte_t *kmemcheck_pte_lookup(unsigned long address)
-{
- pte_t *pte;
- unsigned int level;
-
- pte = lookup_address(address, &level);
- if (!pte)
- return NULL;
- if (level != PG_LEVEL_4K)
- return NULL;
- if (!pte_hidden(*pte))
- return NULL;
-
- return pte;
-}
-
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index b595612382c2..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,11 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
-#define ARCH__X86__MM__KMEMCHECK__PTE_H
-
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-pte_t *kmemcheck_pte_lookup(unsigned long address);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 7ce0be1f99eb..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,71 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/bug.h>
-#include <linux/kernel.h>
-
-#include "opcode.h"
-#include "selftest.h"
-
-struct selftest_opcode {
- unsigned int expected_size;
- const uint8_t *insn;
- const char *desc;
-};
-
-static const struct selftest_opcode selftest_opcodes[] = {
- /* REP MOVS */
- {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
- {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
-
- /* MOVZX / MOVZXD */
- {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
- {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
- {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
-
-#ifdef CONFIG_X86_64
- /* MOVZX / MOVZXD */
- {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
- {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
-#endif
-};
-
-static bool selftest_opcode_one(const struct selftest_opcode *op)
-{
- unsigned size;
-
- kmemcheck_opcode_decode(op->insn, &size);
-
- if (size == op->expected_size)
- return true;
-
- printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
- op->desc, op->expected_size, size);
- return false;
-}
-
-static bool selftest_opcodes_all(void)
-{
- bool pass = true;
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
- pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
-
- return pass;
-}
-
-bool kmemcheck_selftest(void)
-{
- bool pass = true;
-
- pass = pass && selftest_opcodes_all();
-
- return pass;
-}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8d759aae453d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,7 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-
-bool kmemcheck_selftest(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <linux/kmemcheck.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include "pte.h"
-#include "shadow.h"
-
-/*
- * Return the shadow address for the given address. Returns NULL if the
- * address is not tracked.
- *
- * We need to be extremely careful not to follow any invalid pointers,
- * because this function can be called for *any* possible address.
- */
-void *kmemcheck_shadow_lookup(unsigned long address)
-{
- pte_t *pte;
- struct page *page;
-
- if (!virt_addr_valid(address))
- return NULL;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return NULL;
-
- page = virt_to_page(address);
- if (!page->shadow)
- return NULL;
- return page->shadow + (address & (PAGE_SIZE - 1));
-}
-
-static void mark_shadow(void *address, unsigned int n,
- enum kmemcheck_shadow status)
-{
- unsigned long addr = (unsigned long) address;
- unsigned long last_addr = addr + n - 1;
- unsigned long page = addr & PAGE_MASK;
- unsigned long last_page = last_addr & PAGE_MASK;
- unsigned int first_n;
- void *shadow;
-
- /* If the memory range crosses a page boundary, stop there. */
- if (page == last_page)
- first_n = n;
- else
- first_n = page + PAGE_SIZE - addr;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, first_n);
-
- addr += first_n;
- n -= first_n;
-
- /* Do full-page memset()s. */
- while (n >= PAGE_SIZE) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, PAGE_SIZE);
-
- addr += PAGE_SIZE;
- n -= PAGE_SIZE;
- }
-
- /* Do the remaining page, if any. */
- if (n > 0) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, n);
- }
-}
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
-}
-
-void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
-}
-
-/*
- * Fill the shadow memory of the given address such that the memory at that
- * address is marked as being initialized.
- */
-void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
-}
-EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
-
-void kmemcheck_mark_freed(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
-}
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
-{
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /*
- * Make sure _some_ bytes are initialized. Gcc frequently generates
- * code to access neighboring bytes.
- */
- for (i = 0; i < size; ++i) {
- if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-#else
- return kmemcheck_shadow_test_all(shadow, size);
-#endif
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /* All bytes must be initialized. */
- for (i = 0; i < size; ++i) {
- if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-}
-
-void kmemcheck_shadow_set(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
- for (i = 0; i < size; ++i)
- x[i] = KMEMCHECK_SHADOW_INITIALIZED;
-}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index 49768dc18664..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,19 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
-#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
-
-enum kmemcheck_shadow {
- KMEMCHECK_SHADOW_UNALLOCATED,
- KMEMCHECK_SHADOW_UNINITIALIZED,
- KMEMCHECK_SHADOW_INITIALIZED,
- KMEMCHECK_SHADOW_FREED,
-};
-
-void *kmemcheck_shadow_lookup(unsigned long address);
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
- unsigned int size);
-void kmemcheck_shadow_set(void *shadow, unsigned int size);
-
-#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ base = alloc_pages(GFP_KERNEL, 0);
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
static int alloc_pte_page(pmd_t *pmd)
{
- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
if (!pte)
return -1;
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
static int alloc_pmd_page(pud_t *pud)
{
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
return -1;
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
pgd_entry = cpa->pgd + pgd_index(addr);
if (pgd_none(*pgd_entry)) {
- p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
if (!p4d)
return -1;
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
*/
p4d = p4d_offset(pgd_entry, addr);
if (p4d_none(*p4d)) {
- pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
return -1;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..96d456a94b03 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -7,7 +7,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index ffdbc4836b4f..174c59774cc9 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -592,7 +592,7 @@ enum __force_cpu_type {
static int force_cpu_type;
-static int set_cpu_type(const char *str, struct kernel_param *kp)
+static int set_cpu_type(const char *str, const struct kernel_param *kp)
{
if (!strcmp(str, "timer")) {
force_cpu_type = timer;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 4210da7b44de..1e996df687a3 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -636,3 +636,88 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+
+#define AMD_141b_MMIO_BASE(x) (0x80 + (x) * 0x8)
+#define AMD_141b_MMIO_BASE_RE_MASK BIT(0)
+#define AMD_141b_MMIO_BASE_WE_MASK BIT(1)
+#define AMD_141b_MMIO_BASE_MMIOBASE_MASK GENMASK(31,8)
+
+#define AMD_141b_MMIO_LIMIT(x) (0x84 + (x) * 0x8)
+#define AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK GENMASK(31,8)
+
+#define AMD_141b_MMIO_HIGH(x) (0x180 + (x) * 0x4)
+#define AMD_141b_MMIO_HIGH_MMIOBASE_MASK GENMASK(7,0)
+#define AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT 16
+#define AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK GENMASK(23,16)
+
+/*
+ * The PCI Firmware Spec, rev 3.2, notes that ACPI should optionally allow
+ * configuring host bridge windows using the _PRS and _SRS methods.
+ *
+ * But this is rarely implemented, so we manually enable a large 64bit BAR for
+ * PCIe device on AMD Family 15h (Models 00h-1fh, 30h-3fh, 60h-7fh) Processors
+ * here.
+ */
+static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
+{
+ unsigned i;
+ u32 base, limit, high;
+ struct resource *res, *conflict;
+
+ for (i = 0; i < 8; i++) {
+ pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base);
+ pci_read_config_dword(dev, AMD_141b_MMIO_HIGH(i), &high);
+
+ /* Is this slot free? */
+ if (!(base & (AMD_141b_MMIO_BASE_RE_MASK |
+ AMD_141b_MMIO_BASE_WE_MASK)))
+ break;
+
+ base >>= 8;
+ base |= high << 24;
+
+ /* Abort if a slot already configures a 64bit BAR. */
+ if (base > 0x10000)
+ return;
+ }
+ if (i == 8)
+ return;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return;
+
+ res->name = "PCI Bus 0000:00";
+ res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM |
+ IORESOURCE_MEM_64 | IORESOURCE_WINDOW;
+ res->start = 0x100000000ull;
+ res->end = 0xfd00000000ull - 1;
+
+ /* Just grab the free area behind system memory for this */
+ while ((conflict = request_resource_conflict(&iomem_resource, res)))
+ res->start = conflict->end + 1;
+
+ dev_info(&dev->dev, "adding root bus resource %pR\n", res);
+
+ base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) |
+ AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK;
+ limit = ((res->end + 1) >> 8) & AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK;
+ high = ((res->start >> 40) & AMD_141b_MMIO_HIGH_MMIOBASE_MASK) |
+ ((((res->end + 1) >> 40) << AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT)
+ & AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK);
+
+ pci_write_config_dword(dev, AMD_141b_MMIO_HIGH(i), high);
+ pci_write_config_dword(dev, AMD_141b_MMIO_LIMIT(i), limit);
+ pci_write_config_dword(dev, AMD_141b_MMIO_BASE(i), base);
+
+ pci_bus_add_resource(dev->bus, res, 0);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
+
+#endif
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 1012a5f0f98d..511921045312 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -280,7 +280,7 @@ static void intel_mid_pci_irq_disable(struct pci_dev *dev)
}
}
-static struct pci_ops intel_mid_pci_ops = {
+static const struct pci_ops intel_mid_pci_ops __initconst = {
.read = pci_read,
.write = pci_write,
};
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9e4ee5b04b2d..6a151ce70e86 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void)
if (efi_enabled(EFI_OLD_MEMMAP))
return 0;
- gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
+ gfp_mask = GFP_KERNEL | __GFP_ZERO;
efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
if (!efi_pgd)
return -ENOMEM;
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 809b6c812654..92ccc718152d 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -49,7 +49,7 @@
static struct gnttab_vm_area {
struct vm_struct *area;
pte_t **ptes;
-} gnttab_shared_vm_area;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
@@ -73,16 +73,43 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
return 0;
}
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
+{
+ grant_status_t *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_status_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
+ pte_t **ptes;
unsigned long addr;
unsigned long i;
+ if (shared == gnttab_status_vm_area.area->addr)
+ ptes = gnttab_status_vm_area.ptes;
+ else
+ ptes = gnttab_shared_vm_area.ptes;
+
addr = (unsigned long)shared;
for (i = 0; i < nr_gframes; i++) {
- set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
- __pte(0));
+ set_pte_at(&init_mm, addr, ptes[i], __pte(0));
addr += PAGE_SIZE;
}
}
@@ -102,12 +129,35 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
return 0;
}
-int arch_gnttab_init(unsigned long nr_shared)
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
{
+ free_vm_area(area->area);
+ kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+{
+ int ret;
+
if (!xen_pv_domain())
return 0;
- return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Always allocate the space for the status frames in case
+ * we're migrated to a host with V2 support.
+ */
+ ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ arch_gnttab_vfree(&gnttab_shared_vm_area);
+ return -ENOMEM;
}
#ifdef CONFIG_XEN_PVH
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3e15345abfe7..d33e7dbe3129 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -172,6 +172,9 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
pgprot_t prot, unsigned domid,
struct page **pages)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -EOPNOTSUPP;
+
return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
}
EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
@@ -182,6 +185,10 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
int *err_ptr, pgprot_t prot,
unsigned domid, struct page **pages)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
+ prot, domid, pages);
+
/* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
* and the consequences later is quite hard to detect what the actual
* cause of "wrong memory was mapped in".
@@ -193,9 +200,12 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
/* Returns: 0 success */
int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
- int numpgs, struct page **pages)
+ int nr, struct page **pages)
{
- if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return xen_xlate_unmap_gfn_range(vma, nr, pages);
+
+ if (!pages)
return 0;
return -EINVAL;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2ccdaba31a07..fc048ec686e7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -315,7 +315,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
if (val & _PAGE_PRESENT) {
- unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
unsigned long pfn = mfn_to_pfn(mfn);
pteval_t flags = val & PTE_FLAGS_MASK;
@@ -1721,7 +1721,7 @@ static unsigned long __init m2p(phys_addr_t maddr)
{
phys_addr_t paddr;
- maddr &= PTE_PFN_MASK;
+ maddr &= XEN_PTE_MFN_MASK;
paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
return paddr;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 92bf5ecb6baf..d9f96cc5d743 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -17,6 +17,8 @@
void xen_arch_pre_suspend(void)
{
+ xen_save_time_memory_area();
+
if (xen_pv_domain())
xen_pv_pre_suspend();
}
@@ -27,6 +29,8 @@ void xen_arch_post_suspend(int cancelled)
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
+
+ xen_restore_time_memory_area();
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 80c2a4bdf230..29163c43ebbd 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -75,7 +75,7 @@ static void xen_get_wallclock(struct timespec *now)
static int xen_set_wallclock(const struct timespec *now)
{
- return -1;
+ return -ENODEV;
}
static int xen_pvclock_gtod_notify(struct notifier_block *nb,
@@ -371,8 +371,95 @@ static const struct pv_time_ops xen_time_ops __initconst = {
.steal_clock = xen_steal_clock,
};
+static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
+
+void xen_save_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ if (!xen_clock)
+ return;
+
+ t.addr.v = NULL;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret != 0)
+ pr_notice("Cannot save secondary vcpu_time_info (err %d)",
+ ret);
+ else
+ clear_page(xen_clock);
+}
+
+void xen_restore_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ if (!xen_clock)
+ return;
+
+ t.addr.v = &xen_clock->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+
+ /*
+ * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the
+ * secondary time info with Xen or if we migrated to a host without the
+ * necessary flags. On both of these cases what happens is either
+ * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT
+ * bit set. Userspace checks the latter and if 0, it discards the data
+ * in pvti and fallbacks to a system call for a reliable timestamp.
+ */
+ if (ret != 0)
+ pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
+ ret);
+}
+
+static void xen_setup_vsyscall_time_info(void)
+{
+ struct vcpu_register_time_memory_area t;
+ struct pvclock_vsyscall_time_info *ti;
+ int ret;
+
+ ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
+ if (!ti)
+ return;
+
+ t.addr.v = &ti->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret) {
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret);
+ free_page((unsigned long)ti);
+ return;
+ }
+
+ /*
+ * If primary time info had this bit set, secondary should too since
+ * it's the same data on both just different memory regions. But we
+ * still check it in case hypervisor is buggy.
+ */
+ if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
+ t.addr.v = NULL;
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
+ 0, &t);
+ if (!ret)
+ free_page((unsigned long)ti);
+
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n");
+ return;
+ }
+
+ xen_clock = ti;
+ pvclock_set_pvti_cpu0_va(xen_clock);
+
+ xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK;
+}
+
static void __init xen_time_init(void)
{
+ struct pvclock_vcpu_time_info *pvti;
int cpu = smp_processor_id();
struct timespec tp;
@@ -396,6 +483,16 @@ static void __init xen_time_init(void)
setup_force_cpu_cap(X86_FEATURE_TSC);
+ /*
+ * We check ahead on the primary time info if this
+ * bit is supported hence speeding up Xen clocksource.
+ */
+ pvti = &__this_cpu_read(xen_vcpu)->time;
+ if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+ xen_setup_vsyscall_time_info();
+ }
+
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f377e1820c6c..75011b80660f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -70,6 +70,8 @@ void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
u64 xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
+void xen_save_time_memory_area(void);
+void xen_restore_time_memory_area(void);
void __init xen_init_time_ops(void);
void __init xen_hvm_init_time_ops(void);