summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig8
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/boot/compressed/misc.h4
-rw-r--r--arch/x86/events/amd/core.c140
-rw-r--r--arch/x86/events/core.c13
-rw-r--r--arch/x86/events/intel/core.c8
-rw-r--r--arch/x86/include/asm/bitops.h41
-rw-r--r--arch/x86/include/asm/cpufeature.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h10
-rw-r--r--arch/x86/include/asm/realmode.h6
-rw-r--r--arch/x86/include/asm/syscall.h142
-rw-r--r--arch/x86/include/asm/xen/hypercall.h3
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c3
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c6
-rw-r--r--arch/x86/kvm/hyperv.c9
-rw-r--r--arch/x86/kvm/mmu.c54
-rw-r--r--arch/x86/kvm/mmutrace.h4
-rw-r--r--arch/x86/kvm/svm.c54
-rw-r--r--arch/x86/kvm/vmx/nested.c79
-rw-r--r--arch/x86/kvm/vmx/vmx.c19
-rw-r--r--arch/x86/kvm/vmx/vmx.h1
-rw-r--r--arch/x86/kvm/x86.c59
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/platform/efi/quirks.c2
-rw-r--r--arch/x86/realmode/init.c11
25 files changed, 419 insertions, 272 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c1f9b3cf437c..5ad92419be19 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2217,14 +2217,8 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
If unsure, leave at the default value.
config HOTPLUG_CPU
- bool "Support for hot-pluggable CPUs"
+ def_bool y
depends on SMP
- ---help---
- Say Y here to allow turning CPUs off and on. CPUs can be
- controlled through /sys/devices/system/cpu.
- ( Note: power management support will enable this option
- automatically on SMP systems. )
- Say N if you want to disable CPU hotplug.
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d8b9d8ca4f8..a587805c6687 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -219,8 +219,12 @@ ifdef CONFIG_RETPOLINE
# Additionally, avoid generating expensive indirect jumps which
# are subject to retpolines for small number of switch cases.
# clang turns off jump table generation by default when under
- # retpoline builds, however, gcc does not for x86.
- KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20)
+ # retpoline builds, however, gcc does not for x86. This has
+ # only been fixed starting from gcc stable version 8.4.0 and
+ # onwards, but not for older ones. See gcc bug #86952.
+ ifndef CONFIG_CC_IS_CLANG
+ KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables)
+ endif
endif
archscripts: scripts_basic
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index fd13655e0f9b..d2f184165934 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -120,8 +120,6 @@ static inline void console_init(void)
void set_sev_encryption_mask(void);
-#endif
-
/* acpi.c */
#ifdef CONFIG_ACPI
acpi_physical_address get_rsdp_addr(void);
@@ -135,3 +133,5 @@ int count_immovable_mem_regions(void);
#else
static inline int count_immovable_mem_regions(void) { return 0; }
#endif
+
+#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 7d2d7c801dba..0ecfac84ba91 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -3,10 +3,14 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/delay.h>
#include <asm/apicdef.h>
+#include <asm/nmi.h>
#include "../perf_event.h"
+static DEFINE_PER_CPU(unsigned int, perf_nmi_counter);
+
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -429,6 +433,132 @@ static void amd_pmu_cpu_dead(int cpu)
}
}
+/*
+ * When a PMC counter overflows, an NMI is used to process the event and
+ * reset the counter. NMI latency can result in the counter being updated
+ * before the NMI can run, which can result in what appear to be spurious
+ * NMIs. This function is intended to wait for the NMI to run and reset
+ * the counter to avoid possible unhandled NMI messages.
+ */
+#define OVERFLOW_WAIT_COUNT 50
+
+static void amd_pmu_wait_on_overflow(int idx)
+{
+ unsigned int i;
+ u64 counter;
+
+ /*
+ * Wait for the counter to be reset if it has overflowed. This loop
+ * should exit very, very quickly, but just in case, don't wait
+ * forever...
+ */
+ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
+ rdmsrl(x86_pmu_event_addr(idx), counter);
+ if (counter & (1ULL << (x86_pmu.cntval_bits - 1)))
+ break;
+
+ /* Might be in IRQ context, so can't sleep */
+ udelay(1);
+ }
+}
+
+static void amd_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ x86_pmu_disable_all();
+
+ /*
+ * This shouldn't be called from NMI context, but add a safeguard here
+ * to return, since if we're in NMI context we can't wait for an NMI
+ * to reset an overflowed counter value.
+ */
+ if (in_nmi())
+ return;
+
+ /*
+ * Check each counter for overflow and wait for it to be reset by the
+ * NMI if it has overflowed. This relies on the fact that all active
+ * counters are always enabled when this function is called and
+ * ARCH_PERFMON_EVENTSEL_INT is always set.
+ */
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ amd_pmu_wait_on_overflow(idx);
+ }
+}
+
+static void amd_pmu_disable_event(struct perf_event *event)
+{
+ x86_pmu_disable_event(event);
+
+ /*
+ * This can be called from NMI context (via x86_pmu_stop). The counter
+ * may have overflowed, but either way, we'll never see it get reset
+ * by the NMI if we're already in the NMI. And the NMI latency support
+ * below will take care of any pending NMI that might have been
+ * generated by the overflow.
+ */
+ if (in_nmi())
+ return;
+
+ amd_pmu_wait_on_overflow(event->hw.idx);
+}
+
+/*
+ * Because of NMI latency, if multiple PMC counters are active or other sources
+ * of NMIs are received, the perf NMI handler can handle one or more overflowed
+ * PMC counters outside of the NMI associated with the PMC overflow. If the NMI
+ * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel
+ * back-to-back NMI support won't be active. This PMC handler needs to take into
+ * account that this can occur, otherwise this could result in unknown NMI
+ * messages being issued. Examples of this are PMC overflow while in the NMI
+ * handler when multiple PMCs are active or PMC overflow while handling some
+ * other source of an NMI.
+ *
+ * Attempt to mitigate this by using the number of active PMCs to determine
+ * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset
+ * any PMCs. The per-CPU perf_nmi_counter variable is set to a minimum of the
+ * number of active PMCs or 2. The value of 2 is used in case an NMI does not
+ * arrive at the LAPIC in time to be collapsed into an already pending NMI.
+ */
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int active, handled;
+
+ /*
+ * Obtain the active count before calling x86_pmu_handle_irq() since
+ * it is possible that x86_pmu_handle_irq() may make a counter
+ * inactive (through x86_pmu_stop).
+ */
+ active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX);
+
+ /* Process any counter overflows */
+ handled = x86_pmu_handle_irq(regs);
+
+ /*
+ * If a counter was handled, record the number of possible remaining
+ * NMIs that can occur.
+ */
+ if (handled) {
+ this_cpu_write(perf_nmi_counter,
+ min_t(unsigned int, 2, active));
+
+ return handled;
+ }
+
+ if (!this_cpu_read(perf_nmi_counter))
+ return NMI_DONE;
+
+ this_cpu_dec(perf_nmi_counter);
+
+ return NMI_HANDLED;
+}
+
static struct event_constraint *
amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
@@ -621,11 +751,11 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config)
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
+ .handle_irq = amd_pmu_handle_irq,
+ .disable_all = amd_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
- .disable = x86_pmu_disable_event,
+ .disable = amd_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
@@ -732,7 +862,7 @@ void amd_pmu_enable_virt(void)
cpuc->perf_ctr_virt_mask = 0;
/* Reload all events */
- x86_pmu_disable_all();
+ amd_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
@@ -750,7 +880,7 @@ void amd_pmu_disable_virt(void)
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
/* Reload all events */
- x86_pmu_disable_all();
+ amd_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e2b1447192a8..81911e11a15d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1349,8 +1349,9 @@ void x86_pmu_stop(struct perf_event *event, int flags)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+ if (test_bit(hwc->idx, cpuc->active_mask)) {
x86_pmu.disable(event);
+ __clear_bit(hwc->idx, cpuc->active_mask);
cpuc->events[hwc->idx] = NULL;
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
@@ -1447,16 +1448,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
apic_write(APIC_LVTPC, APIC_DM_NMI);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- if (!test_bit(idx, cpuc->active_mask)) {
- /*
- * Though we deactivated the counter some cpus
- * might still deliver spurious interrupts still
- * in flight. Catch them:
- */
- if (__test_and_clear_bit(idx, cpuc->running))
- handled++;
+ if (!test_bit(idx, cpuc->active_mask))
continue;
- }
event = cpuc->events[idx];
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 8baa441d8000..f61dcbef20ff 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3185,7 +3185,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
return ret;
if (event->attr.precise_ip) {
- if (!event->attr.freq) {
+ if (!(event->attr.freq || event->attr.wakeup_events)) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &
~intel_pmu_large_pebs_flags(event)))
@@ -3575,6 +3575,12 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = NULL;
+ if (x86_pmu.flags & PMU_FL_TFA) {
+ WARN_ON_ONCE(cpuc->tfa_shadow);
+ cpuc->tfa_shadow = ~0ULL;
+ intel_set_tfa(cpuc, false);
+ }
+
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index d153d570bb04..8e790ec219a5 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -36,16 +36,17 @@
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
*/
-#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#define RLONG_ADDR(x) "m" (*(volatile long *) (x))
+#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x))
-#define ADDR BITOP_ADDR(addr)
+#define ADDR RLONG_ADDR(addr)
/*
* We do the locked ops that don't return the old value as
* a mask operation on a byte.
*/
#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
-#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
+#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr) (1 << ((nr) & 7))
/**
@@ -73,7 +74,7 @@ set_bit(long nr, volatile unsigned long *addr)
: "memory");
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
- : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
@@ -88,7 +89,7 @@ set_bit(long nr, volatile unsigned long *addr)
*/
static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
{
- asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory");
+ asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
/**
@@ -110,8 +111,7 @@ clear_bit(long nr, volatile unsigned long *addr)
: "iq" ((u8)~CONST_MASK(nr)));
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
- : BITOP_ADDR(addr)
- : "Ir" (nr));
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
@@ -131,7 +131,7 @@ static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *ad
static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
{
- asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr));
+ asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
@@ -139,7 +139,7 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile
bool negative;
asm volatile(LOCK_PREFIX "andb %2,%1"
CC_SET(s)
- : CC_OUT(s) (negative), ADDR
+ : CC_OUT(s) (negative), WBYTE_ADDR(addr)
: "ir" ((char) ~(1 << nr)) : "memory");
return negative;
}
@@ -155,13 +155,9 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile
* __clear_bit() is non-atomic and implies release semantics before the memory
* operation. It can be used for an unlock if no other CPUs can concurrently
* modify other bits in the word.
- *
- * No memory barrier is required here, because x86 cannot reorder stores past
- * older loads. Same principle as spin_unlock.
*/
static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
{
- barrier();
__clear_bit(nr, addr);
}
@@ -176,7 +172,7 @@ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *
*/
static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
{
- asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr));
+ asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
/**
@@ -196,8 +192,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr)
: "iq" ((u8)CONST_MASK(nr)));
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
- : BITOP_ADDR(addr)
- : "Ir" (nr));
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
@@ -242,8 +237,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
asm(__ASM_SIZE(bts) " %2,%1"
CC_SET(c)
- : CC_OUT(c) (oldbit), ADDR
- : "Ir" (nr));
+ : CC_OUT(c) (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
return oldbit;
}
@@ -282,8 +277,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
asm volatile(__ASM_SIZE(btr) " %2,%1"
CC_SET(c)
- : CC_OUT(c) (oldbit), ADDR
- : "Ir" (nr));
+ : CC_OUT(c) (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
return oldbit;
}
@@ -294,8 +289,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
asm volatile(__ASM_SIZE(btc) " %2,%1"
CC_SET(c)
- : CC_OUT(c) (oldbit), ADDR
- : "Ir" (nr) : "memory");
+ : CC_OUT(c) (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
return oldbit;
}
@@ -326,7 +321,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
asm volatile(__ASM_SIZE(bt) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
- : "m" (*(unsigned long *)addr), "Ir" (nr));
+ : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
return oldbit;
}
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index ce95b8cbd229..0e56ff7e4848 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -112,8 +112,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
test_cpu_cap(c, bit))
#define this_cpu_has(bit) \
- (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
- x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ x86_this_cpu_test_bit(bit, \
+ (unsigned long __percpu *)&cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a5db4475e72d..159b5988292f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -253,14 +253,14 @@ struct kvm_mmu_memory_cache {
* kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
* by indirect shadow page can not be more than 15 bits.
*
- * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
+ * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
* @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
*/
union kvm_mmu_page_role {
u32 word;
struct {
unsigned level:4;
- unsigned cr4_pae:1;
+ unsigned gpte_is_8_bytes:1;
unsigned quadrant:2;
unsigned direct:1;
unsigned access:3;
@@ -350,6 +350,7 @@ struct kvm_mmu_page {
};
struct kvm_pio_request {
+ unsigned long linear_rip;
unsigned long count;
int in;
int port;
@@ -568,6 +569,7 @@ struct kvm_vcpu_arch {
bool tpr_access_reporting;
u64 ia32_xss;
u64 microcode_version;
+ u64 arch_capabilities;
/*
* Paging state of the vcpu
@@ -1192,6 +1194,8 @@ struct kvm_x86_ops {
int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
+
+ bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1252,7 +1256,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 63b3393bd98e..c53682303c9c 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -77,7 +77,11 @@ static inline size_t real_mode_size_needed(void)
return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
}
-void set_real_mode_mem(phys_addr_t mem, size_t size);
+static inline void set_real_mode_mem(phys_addr_t mem)
+{
+ real_mode_header = (struct real_mode_header *) __va(mem);
+}
+
void reserve_real_mode(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d653139857af..4c305471ec33 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -91,11 +91,9 @@ static inline void syscall_set_return_value(struct task_struct *task,
static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
- unsigned int i, unsigned int n,
unsigned long *args)
{
- BUG_ON(i + n > 6);
- memcpy(args, &regs->bx + i, n * sizeof(args[0]));
+ memcpy(args, &regs->bx, 6 * sizeof(args[0]));
}
static inline void syscall_set_arguments(struct task_struct *task,
@@ -116,124 +114,50 @@ static inline int syscall_get_arch(void)
static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
- unsigned int i, unsigned int n,
unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task->thread_info.status & TS_COMPAT)
- switch (i) {
- case 0:
- if (!n--) break;
- *args++ = regs->bx;
- case 1:
- if (!n--) break;
- *args++ = regs->cx;
- case 2:
- if (!n--) break;
- *args++ = regs->dx;
- case 3:
- if (!n--) break;
- *args++ = regs->si;
- case 4:
- if (!n--) break;
- *args++ = regs->di;
- case 5:
- if (!n--) break;
- *args++ = regs->bp;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
- else
+ if (task->thread_info.status & TS_COMPAT) {
+ *args++ = regs->bx;
+ *args++ = regs->cx;
+ *args++ = regs->dx;
+ *args++ = regs->si;
+ *args++ = regs->di;
+ *args = regs->bp;
+ } else
# endif
- switch (i) {
- case 0:
- if (!n--) break;
- *args++ = regs->di;
- case 1:
- if (!n--) break;
- *args++ = regs->si;
- case 2:
- if (!n--) break;
- *args++ = regs->dx;
- case 3:
- if (!n--) break;
- *args++ = regs->r10;
- case 4:
- if (!n--) break;
- *args++ = regs->r8;
- case 5:
- if (!n--) break;
- *args++ = regs->r9;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
+ {
+ *args++ = regs->di;
+ *args++ = regs->si;
+ *args++ = regs->dx;
+ *args++ = regs->r10;
+ *args++ = regs->r8;
+ *args = regs->r9;
+ }
}
static inline void syscall_set_arguments(struct task_struct *task,
struct pt_regs *regs,
- unsigned int i, unsigned int n,
const unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task->thread_info.status & TS_COMPAT)
- switch (i) {
- case 0:
- if (!n--) break;
- regs->bx = *args++;
- case 1:
- if (!n--) break;
- regs->cx = *args++;
- case 2:
- if (!n--) break;
- regs->dx = *args++;
- case 3:
- if (!n--) break;
- regs->si = *args++;
- case 4:
- if (!n--) break;
- regs->di = *args++;
- case 5:
- if (!n--) break;
- regs->bp = *args++;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
- else
+ if (task->thread_info.status & TS_COMPAT) {
+ regs->bx = *args++;
+ regs->cx = *args++;
+ regs->dx = *args++;
+ regs->si = *args++;
+ regs->di = *args++;
+ regs->bp = *args;
+ } else
# endif
- switch (i) {
- case 0:
- if (!n--) break;
- regs->di = *args++;
- case 1:
- if (!n--) break;
- regs->si = *args++;
- case 2:
- if (!n--) break;
- regs->dx = *args++;
- case 3:
- if (!n--) break;
- regs->r10 = *args++;
- case 4:
- if (!n--) break;
- regs->r8 = *args++;
- case 5:
- if (!n--) break;
- regs->r9 = *args++;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
+ {
+ regs->di = *args++;
+ regs->si = *args++;
+ regs->dx = *args++;
+ regs->r10 = *args++;
+ regs->r8 = *args++;
+ regs->r9 = *args;
+ }
}
static inline int syscall_get_arch(void)
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index de6f0d59a24f..2863c2026655 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -206,6 +206,9 @@ xen_single_call(unsigned int call,
__HYPERCALL_DECLS;
__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+ if (call >= PAGE_SIZE / sizeof(hypercall_page[0]))
+ return -EINVAL;
+
asm volatile(CALL_NOSPEC
: __HYPERCALL_5PARAM
: [thunk_target] "a" (&hypercall_page[call])
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f33f11f69078..1573a0a6b525 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -501,11 +501,8 @@ out_unlock:
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
- struct rdt_resource *r;
int cpu;
- r = &rdt_resources_all[RDT_RESOURCE_L3];
-
cpu = cpumask_any(&dom->cpu_mask);
dom->cqm_work_cpu = cpu;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 399601eda8e4..54b9eef3eea9 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2039,14 +2039,14 @@ out:
enum rdt_param {
Opt_cdp,
Opt_cdpl2,
- Opt_mba_mpbs,
+ Opt_mba_mbps,
nr__rdt_params
};
static const struct fs_parameter_spec rdt_param_specs[] = {
fsparam_flag("cdp", Opt_cdp),
fsparam_flag("cdpl2", Opt_cdpl2),
- fsparam_flag("mba_mpbs", Opt_mba_mpbs),
+ fsparam_flag("mba_MBps", Opt_mba_mbps),
{}
};
@@ -2072,7 +2072,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_cdpl2:
ctx->enable_cdpl2 = true;
return 0;
- case Opt_mba_mpbs:
+ case Opt_mba_mbps:
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return -EINVAL;
ctx->enable_mba_mbps = true;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 27c43525a05f..421899f6ad7b 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -526,7 +526,9 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
new_config.enable = 0;
stimer->config.as_uint64 = new_config.as_uint64;
- stimer_mark_pending(stimer, false);
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
@@ -542,7 +544,10 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
stimer->config.enable = 0;
else if (stimer->config.auto_enable)
stimer->config.enable = 1;
- stimer_mark_pending(stimer, false);
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7837ab001d80..eee455a8a612 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,7 +182,7 @@ struct kvm_shadow_walk_iterator {
static const union kvm_mmu_page_role mmu_base_role_mask = {
.cr0_wp = 1,
- .cr4_pae = 1,
+ .gpte_is_8_bytes = 1,
.nxe = 1,
.smep_andnot_wp = 1,
.smap_andnot_wp = 1,
@@ -2205,6 +2205,7 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
+
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -2215,12 +2216,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
for_each_valid_sp(_kvm, _sp, _gfn) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+static inline bool is_ept_sp(struct kvm_mmu_page *sp)
+{
+ return sp->role.cr0_wp && sp->role.smap_andnot_wp;
+}
+
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (sp->role.cr4_pae != !!is_pae(vcpu)
- || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
+ vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -2423,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.level = level;
role.direct = direct;
if (role.direct)
- role.cr4_pae = 0;
+ role.gpte_is_8_bytes = true;
role.access = access;
if (!vcpu->arch.mmu->direct_map
&& vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
@@ -4794,7 +4800,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
role.base.access = ACC_ALL;
role.base.nxe = !!is_nx(vcpu);
- role.base.cr4_pae = !!is_pae(vcpu);
role.base.cr0_wp = is_write_protection(vcpu);
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
@@ -4815,6 +4820,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
role.base.direct = true;
+ role.base.gpte_is_8_bytes = true;
return role;
}
@@ -4879,6 +4885,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.smap_andnot_wp = role.ext.cr4_smap &&
!is_write_protection(vcpu);
role.base.direct = !is_paging(vcpu);
+ role.base.gpte_is_8_bytes = !!is_pae(vcpu);
if (!is_long_mode(vcpu))
role.base.level = PT32E_ROOT_LEVEL;
@@ -4918,18 +4925,26 @@ static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
bool execonly)
{
- union kvm_mmu_role role;
+ union kvm_mmu_role role = {0};
- /* Base role is inherited from root_mmu */
- role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ /* SMM flag is inherited from root_mmu */
+ role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
role.base.level = PT64_ROOT_4LEVEL;
+ role.base.gpte_is_8_bytes = true;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
role.base.access = ACC_ALL;
+ /*
+ * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
+ * SMAP variation to denote shadow EPT entries.
+ */
+ role.base.cr0_wp = true;
+ role.base.smap_andnot_wp = true;
+
+ role.ext = kvm_calc_mmu_role_ext(vcpu);
role.ext.execonly = execonly;
return role;
@@ -5179,7 +5194,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
- pte_size = sp->role.cr4_pae ? 8 : 4;
+ pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
/*
* Sometimes, the OS only writes the last one bytes to update status
@@ -5203,7 +5218,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
page_offset = offset_in_page(gpa);
level = sp->role.level;
*nspte = 1;
- if (!sp->role.cr4_pae) {
+ if (!sp->role.gpte_is_8_bytes) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -5393,10 +5408,12 @@ emulate:
* This can happen if a guest gets a page-fault on data access but the HW
* table walker is not able to read the instruction page (e.g instruction
* page is not present in memory). In those cases we simply restart the
- * guest.
+ * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
*/
- if (unlikely(insn && !insn_len))
- return 1;
+ if (unlikely(insn && !insn_len)) {
+ if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
+ return 1;
+ }
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
@@ -5509,7 +5526,9 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_with_address(kvm,
+ start_gfn,
+ iterator.gfn - start_gfn + 1);
flush = false;
}
cond_resched_lock(&kvm->mmu_lock);
@@ -5517,7 +5536,8 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
+ end_gfn - start_gfn + 1);
flush = false;
}
@@ -6011,7 +6031,7 @@ out:
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 9f6c855a0043..dd30dccd2ad5 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -29,10 +29,10 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \
+ trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
__entry->gfn, role.level, \
- role.cr4_pae ? " pae" : "", \
+ role.gpte_is_8_bytes ? 8 : 4, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b5b128a0a051..e0a791c3d4fc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6422,11 +6422,11 @@ e_free:
return ret;
}
-static int get_num_contig_pages(int idx, struct page **inpages,
- unsigned long npages)
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
{
unsigned long paddr, next_paddr;
- int i = idx + 1, pages = 1;
+ unsigned long i = idx + 1, pages = 1;
/* find the number of contiguous pages starting from idx */
paddr = __sme_page_pa(inpages[idx]);
@@ -6445,12 +6445,12 @@ static int get_num_contig_pages(int idx, struct page **inpages,
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
struct sev_data_launch_update_data *data;
struct page **inpages;
- int i, ret, pages;
+ int ret;
if (!sev_guest(kvm))
return -ENOTTY;
@@ -6799,7 +6799,8 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
struct page **src_p, **dst_p;
struct kvm_sev_dbg debug;
unsigned long n;
- int ret, size;
+ unsigned int size;
+ int ret;
if (!sev_guest(kvm))
return -ENOTTY;
@@ -6807,6 +6808,11 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
return -EFAULT;
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
vaddr = debug.src_uaddr;
size = debug.len;
vaddr_end = vaddr + size;
@@ -6857,8 +6863,8 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
dst_vaddr,
len, &argp->error);
- sev_unpin_memory(kvm, src_p, 1);
- sev_unpin_memory(kvm, dst_p, 1);
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
if (ret)
goto err;
@@ -7098,6 +7104,36 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return -ENODEV;
}
+static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ bool is_user, smap, smep;
+
+ is_user = svm_get_cpl(vcpu) == 3;
+ smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+ smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ /*
+ * Detect and work around Erratum 1096 Fam_17h_00_0Fh. It needs the
+ * guest's CR4.SMAP to be set and the instruction to sit in a user
+ * page, which means CPL 3, or any CPL when CR4.SMEP is clear.
+ *
+ * In a non-SEV guest the hypervisor can read guest memory and decode
+ * the instruction itself when insn_len is zero, so return true to
+ * say that emulation is possible.
+ *
+ * In an SEV guest the memory is encrypted with a guest-specific key,
+ * so decoding is impossible: log the error and kill the guest.
+ */
+ if (smap && (!smep || is_user)) {
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n");
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7231,6 +7267,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.nested_enable_evmcs = nested_enable_evmcs,
.nested_get_evmcs_version = nested_get_evmcs_version,
+
+ .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f24a2c225070..7ec9bb1dd723 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -500,6 +500,17 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap[word] = ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+}
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
@@ -541,39 +552,44 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
return false;
msr_bitmap_l1 = (unsigned long *)kmap(page);
- if (nested_cpu_has_apic_reg_virt(vmcs12)) {
- /*
- * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it
- * just lets the processor take the value from the virtual-APIC page;
- * take those 256 bits directly from the L1 bitmap.
- */
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap_l0[word] = msr_bitmap_l1[word];
- msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
- }
- } else {
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap_l0[word] = ~0;
- msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
- }
- }
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_TASKPRI),
- MSR_TYPE_W);
+ /*
+ * To keep the control flow simple, pay eight 8-byte writes (sixteen
+ * 4-byte writes on 32-bit systems) up front to enable intercepts for
+ * the x2APIC MSR range and selectively disable them below.
+ */
+ enable_x2apic_msr_intercepts(msr_bitmap_l0);
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+ * L0 need not intercept reads for MSRs between 0x800
+ * and 0x8ff, it just lets the processor take the value
+ * from the virtual-APIC page; take those 256 bits
+ * directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ }
+ }
- if (nested_cpu_has_vid(vmcs12)) {
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_EOI),
- MSR_TYPE_W);
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_SELF_IPI),
- MSR_TYPE_W);
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
}
if (spec_ctrl)
@@ -2585,6 +2601,11 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
return -EINVAL;
+
+ if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
+ is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c73375e01ab8..ab432a930ae8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1683,12 +1683,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
- return 1;
- msr_info->data = to_vmx(vcpu)->arch_capabilities;
- break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -1895,11 +1889,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- vmx->arch_capabilities = data;
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -4088,8 +4077,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- vmx->arch_capabilities = kvm_get_arch_capabilities();
-
vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
@@ -7409,6 +7396,11 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
@@ -7711,6 +7703,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.set_nested_state = NULL,
.get_vmcs12_pages = NULL,
.nested_enable_evmcs = NULL,
+ .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
};
static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1554cb45b393..a1e00d0a2482 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -190,7 +190,6 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
- u64 arch_capabilities;
u64 spec_ctrl;
u32 vm_entry_controls_shadow;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 65e4559eef2f..099b851dabaf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1125,7 +1125,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_SPEC_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -1158,6 +1158,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
@@ -2443,6 +2444,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
@@ -2747,6 +2753,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
case MSR_IA32_TSC:
msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
break;
@@ -6523,14 +6535,27 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
- /* do not return to emulator after return from userspace */
- vcpu->arch.pio.count = 0;
+
+ if (!ret) {
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
return ret;
}
@@ -6541,6 +6566,11 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
/* For size less than 4 we merge, else we zero extend */
val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
: 0;
@@ -6553,7 +6583,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
vcpu->arch.pio.port, &val, 1);
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
@@ -6572,6 +6602,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -6579,16 +6610,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
{
- int ret = kvm_skip_emulated_instruction(vcpu);
+ int ret;
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
if (in)
- return kvm_fast_pio_in(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_in(vcpu, size, port);
else
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_fast_pio);
@@ -8733,6 +8761,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
@@ -9429,13 +9458,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- int nr_mmu_pages = 0;
-
if (!kvm->arch.n_requested_mmu_pages)
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-
- if (nr_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ kvm_mmu_change_mmu_pages(kvm,
+ kvm_mmu_calculate_default_mmu_pages(kvm));
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index db3165714521..dc726e07d8ba 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -230,7 +230,7 @@ bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
/* Can we access it for direct reading/writing? Must be RAM: */
int valid_phys_addr_range(phys_addr_t addr, size_t count)
{
- return addr + count <= __pa(high_memory);
+ return addr + count - 1 <= __pa(high_memory - 1);
}
/* Can we access it through mmap? Must be a valid physical address: */
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 458a0e2bcc57..a25a9fd987a9 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -449,7 +449,7 @@ void __init efi_free_boot_services(void)
*/
rm_size = real_mode_size_needed();
if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
- set_real_mode_mem(start, rm_size);
+ set_real_mode_mem(start);
start += rm_size;
size -= rm_size;
}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index d10105825d57..7dce39c8c034 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -15,15 +15,6 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
-void __init set_real_mode_mem(phys_addr_t mem, size_t size)
-{
- void *base = __va(mem);
-
- real_mode_header = (struct real_mode_header *) base;
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- base, (unsigned long long)mem, size);
-}
-
void __init reserve_real_mode(void)
{
phys_addr_t mem;
@@ -42,7 +33,7 @@ void __init reserve_real_mode(void)
}
memblock_reserve(mem, size);
- set_real_mode_mem(mem, size);
+ set_real_mode_mem(mem);
}
static void __init setup_real_mode(void)