summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig6
-rw-r--r--arch/x86/boot/compressed/head_64.S10
-rw-r--r--arch/x86/boot/compressed/pgtable.h2
-rw-r--r--arch/x86/events/core.c14
-rw-r--r--arch/x86/events/intel/core.c25
-rw-r--r--arch/x86/events/intel/uncore_snbep.c4
-rw-r--r--arch/x86/events/perf_event.h16
-rw-r--r--arch/x86/ia32/ia32_aout.c6
-rw-r--r--arch/x86/include/asm/intel-family.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/include/asm/page_64_types.h4
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/resctrl_sched.h4
-rw-r--r--arch/x86/include/asm/uv/bios.h8
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/bugs.c2
-rw-r--r--arch/x86/kernel/cpu/mce/core.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile4
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c3
-rw-r--r--arch/x86/kvm/cpuid.c4
-rw-r--r--arch/x86/kvm/mmu.c18
-rw-r--r--arch/x86/kvm/vmx/nested.c13
-rw-r--r--arch/x86/kvm/vmx/vmx.c32
-rw-r--r--arch/x86/kvm/vmx/vmx.h10
-rw-r--r--arch/x86/kvm/x86.c9
-rw-r--r--arch/x86/lib/iomem.c33
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/pageattr.c50
-rw-r--r--arch/x86/platform/uv/bios_uv.c23
30 files changed, 231 insertions, 85 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 26387c7bf305..68261430fe6e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -446,12 +446,12 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
-config X86_RESCTRL
- bool "Resource Control support"
+config X86_CPU_RESCTRL
+ bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select KERNFS
help
- Enable Resource Control support.
+ Enable x86 CPU resource control support.
Provide support for the allocation and monitoring of system resources
usage by the CPU.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 64037895b085..f62e347862cc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -600,6 +600,16 @@ ENTRY(trampoline_32bit_src)
leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
movl %eax, %cr3
3:
+ /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
+ pushl %ecx
+ pushl %edx
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+ popl %edx
+ popl %ecx
+
/* Enable PAE and LA57 (if required) paging modes */
movl $X86_CR4_PAE, %eax
cmpl $0, %edx
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 91f75638f6e6..6ff7e81b5628 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x60
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 374a19712e20..b684f0294f35 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2278,6 +2278,19 @@ void perf_check_microcode(void)
x86_pmu.check_microcode();
}
+static int x86_pmu_check_period(struct perf_event *event, u64 value)
+{
+ if (x86_pmu.check_period && x86_pmu.check_period(event, value))
+ return -EINVAL;
+
+ if (value && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, value) > value)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable = x86_pmu_disable,
@@ -2302,6 +2315,7 @@ static struct pmu pmu = {
.event_idx = x86_pmu_event_idx,
.sched_task = x86_pmu_sched_task,
.task_ctx_size = sizeof(struct x86_perf_task_context),
+ .check_period = x86_pmu_check_period,
};
void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 40e12cfc87f6..730978dff63f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3559,6 +3559,14 @@ static void free_excl_cntrs(int cpu)
static void intel_pmu_cpu_dying(int cpu)
{
+ fini_debug_store_on_cpu(cpu);
+
+ if (x86_pmu.counter_freezing)
+ disable_counter_freeze();
+}
+
+static void intel_pmu_cpu_dead(int cpu)
+{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
struct intel_shared_regs *pc;
@@ -3570,11 +3578,6 @@ static void intel_pmu_cpu_dying(int cpu)
}
free_excl_cntrs(cpu);
-
- fini_debug_store_on_cpu(cpu);
-
- if (x86_pmu.counter_freezing)
- disable_counter_freeze();
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3584,6 +3587,11 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx,
intel_pmu_lbr_sched_task(ctx, sched_in);
}
+static int intel_pmu_check_period(struct perf_event *event, u64 value)
+{
+ return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
+}
+
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -3663,6 +3671,9 @@ static __initconst const struct x86_pmu core_pmu = {
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
+ .check_period = intel_pmu_check_period,
};
static struct attribute *intel_pmu_attrs[];
@@ -3703,8 +3714,12 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_sched_task,
+
+ .check_period = intel_pmu_check_period,
};
static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index c07bee31abe8..b10e04387f38 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1222,6 +1222,8 @@ static struct pci_driver snbep_uncore_pci_driver = {
.id_table = snbep_uncore_pci_ids,
};
+#define NODE_ID_MASK 0x7
+
/*
* build pci bus to socket mapping
*/
@@ -1243,7 +1245,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
if (err)
break;
- nodeid = config;
+ nodeid = config & NODE_ID_MASK;
/* get the Node ID mapping */
err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
if (err)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 78d7b7031bfc..d46fd6754d92 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -646,6 +646,11 @@ struct x86_pmu {
* Intel host/guest support (KVM)
*/
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+
+ /*
+ * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
+ */
+ int (*check_period) (struct perf_event *event, u64 period);
};
struct x86_perf_task_context {
@@ -857,7 +862,7 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
-static inline bool intel_pmu_has_bts(struct perf_event *event)
+static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int hw_event, bts_event;
@@ -868,7 +873,14 @@ static inline bool intel_pmu_has_bts(struct perf_event *event)
hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
- return hw_event == bts_event && hwc->sample_period == 1;
+ return hw_event == bts_event && period == 1;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return intel_pmu_has_bts_period(event, hwc->sample_period);
}
int intel_pmu_save_and_restart(struct perf_event *event);
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f65b78d32f5e..7dbbe9ffda17 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -51,7 +51,7 @@ static unsigned long get_dr(int n)
/*
* fill in the user structure for a core dump..
*/
-static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
+static void fill_dump(struct pt_regs *regs, struct user32 *dump)
{
u32 fs, gs;
memset(dump, 0, sizeof(*dump));
@@ -157,10 +157,12 @@ static int aout_core_dump(struct coredump_params *cprm)
fs = get_fs();
set_fs(KERNEL_DS);
has_dumped = 1;
+
+ fill_dump(cprm->regs, &dump);
+
strncpy(dump.u_comm, current->comm, sizeof(current->comm));
dump.u_ar0 = offsetof(struct user32, regs);
dump.signal = cprm->siginfo->si_signo;
- dump_thread32(cprm->regs, &dump);
/*
* If the size of the dump file exceeds the rlimit, then see
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 0dd6b0f4000e..9f15384c504a 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -6,7 +6,7 @@
* "Big Core" Processors (Branded as Core, Xeon, etc...)
*
* The "_X" parts are generally the EP and EX Xeons, or the
- * "Extreme" ones, like Broadwell-E.
+ * "Extreme" ones, like Broadwell-E, or Atom microserver.
*
* While adding a new CPUID for a new microarchitecture, add a new
* group to keep logically sorted out in chronological order. Within
@@ -52,6 +52,8 @@
#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66
+#define INTEL_FAM6_ICELAKE_MOBILE 0x7E
+
/* "Small Core" Processors (Atom) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
@@ -71,6 +73,7 @@
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4660ce90de7f..180373360e34 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -299,6 +299,7 @@ union kvm_mmu_extended_role {
unsigned int cr4_smap:1;
unsigned int cr4_smep:1;
unsigned int cr4_la57:1;
+ unsigned int maxphyaddr:6;
};
};
@@ -397,6 +398,7 @@ struct kvm_mmu {
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte);
hpa_t root_hpa;
+ gpa_t root_cr3;
union kvm_mmu_role mmu_role;
u8 root_level;
u8 shadow_root_level;
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 8f657286d599..0ce558a8150d 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -7,7 +7,11 @@
#endif
#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_EXTRA
+#define KASAN_STACK_ORDER 2
+#else
#define KASAN_STACK_ORDER 1
+#endif
#else
#define KASAN_STACK_ORDER 0
#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e805292..2779ace16d23 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1065,7 +1065,7 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- native_set_pmd(pmdp, pmd);
+ set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl_sched.h
index 40ebddde6ac2..f6b7fe2833cc 100644
--- a/arch/x86/include/asm/resctrl_sched.h
+++ b/arch/x86/include/asm/resctrl_sched.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_RESCTRL_SCHED_H
#define _ASM_X86_RESCTRL_SCHED_H
-#ifdef CONFIG_X86_RESCTRL
+#ifdef CONFIG_X86_CPU_RESCTRL
#include <linux/sched.h>
#include <linux/jump_label.h>
@@ -88,6 +88,6 @@ static inline void resctrl_sched_in(void)
static inline void resctrl_sched_in(void) {}
-#endif /* CONFIG_X86_RESCTRL */
+#endif /* CONFIG_X86_CPU_RESCTRL */
#endif /* _ASM_X86_RESCTRL_SCHED_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index e652a7cc6186..3f697a9e3f59 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -48,7 +48,8 @@ enum {
BIOS_STATUS_SUCCESS = 0,
BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
BIOS_STATUS_EINVAL = -EINVAL,
- BIOS_STATUS_UNAVAIL = -EBUSY
+ BIOS_STATUS_UNAVAIL = -EBUSY,
+ BIOS_STATUS_ABORT = -EINTR,
};
/* Address map parameters */
@@ -167,4 +168,9 @@ extern long system_serial_number;
extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
+/*
+ * EFI runtime lock; cf. firmware/efi/runtime-wrappers.c for details
+ */
+extern struct semaphore __efi_uv_runtime_lock;
+
#endif /* _ASM_X86_UV_BIOS_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index b6fa0869f7aa..cfd24f9f7614 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
-obj-$(CONFIG_X86_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1de0f4170178..01874d54f4fd 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -71,7 +71,7 @@ void __init check_bugs(void)
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
*/
- cpu_smt_check_topology_early();
+ cpu_smt_check_topology();
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 672c7225cb1b..6ce290c506d9 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -784,6 +784,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
quirk_no_way_out(i, m, regs);
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ m->bank = i;
mce_read_aux(m, i);
*msg = tmp;
return 1;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 51adde0a0f1a..e1f3ba19ba54 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -855,7 +855,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
if (!p) {
return ret;
} else {
- if (boot_cpu_data.microcode == p->patch_id)
+ if (boot_cpu_data.microcode >= p->patch_id)
return ret;
ret = UCODE_NEW;
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 1cabe6fd8e11..4a06c37b9cf1 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_X86_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_X86_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 0d5efa34f359..53917a3ebf94 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -167,6 +167,9 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
struct efi_info *current_ei = &boot_params.efi_info;
struct efi_info *ei = &params->efi_info;
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return 0;
+
if (!current_ei->efi_memmap_size)
return 0;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbffa6c54697..c07958b59f50 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+ unsigned f_la57 = 0;
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -489,7 +490,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
// TSC_ADJUST is emulated
entry->ebx |= F(TSC_ADJUST);
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+ f_la57 = entry->ecx & F(LA57);
cpuid_mask(&entry->ecx, CPUID_7_ECX);
+ /* Set LA57 based on hardware capability. */
+ entry->ecx |= f_la57;
entry->ecx |= f_umip;
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da9c42349b1f..f2d1d230d5b8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3555,6 +3555,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
&invalid_list);
mmu->root_hpa = INVALID_PAGE;
}
+ mmu->root_cr3 = 0;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3610,6 +3611,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
} else
BUG();
+ vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
return 0;
}
@@ -3618,10 +3620,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
u64 pdptr, pm_mask;
- gfn_t root_gfn;
+ gfn_t root_gfn, root_cr3;
int i;
- root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT;
+ root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+ root_gfn = root_cr3 >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
@@ -3646,7 +3649,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu->root_hpa = root;
- return 0;
+ goto set_root_cr3;
}
/*
@@ -3712,6 +3715,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
}
+set_root_cr3:
+ vcpu->arch.mmu->root_cr3 = root_cr3;
+
return 0;
}
@@ -4163,7 +4169,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
struct kvm_mmu_root_info root;
struct kvm_mmu *mmu = vcpu->arch.mmu;
- root.cr3 = mmu->get_cr3(vcpu);
+ root.cr3 = mmu->root_cr3;
root.hpa = mmu->root_hpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
@@ -4176,6 +4182,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
}
mmu->root_hpa = root.hpa;
+ mmu->root_cr3 = root.cr3;
return i < KVM_MMU_NUM_PREV_ROOTS;
}
@@ -4770,6 +4777,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
ext.cr4_pse = !!is_pse(vcpu);
ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+ ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
ext.valid = 1;
@@ -5516,11 +5524,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.root_mmu.root_cr3 = 0;
vcpu->arch.root_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.guest_mmu.root_cr3 = 0;
vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8ff20523661b..d737a51a53ca 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -211,6 +211,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
+ hrtimer_cancel(&vmx->nested.preemption_timer);
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
free_vpid(vmx->nested.vpid02);
@@ -2472,6 +2473,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
return -EINVAL;
+ if (!nested_cpu_has_preemption_timer(vmcs12) &&
+ nested_cpu_has_save_preemption_timer(vmcs12))
+ return -EINVAL;
+
if (nested_cpu_has_ept(vmcs12) &&
!valid_ept_address(vcpu, vmcs12->ept_pointer))
return -EINVAL;
@@ -5556,9 +5561,11 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
* secondary cpu-based controls. Do not include those that
* depend on CPUID bits, they are added later by vmx_cpuid_update.
*/
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- msrs->secondary_ctls_low,
- msrs->secondary_ctls_high);
+ if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4341175339f3..30a6bcd735ec 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -26,6 +26,7 @@
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
@@ -862,7 +863,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
if (!entry_only)
j = find_msr(&m->host, msr);
- if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
+ if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
+ (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
@@ -1192,21 +1194,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
- /*
- * First handle the simple case where no cmpxchg is necessary; just
- * allow posting non-urgent interrupts.
- *
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block will do it for us and the wakeup_handler
- * expects the VCPU to be on the blocked_vcpu_list that matches
- * PI.NDST.
- */
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
- vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- return;
- }
-
/* The full case. */
do {
old.control = new.control = pi_desc->control;
@@ -1221,6 +1208,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
new.sn = 0;
} while (cmpxchg64(&pi_desc->control, old.control,
new.control) != old.control);
+
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+ * spec), so it doesn't really have a memory barrier that
+ * pairs with this, but we cannot do that and we need one.
+ */
+ smp_mb__after_atomic();
+
+ if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+ pi_set_on(pi_desc);
}
/*
@@ -6823,7 +6821,7 @@ static int vmx_vm_init(struct kvm *kvm)
* Warn upon starting the first VM in a potentially
* insecure environment.
*/
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (sched_smt_active())
pr_warn_once(L1TF_MSG_SMT);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
pr_warn_once(L1TF_MSG_L1D);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 99328954c2fc..0ac0a64c7790 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -337,16 +337,16 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
+static inline void pi_set_sn(struct pi_desc *pi_desc)
{
- return clear_bit(POSTED_INTR_SN,
+ return set_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
-static inline void pi_set_sn(struct pi_desc *pi_desc)
+static inline void pi_set_on(struct pi_desc *pi_desc)
{
- return set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
+ set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
}
static inline void pi_clear_on(struct pi_desc *pi_desc)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d27206f6c01..941f932373d0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5116,6 +5116,13 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
exception);
}
@@ -7794,7 +7801,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* 1) We should set ->mode before checking ->requests. Please see
* the comment in kvm_vcpu_exiting_guest_mode().
*
- * 2) For APICv, we should set ->mode before checking PIR.ON. This
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
* pairs with the memory barrier implicit in pi_test_and_set_on
* (see vmx_deliver_posted_interrupt).
*
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index 66894675f3c8..df50451d94ef 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -2,8 +2,11 @@
#include <linux/module.h>
#include <linux/io.h>
+#define movs(type,to,from) \
+ asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory")
+
/* Originally from i386/string.h */
-static __always_inline void __iomem_memcpy(void *to, const void *from, size_t n)
+static __always_inline void rep_movs(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
asm volatile("rep ; movsl\n\t"
@@ -21,13 +24,37 @@ static __always_inline void __iomem_memcpy(void *to, const void *from, size_t n)
void memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
{
- __iomem_memcpy(to, (const void *)from, n);
+ if (unlikely(!n))
+ return;
+
+ /* Align any unaligned source IO */
+ if (unlikely(1 & (unsigned long)from)) {
+ movs("b", to, from);
+ n--;
+ }
+ if (n > 1 && unlikely(2 & (unsigned long)from)) {
+ movs("w", to, from);
+ n-=2;
+ }
+ rep_movs(to, (const void *)from, n);
}
EXPORT_SYMBOL(memcpy_fromio);
void memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
{
- __iomem_memcpy((void *)to, (const void *) from, n);
+ if (unlikely(!n))
+ return;
+
+ /* Align any unaligned destination IO */
+ if (unlikely(1 & (unsigned long)to)) {
+ movs("b", to, from);
+ n--;
+ }
+ if (n > 1 && unlikely(2 & (unsigned long)to)) {
+ movs("w", to, from);
+ n-=2;
+ }
+ rep_movs((void *)to, (const void *) from, n);
}
EXPORT_SYMBOL(memcpy_toio);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2ff25ad33233..9d5c75f02295 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -595,7 +595,7 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
return;
}
- addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+ addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
addr |= ((u64)desc.base3 << 32);
#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4f8972311a77..14e6119838a6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -230,6 +230,29 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
#endif
+/*
+ * See set_mce_nospec().
+ *
+ * Machine check recovery code needs to change cache mode of poisoned pages to
+ * UC to avoid speculative access logging another error. But passing the
+ * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
+ * speculative access. So we cheat and flip the top bit of the address. This
+ * works fine for the code that updates the page tables. But at the end of the
+ * process we need to flush the TLB and cache and the non-canonical address
+ * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long fix_addr(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+ return (long)(addr << 1) >> 1;
+#else
+ return addr;
+#endif
+}
+
static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
{
if (cpa->flags & CPA_PAGES_ARRAY) {
@@ -313,7 +336,7 @@ void __cpa_flush_tlb(void *data)
unsigned int i;
for (i = 0; i < cpa->numpages; i++)
- __flush_tlb_one_kernel(__cpa_addr(cpa, i));
+ __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
static void cpa_flush(struct cpa_data *data, int cache)
@@ -347,7 +370,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
* Only flush present addresses:
*/
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range_opt((void *)addr, PAGE_SIZE);
+ clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
}
mb();
}
@@ -1627,29 +1650,6 @@ out:
return ret;
}
-/*
- * Machine check recovery code needs to change cache mode of poisoned
- * pages to UC to avoid speculative access logging another error. But
- * passing the address of the 1:1 mapping to set_memory_uc() is a fine
- * way to encourage a speculative access. So we cheat and flip the top
- * bit of the address. This works fine for the code that updates the
- * page tables. But at the end of the process we need to flush the cache
- * and the non-canonical address causes a #GP fault when used by the
- * CLFLUSH instruction.
- *
- * But in the common case we already have a canonical address. This code
- * will fix the top bit if needed and is a no-op otherwise.
- */
-static inline unsigned long make_addr_canonical_again(unsigned long addr)
-{
-#ifdef CONFIG_X86_64
- return (long)(addr << 1) >> 1;
-#else
- return addr;
-#endif
-}
-
-
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 4a6a5a26c582..eb33432f2f24 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -29,7 +29,8 @@
struct uv_systab *uv_systab;
-s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
+static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
{
struct uv_systab *tab = uv_systab;
s64 ret;
@@ -51,6 +52,19 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
return ret;
}
+
+s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
+{
+ s64 ret;
+
+ if (down_interruptible(&__efi_uv_runtime_lock))
+ return BIOS_STATUS_ABORT;
+
+ ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
+ up(&__efi_uv_runtime_lock);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(uv_bios_call);
s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
@@ -59,10 +73,15 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
unsigned long bios_flags;
s64 ret;
+ if (down_interruptible(&__efi_uv_runtime_lock))
+ return BIOS_STATUS_ABORT;
+
local_irq_save(bios_flags);
- ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
local_irq_restore(bios_flags);
+ up(&__efi_uv_runtime_lock);
+
return ret;
}