summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/entry/entry_64.S3
-rw-r--r--arch/x86/events/intel/core.c9
-rw-r--r--arch/x86/events/intel/cstate.c23
-rw-r--r--arch/x86/events/intel/uncore.c1
-rw-r--r--arch/x86/events/intel/uncore_snb.c20
-rw-r--r--arch/x86/events/msr.c1
-rw-r--r--arch/x86/include/asm/asm.h20
-rw-r--r--arch/x86/include/asm/bug.h4
-rw-r--r--arch/x86/include/asm/compat.h6
-rw-r--r--arch/x86/include/asm/intel-family.h3
-rw-r--r--arch/x86/include/asm/io.h2
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h11
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/msi.h19
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/percpu.h6
-rw-r--r--arch/x86/include/asm/perf_event.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/static_call.h3
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c7
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/tsx.c104
-rw-r--r--arch/x86/kernel/crash_dump_64.c1
-rw-r--r--arch/x86/kernel/fpu/core.c67
-rw-r--r--arch/x86/kernel/kvm.c13
-rw-r--r--arch/x86/kernel/static_call.c5
-rw-r--r--arch/x86/kernel/unwind_orc.c8
-rw-r--r--arch/x86/kvm/cpuid.c24
-rw-r--r--arch/x86/kvm/hyperv.c40
-rw-r--r--arch/x86/kvm/hyperv.h2
-rw-r--r--arch/x86/kvm/mmu.h24
-rw-r--r--arch/x86/kvm/mmu/mmu.c111
-rw-r--r--arch/x86/kvm/mmu/spte.c28
-rw-r--r--arch/x86/kvm/mmu/spte.h10
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h34
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c99
-rw-r--r--arch/x86/kvm/pmu.h9
-rw-r--r--arch/x86/kvm/svm/avic.c3
-rw-r--r--arch/x86/kvm/svm/pmu.c29
-rw-r--r--arch/x86/kvm/svm/sev.c112
-rw-r--r--arch/x86/kvm/svm/svm.c1
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/vmx/nested.c5
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/vmx.c7
-rw-r--r--arch/x86/kvm/vmx/vmx.h1
-rw-r--r--arch/x86/kvm/x86.c95
-rw-r--r--arch/x86/lib/copy_user_64.S87
-rw-r--r--arch/x86/lib/putuser.S4
-rw-r--r--arch/x86/lib/retpoline.S2
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/mm/init_64.c5
-rw-r--r--arch/x86/mm/pat/set_memory.c11
-rw-r--r--arch/x86/mm/tlb.c37
-rw-r--r--arch/x86/net/bpf_jit_comp.c1
-rw-r--r--arch/x86/pci/xen.c6
-rw-r--r--arch/x86/platform/pvh/head.S1
-rw-r--r--arch/x86/power/cpu.c31
-rw-r--r--arch/x86/xen/xen-head.S1
63 files changed, 784 insertions, 415 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b0142e01002e..4bed3abf444d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1866,7 +1866,7 @@ config X86_KERNEL_IBT
code with them to make this happen.
In addition to building the kernel with IBT, seal all functions that
- are not indirect call targets, avoiding them ever becomming one.
+ are not indirect call targets, avoiding them ever becoming one.
This requires LTO like objtool runs and will slow down the build. It
does significantly reduce the number of ENDBR instructions in the
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4faac48ebec5..73d958522b6a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -337,6 +337,9 @@ SYM_CODE_END(ret_from_fork)
call \cfunc
+ /* For some configurations \cfunc ends up being a noreturn. */
+ REACHABLE
+
jmp error_return
.endm
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e88791b420ee..fc7f458eb3de 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -302,7 +302,7 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
- INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE),
INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
EVENT_EXTRA_END
@@ -5536,7 +5536,11 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con
/* Disabled fixed counters which are not in CPUID */
c->idxmsk64 &= intel_ctrl;
- if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+ /*
+ * Don't extend the pseudo-encoding to the
+ * generic counters
+ */
+ if (!use_fixed_pseudo_encoding(c->code))
c->idxmsk64 |= (1ULL << num_counters) - 1;
}
c->idxmsk64 &=
@@ -6212,6 +6216,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
+ case INTEL_FAM6_RAPTORLAKE:
/*
* Alder Lake has 2 types of CPU, core and atom.
*
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index c6262b154c3a..48e5db21142c 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
* Model specific counters:
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
- * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL
+ * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -51,49 +51,50 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL
+ * TGL,TNT,RKL,ADL,RPL,SPR
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL,ADL
+ * ICL,TGL,RKL,ADL,RPL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL
+ * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
+ * RPL,SPR
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
* GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
- * ADL
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL
+ * TGL,TNT,RKL,ADL,RPL,SPR
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL,ADL
+ * KBL,CML,ICL,TGL,RKL,ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
* Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
* Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL,ADL
+ * TNT,RKL,ADL,RPL
* Scope: Package (physical package)
*
*/
@@ -674,12 +675,14 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e497da9bf427..7695dcae280e 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1828,6 +1828,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
{},
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index f698a55bde81..4262351f52b6 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -79,6 +79,10 @@
#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650
#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668
#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670
+#define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700
+#define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702
+#define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706
+#define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -1406,6 +1410,22 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_4_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 96c775abe31f..6d759f88315c 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -103,6 +103,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ROCKETLAKE:
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
+ case INTEL_FAM6_RAPTORLAKE:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index c878fed3056f..fbcfec4dc4cc 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -154,24 +154,24 @@
# define DEFINE_EXTABLE_TYPE_REG \
".macro extable_type_reg type:req reg:req\n" \
- ".set found, 0\n" \
- ".set regnr, 0\n" \
+ ".set .Lfound, 0\n" \
+ ".set .Lregnr, 0\n" \
".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \
".ifc \\reg, %%\\rs\n" \
- ".set found, found+1\n" \
- ".long \\type + (regnr << 8)\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
".endif\n" \
- ".set regnr, regnr+1\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
".endr\n" \
- ".set regnr, 0\n" \
+ ".set .Lregnr, 0\n" \
".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \
".ifc \\reg, %%\\rs\n" \
- ".set found, found+1\n" \
- ".long \\type + (regnr << 8)\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
".endif\n" \
- ".set regnr, regnr+1\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
".endr\n" \
- ".if (found != 1)\n" \
+ ".if (.Lfound != 1)\n" \
".error \"extable_type_reg: bad register argument\"\n" \
".endif\n" \
".endm\n"
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 4d20a293c6fd..aaf0cb0db4ae 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -78,9 +78,9 @@ do { \
*/
#define __WARN_FLAGS(flags) \
do { \
- __auto_type f = BUGFLAG_WARNING|(flags); \
+ __auto_type __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
- _BUG_FLAGS(ASM_UD2, f, ASM_REACHABLE); \
+ _BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \
instrumentation_end(); \
} while (0)
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 7516e4199b3c..20fd0acd7d80 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -28,15 +28,13 @@ typedef u16 compat_ipc_pid_t;
typedef __kernel_fsid_t compat_fsid_t;
struct compat_stat {
- compat_dev_t st_dev;
- u16 __pad1;
+ u32 st_dev;
compat_ino_t st_ino;
compat_mode_t st_mode;
compat_nlink_t st_nlink;
__compat_uid_t st_uid;
__compat_gid_t st_gid;
- compat_dev_t st_rdev;
- u16 __pad2;
+ u32 st_rdev;
u32 st_size;
u32 st_blksize;
u32 st_blocks;
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 048b6d5aff50..def6ca121111 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -26,6 +26,7 @@
* _G - parts with extra graphics on
* _X - regular server parts
* _D - micro server parts
+ * _N,_P - other mobile parts
*
* Historical OPTDIFFs:
*
@@ -107,8 +108,10 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
+#define INTEL_FAM6_ALDERLAKE_N 0xBE
#define INTEL_FAM6_RAPTORLAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE_P 0xBA
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6d91ecb8026..e9736af126b2 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -210,8 +210,6 @@ void __iomem *ioremap(resource_size_t offset, unsigned long size);
extern void iounmap(volatile void __iomem *addr);
#define iounmap iounmap
-extern void set_iounmap_nonlazy(void);
-
#ifdef __KERNEL__
void memcpy_fromio(void *, const volatile void __iomem *, size_t);
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3c368b639c04..1a6d7e3f6c32 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -118,6 +118,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
+KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
KVM_X86_OP(get_msr_feature)
KVM_X86_OP(can_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d23e80a56eb8..4ff36610af6a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -974,12 +974,10 @@ enum hv_tsc_page_status {
HV_TSC_PAGE_UNSET = 0,
/* TSC page MSR was written by the guest, update pending */
HV_TSC_PAGE_GUEST_CHANGED,
- /* TSC page MSR was written by KVM userspace, update pending */
+ /* TSC page update was triggered from the host side */
HV_TSC_PAGE_HOST_CHANGED,
/* TSC page was properly set up and is currently active */
HV_TSC_PAGE_SET,
- /* TSC page is currently being updated and therefore is inactive */
- HV_TSC_PAGE_UPDATING,
/* TSC page was set up with an inaccessible GPA */
HV_TSC_PAGE_BROKEN,
};
@@ -1052,6 +1050,7 @@ enum kvm_apicv_inhibit {
APICV_INHIBIT_REASON_X2APIC,
APICV_INHIBIT_REASON_BLOCKIRQ,
APICV_INHIBIT_REASON_ABSENT,
+ APICV_INHIBIT_REASON_SEV,
};
struct kvm_arch {
@@ -1485,6 +1484,7 @@ struct kvm_x86_ops {
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+ void (*guest_memory_reclaimed)(struct kvm *kvm);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
@@ -1585,8 +1585,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
#define kvm_arch_pmi_in_guest(vcpu) \
((vcpu) && (vcpu)->arch.handling_intr_from_guest)
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
+void kvm_mmu_x86_module_init(void);
+int kvm_mmu_vendor_module_init(void);
+void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index d6bfdfb0f0af..0c3d3440fe27 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -131,10 +131,12 @@ extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool initrd_gone;
+void microcode_bsp_resume(void);
#else
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
+static inline void microcode_bsp_resume(void) { }
#endif
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index b85147d75626..d71c7e8b738d 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -12,14 +12,17 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
/* Structs and defines for the X86 specific MSI message format */
typedef struct x86_msi_data {
- u32 vector : 8,
- delivery_mode : 3,
- dest_mode_logical : 1,
- reserved : 2,
- active_low : 1,
- is_level : 1;
-
- u32 dmar_subhandle;
+ union {
+ struct {
+ u32 vector : 8,
+ delivery_mode : 3,
+ dest_mode_logical : 1,
+ reserved : 2,
+ active_low : 1,
+ is_level : 1;
+ };
+ u32 dmar_subhandle;
+ };
} __attribute__ ((packed)) arch_msi_msg_data_t;
#define arch_msi_msg_data x86_msi_data
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 0eb90d21049e..ee15311b6be1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -128,9 +128,9 @@
#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
-/* SRBDS support */
#define MSR_IA32_MCU_OPT_CTRL 0x00000123
-#define RNGDS_MITG_DIS BIT(0)
+#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
+#define RTM_ALLOW BIT(1) /* TSX development mode */
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a3c33b79fb86..13c0d63ed55e 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -38,9 +38,9 @@
#define arch_raw_cpu_ptr(ptr) \
({ \
unsigned long tcp_ptr__; \
- asm volatile("add " __percpu_arg(1) ", %0" \
- : "=r" (tcp_ptr__) \
- : "m" (this_cpu_off), "0" (ptr)); \
+ asm ("add " __percpu_arg(1) ", %0" \
+ : "=r" (tcp_ptr__) \
+ : "m" (this_cpu_off), "0" (ptr)); \
(typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
})
#else
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 58d9e4b1fa0a..b06e4c573add 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -241,6 +241,11 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3)
#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS)
+static inline bool use_fixed_pseudo_encoding(u64 code)
+{
+ return !(code & 0xff);
+}
+
/*
* We model BTS tracing as another fixed-mode PMC.
*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 40497a9020c6..407084d9fd99 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -559,10 +559,6 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
-
-struct mm_struct;
-extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index ed4f8bb6c2d9..2d8dacd02643 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -26,6 +26,7 @@
".align 4 \n" \
".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
STATIC_CALL_TRAMP_STR(name) ": \n" \
+ ANNOTATE_NOENDBR \
insns " \n" \
".byte 0x53, 0x43, 0x54 \n" \
".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \
@@ -38,6 +39,8 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) \
+ ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0)
#define ARCH_ADD_TRAMP_KEY(name) \
asm(".pushsection .static_call_tramp_key, \"a\" \n" \
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ed4417500700..e342ae4db3c4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1855,6 +1855,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
+
+ tsx_ap_init();
}
static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index ee6f23f7587d..2a8e584fc991 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -55,11 +55,10 @@ enum tsx_ctrl_states {
extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
-extern void tsx_enable(void);
-extern void tsx_disable(void);
-extern void tsx_clear_cpuid(void);
+void tsx_ap_init(void);
#else
static inline void tsx_init(void) { }
+static inline void tsx_ap_init(void) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void get_cpu_cap(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8321c43554a1..f7a5370a9b3b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -717,13 +717,6 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_misc_features(c);
- if (tsx_ctrl_state == TSX_CTRL_ENABLE)
- tsx_enable();
- else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
- tsx_disable();
- else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
- tsx_clear_cpuid();
-
split_lock_init();
bus_lock_init();
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index f955d25076ba..239ff5fcec6a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -758,9 +758,9 @@ static struct subsys_interface mc_cpu_interface = {
};
/**
- * mc_bp_resume - Update boot CPU microcode during resume.
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
*/
-static void mc_bp_resume(void)
+void microcode_bsp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -772,7 +772,7 @@ static void mc_bp_resume(void)
}
static struct syscore_ops mc_syscore_ops = {
- .resume = mc_bp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 9c7a5f049292..ec7bbac3a9f2 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -19,7 +19,7 @@
enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
-void tsx_disable(void)
+static void tsx_disable(void)
{
u64 tsx;
@@ -39,7 +39,7 @@ void tsx_disable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-void tsx_enable(void)
+static void tsx_enable(void)
{
u64 tsx;
@@ -58,7 +58,7 @@ void tsx_enable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-static bool __init tsx_ctrl_is_supported(void)
+static bool tsx_ctrl_is_supported(void)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -84,7 +84,45 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
return TSX_CTRL_ENABLE;
}
-void tsx_clear_cpuid(void)
+/*
+ * Disabling TSX is not a trivial business.
+ *
+ * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT
+ * which says that TSX is practically disabled (all transactions are
+ * aborted by default). When that bit is set, the kernel unconditionally
+ * disables TSX.
+ *
+ * In order to do that, however, it needs to dance a bit:
+ *
+ * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and
+ * the MSR is present only when *two* CPUID bits are set:
+ *
+ * - X86_FEATURE_RTM_ALWAYS_ABORT
+ * - X86_FEATURE_TSX_FORCE_ABORT
+ *
+ * 2. The second method is for CPUs which do not have the above-mentioned
+ * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX
+ * through that one. Those CPUs can also have the initially mentioned
+ * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy
+ * applies: TSX gets disabled unconditionally.
+ *
+ * When either of the two methods are present, the kernel disables TSX and
+ * clears the respective RTM and HLE feature flags.
+ *
+ * An additional twist in the whole thing presents late microcode loading
+ * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID
+ * bit to be set after the update.
+ *
+ * A subsequent hotplug operation on any logical CPU except the BSP will
+ * cause for the supported CPUID feature bits to get re-detected and, if
+ * RTM and HLE get cleared all of a sudden, but, userspace did consult
+ * them before the update, then funny explosions will happen. Long story
+ * short: the kernel doesn't modify CPUID feature bits after booting.
+ *
+ * That's why, this function's call in init_intel() doesn't clear the
+ * feature flags.
+ */
+static void tsx_clear_cpuid(void)
{
u64 msr;
@@ -97,6 +135,39 @@ void tsx_clear_cpuid(void)
rdmsrl(MSR_TSX_FORCE_ABORT, msr);
msr |= MSR_TFA_TSX_CPUID_CLEAR;
wrmsrl(MSR_TSX_FORCE_ABORT, msr);
+ } else if (tsx_ctrl_is_supported()) {
+ rdmsrl(MSR_IA32_TSX_CTRL, msr);
+ msr |= TSX_CTRL_CPUID_CLEAR;
+ wrmsrl(MSR_IA32_TSX_CTRL, msr);
+ }
+}
+
+/*
+ * Disable TSX development mode
+ *
+ * When the microcode released in Feb 2022 is applied, TSX will be disabled by
+ * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is
+ * not recommended for production deployments. In particular, applying MD_CLEAR
+ * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient
+ * execution attack may not be effective on these processors when Intel TSX is
+ * enabled with updated microcode.
+ */
+static void tsx_dev_mode_disable(void)
+{
+ u64 mcu_opt_ctrl;
+
+ /* Check if RTM_ALLOW exists */
+ if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() ||
+ !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+
+ if (mcu_opt_ctrl & RTM_ALLOW) {
+ mcu_opt_ctrl &= ~RTM_ALLOW;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
}
}
@@ -105,14 +176,14 @@ void __init tsx_init(void)
char arg[5] = {};
int ret;
+ tsx_dev_mode_disable();
+
/*
- * Hardware will always abort a TSX transaction if both CPUID bits
- * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is
- * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them
- * here.
+ * Hardware will always abort a TSX transaction when the CPUID bit
+ * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate
+ * CPUID.RTM and CPUID.HLE bits. Clear them here.
*/
- if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
- boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
tsx_clear_cpuid();
setup_clear_cpu_cap(X86_FEATURE_RTM);
@@ -175,3 +246,16 @@ void __init tsx_init(void)
setup_force_cpu_cap(X86_FEATURE_HLE);
}
}
+
+void tsx_ap_init(void)
+{
+ tsx_dev_mode_disable();
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ /* See comment over that function for more details. */
+ tsx_clear_cpuid();
+}
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index a7f617a3981d..97529552dd24 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -37,7 +37,6 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
} else
memcpy(buf, vaddr + offset, csize);
- set_iounmap_nonlazy();
iounmap((void __iomem *)vaddr);
return csize;
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index c049561f373a..e28ab0ecc537 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -41,17 +41,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init;
*/
struct fpstate init_fpstate __ro_after_init;
-/*
- * Track whether the kernel is using the FPU state
- * currently.
- *
- * This flag is used:
- *
- * - by IRQ context code to potentially use the FPU
- * if it's unused.
- *
- * - to debug kernel_fpu_begin()/end() correctness
- */
+/* Track in-kernel FPU usage */
static DEFINE_PER_CPU(bool, in_kernel_fpu);
/*
@@ -59,42 +49,37 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu);
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
-static bool kernel_fpu_disabled(void)
-{
- return this_cpu_read(in_kernel_fpu);
-}
-
-static bool interrupted_kernel_fpu_idle(void)
-{
- return !kernel_fpu_disabled();
-}
-
-/*
- * Were we in user mode (or vm86 mode) when we were
- * interrupted?
- *
- * Doing kernel_fpu_begin/end() is ok if we are running
- * in an interrupt context from user mode - we'll just
- * save the FPU state as required.
- */
-static bool interrupted_user_mode(void)
-{
- struct pt_regs *regs = get_irq_regs();
- return regs && user_mode(regs);
-}
-
/*
* Can we use the FPU in kernel mode with the
* whole "kernel_fpu_begin/end()" sequence?
- *
- * It's always ok in process context (ie "not interrupt")
- * but it is sometimes ok even from an irq.
*/
bool irq_fpu_usable(void)
{
- return !in_interrupt() ||
- interrupted_user_mode() ||
- interrupted_kernel_fpu_idle();
+ if (WARN_ON_ONCE(in_nmi()))
+ return false;
+
+ /* In kernel FPU usage already active? */
+ if (this_cpu_read(in_kernel_fpu))
+ return false;
+
+ /*
+ * When not in NMI or hard interrupt context, FPU can be used in:
+ *
+ * - Task context except from within fpregs_lock()'ed critical
+ * regions.
+ *
+ * - Soft interrupt processing context which cannot happen
+ * while in a fpregs_lock()'ed critical region.
+ */
+ if (!in_hardirq())
+ return true;
+
+ /*
+ * In hard interrupt context it's safe when soft interrupts
+ * are enabled, which means the interrupt did not hit in
+ * a fpregs_lock()'ed critical region.
+ */
+ return !softirq_count();
}
EXPORT_SYMBOL(irq_fpu_usable);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a22deb58f86d..8b1c45c9cda8 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
+static int has_guest_poll = 0;
/*
* No need for any "IO delay" on KVM
*/
@@ -706,14 +707,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
static int kvm_suspend(void)
{
+ u64 val = 0;
+
kvm_guest_cpu_offline(false);
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ rdmsrl(MSR_KVM_POLL_CONTROL, val);
+ has_guest_poll = !(val & 1);
+#endif
return 0;
}
static void kvm_resume(void)
{
kvm_cpu_online(raw_smp_processor_id());
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+ wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+#endif
}
static struct syscore_ops kvm_syscore_ops = {
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 531fb4cbb63f..aa72cefdd5be 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -12,10 +12,9 @@ enum insn_type {
};
/*
- * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
- * The REX.W cancels the effect of any data16.
+ * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax
*/
-static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 794fdef2501a..38185aedf7d1 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -339,11 +339,11 @@ static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
struct stack_info *info = &state->stack_info;
void *addr = (void *)_addr;
- if (!on_stack(info, addr, len) &&
- (get_stack_info(addr, state->task, info, &state->stack_mask)))
- return false;
+ if (on_stack(info, addr, len))
+ return true;
- return true;
+ return !get_stack_info(addr, state->task, info, &state->stack_mask) &&
+ on_stack(info, addr, len);
}
static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b24ca7f4ed7c..0c1ba6aa0765 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -887,6 +887,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
union cpuid10_eax eax;
union cpuid10_edx edx;
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
perf_get_x86_pmu_capability(&cap);
/*
@@ -1085,12 +1090,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 0x80000000:
entry->eax = min(entry->eax, 0x80000021);
/*
- * Serializing LFENCE is reported in a multitude of ways,
- * and NullSegClearsBase is not reported in CPUID on Zen2;
- * help userspace by providing the CPUID leaf ourselves.
+ * Serializing LFENCE is reported in a multitude of ways, and
+ * NullSegClearsBase is not reported in CPUID on Zen2; help
+ * userspace by providing the CPUID leaf ourselves.
+ *
+ * However, only do it if the host has CPUID leaf 0x8000001d.
+ * QEMU thinks that it can query the host blindly for that
+ * CPUID leaf if KVM reports that it supports 0x8000001d or
+ * above. The processor merrily returns values from the
+ * highest Intel leaf which QEMU tries to use as the guest's
+ * 0x8000001d. Even worse, this can result in an infinite
+ * loop if said highest leaf has no subleaves indexed by ECX.
*/
- if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
- || !static_cpu_has_bug(X86_BUG_NULL_SEG))
+ if (entry->eax >= 0x8000001d &&
+ (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
entry->eax = max(entry->eax, 0x80000021);
break;
case 0x80000001:
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 123b677111c5..46f9dfb60469 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1135,11 +1135,13 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
+ mutex_lock(&hv->hv_lock);
+
if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_SET ||
hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
- return;
+ goto out_unlock;
- mutex_lock(&hv->hv_lock);
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
goto out_unlock;
@@ -1201,45 +1203,19 @@ out_unlock:
mutex_unlock(&hv->hv_lock);
}
-void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
+void kvm_hv_request_tsc_page_update(struct kvm *kvm)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
- u64 gfn;
- int idx;
-
- if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
- hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
- tsc_page_update_unsafe(hv))
- return;
mutex_lock(&hv->hv_lock);
- if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
- goto out_unlock;
-
- /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */
- if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET)
- hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING;
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET &&
+ !tsc_page_update_unsafe(hv))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
- gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-
- hv->tsc_ref.tsc_sequence = 0;
-
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&kvm->srcu);
- if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
- &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
- hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
- srcu_read_unlock(&kvm->srcu, idx);
-
-out_unlock:
mutex_unlock(&hv->hv_lock);
}
-
static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
{
if (!hv_vcpu->enforce_cpuid)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e19c00ee9ab3..da2737f2a956 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -137,7 +137,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
-void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
+void kvm_hv_request_tsc_page_update(struct kvm *kvm);
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e6cae6f22683..a335e7f1f69e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+extern u8 __read_mostly shadow_phys_bits;
+
+static inline gfn_t kvm_mmu_max_gfn(void)
+{
+ /*
+ * Note that this uses the host MAXPHYADDR, not the guest's.
+ * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
+ * assuming KVM is running on bare metal, guest accesses beyond
+ * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
+ * (either EPT Violation/Misconfig or #NPF), and so KVM will never
+ * install a SPTE for such addresses. If KVM is running as a VM
+ * itself, on the other hand, it might see a MAXPHYADDR that is less
+ * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
+ * disallows such SPTEs entirely and simplifies the TDP MMU.
+ */
+ int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+
+ return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
+}
+
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8f19ea752704..311e4e1d7870 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -473,30 +473,6 @@ retry:
}
#endif
-static bool spte_has_volatile_bits(u64 spte)
-{
- if (!is_shadow_present_pte(spte))
- return false;
-
- /*
- * Always atomically update spte if it can be updated
- * out of mmu-lock, it can ensure dirty bit is not lost,
- * also, it can help us to get a stable is_writable_pte()
- * to ensure tlb flush is not missed.
- */
- if (spte_can_locklessly_be_made_writable(spte) ||
- is_access_track_spte(spte))
- return true;
-
- if (spte_ad_enabled(spte)) {
- if ((spte & shadow_accessed_mask) == 0 ||
- (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
- return true;
- }
-
- return false;
-}
-
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
- if (spte_can_locklessly_be_made_writable(old_spte) &&
+ if (is_mmu_writable_spte(old_spte) &&
!is_writable_pte(new_spte))
flush = true;
@@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level;
- if (!spte_has_volatile_bits(old_spte))
+ if (!is_shadow_present_pte(old_spte) ||
+ !spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull);
else
old_spte = __update_clear_spte_slow(sptep, 0ull);
@@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
- !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+ !(pt_protect && is_mmu_writable_spte(spte)))
return false;
rmap_printk("spte %p %llx\n", sptep, *sptep);
@@ -2804,8 +2781,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
const struct kvm_memory_slot *slot)
{
unsigned long hva;
- pte_t *pte;
- int level;
+ unsigned long flags;
+ int level = PG_LEVEL_4K;
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
return PG_LEVEL_4K;
@@ -2820,10 +2801,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);
- pte = lookup_address_in_mm(kvm->mm, hva, &level);
- if (unlikely(!pte))
- return PG_LEVEL_4K;
+ /*
+ * Lookup the mapping level in the current mm. The information
+ * may become stale soon, but it is safe to use as long as
+ * 1) mmu_notifier_retry was checked after taking mmu_lock, and
+ * 2) mmu_lock is taken now.
+ *
+ * We still need to disable IRQs to prevent concurrent tear down
+ * of page tables.
+ */
+ local_irq_save(flags);
+
+ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
+ if (pgd_none(pgd))
+ goto out;
+ p4d = READ_ONCE(*p4d_offset(&pgd, hva));
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ goto out;
+
+ pud = READ_ONCE(*pud_offset(&p4d, hva));
+ if (pud_none(pud) || !pud_present(pud))
+ goto out;
+
+ if (pud_large(pud)) {
+ level = PG_LEVEL_1G;
+ goto out;
+ }
+
+ pmd = READ_ONCE(*pmd_offset(&pud, hva));
+ if (pmd_none(pmd) || !pmd_present(pmd))
+ goto out;
+
+ if (pmd_large(pmd))
+ level = PG_LEVEL_2M;
+
+out:
+ local_irq_restore(flags);
return level;
}
@@ -2992,9 +3006,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
- * MMIO SPTE will just be an expensive nop.
+ * MMIO SPTE will just be an expensive nop. Do not cache MMIO
+ * whose gfn is greater than host.MAXPHYADDR, any guest that
+ * generates such gfns is running nested and is being tricked
+ * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
+ * and only if L1's MAXPHYADDR is inaccurate with respect to
+ * the hardware's).
*/
- if (unlikely(!shadow_mmio_value)) {
+ if (unlikely(!shadow_mmio_value) ||
+ unlikely(fault->gfn > kvm_mmu_max_gfn())) {
*ret_val = RET_PF_EMULATE;
return true;
}
@@ -3153,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* be removed in the fast path only if the SPTE was
* write-protected for dirty-logging or access tracking.
*/
- if (fault->write &&
- spte_can_locklessly_be_made_writable(spte)) {
+ if (fault->write && is_mmu_writable_spte(spte)) {
new_spte |= PT_WRITABLE_MASK;
/*
@@ -6237,12 +6256,24 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
return 0;
}
-int kvm_mmu_module_init(void)
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ */
+void kvm_mmu_x86_module_init(void)
{
- int ret = -ENOMEM;
-
if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+ int ret = -ENOMEM;
/*
* MMU roles use union aliasing which is, generally speaking, an
@@ -6290,7 +6321,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
mmu_free_memory_caches(vcpu);
}
-void kvm_mmu_module_exit(void)
+void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4739b53c9734..e5c0b6db6f2c 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
+{
+ /*
+ * Always atomically update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+ return true;
+
+ if (is_access_track_spte(spte))
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ if (!(spte & shadow_accessed_mask) ||
+ (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+ return true;
+ }
+
+ return false;
+}
+
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 73f12615416f..80ab0f5cff01 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-extern u8 __read_mostly shadow_phys_bits;
-
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
@@ -396,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte)
"kvm: Writable SPTE is not MMU-writable: %llx", spte);
}
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+static inline bool is_mmu_writable_spte(u64 spte)
{
return spte & shadow_mmu_writable_mask;
}
@@ -410,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
+bool spte_has_volatile_bits(u64 spte);
+
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1eaf6ec0e0b..f0af385c56e0 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -6,6 +6,7 @@
#include <linux/kvm_host.h>
#include "mmu.h"
+#include "spte.h"
/*
* TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
return READ_ONCE(*rcu_dereference(sptep));
}
-static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+ return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+ WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
{
- WRITE_ONCE(*rcu_dereference(sptep), val);
+ /*
+ * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+ * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+ * The Writable bit can be set by KVM's fast page fault handler, and
+ * Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+ if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte))
+ return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return old_spte;
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d71d177ae6b8..922b06bf4b94 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -51,7 +51,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
- flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ /* Also waits for any queued work items. */
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- u64 *sptep = rcu_dereference(pt) + i;
+ tdp_ptep_t sptep = pt + i;
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
- u64 old_child_spte;
+ u64 old_spte;
if (shared) {
/*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* value to the removed SPTE value.
*/
for (;;) {
- old_child_spte = xchg(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_child_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_spte))
break;
cpu_relax();
}
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* are guarded by the memslots generation, not by being
* unreachable.
*/
- old_child_spte = READ_ONCE(*sptep);
- if (!is_shadow_present_pte(old_child_spte))
+ old_spte = kvm_tdp_mmu_read_spte(sptep);
+ if (!is_shadow_present_pte(old_spte))
continue;
/*
- * Marking the SPTE as a removed SPTE is not
- * strictly necessary here as the MMU lock will
- * stop other threads from concurrently modifying
- * this SPTE. Using the removed SPTE value keeps
- * the two branches consistent and simplifies
- * the function.
+ * Use the common helper instead of a raw WRITE_ONCE as
+ * the SPTE needs to be updated atomically if it can be
+ * modified by a different vCPU outside of mmu_lock.
+ * Even though the parent SPTE is !PRESENT, the TLB
+ * hasn't yet been flushed, and both Intel and AMD
+ * document that A/D assists can use upper-level PxE
+ * entries that are cached in the TLB, i.e. the CPU can
+ * still access the page and mark it dirty.
+ *
+ * No retry is needed in the atomic update path as the
+ * sole concern is dropping a Dirty bit, i.e. no other
+ * task can zap/remove the SPTE as mmu_lock is held for
+ * write. Marking the SPTE as a removed SPTE is not
+ * strictly necessary for the same reason, but using
+ * the remove SPTE value keeps the shared/exclusive
+ * paths consistent and allows the handle_changed_spte()
+ * call below to hardcode the new value to REMOVED_SPTE.
+ *
+ * Note, even though dropping a Dirty bit is the only
+ * scenario where a non-atomic update could result in a
+ * functional bug, simply checking the Dirty bit isn't
+ * sufficient as a fast page fault could read the upper
+ * level SPTE before it is zapped, and then make this
+ * target SPTE writable, resume the guest, and set the
+ * Dirty bit between reading the SPTE above and writing
+ * it here.
*/
- WRITE_ONCE(*sptep, REMOVED_SPTE);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+ REMOVED_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_child_spte, REMOVED_SPTE, level,
- shared);
+ old_spte, REMOVED_SPTE, level, shared);
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
KVM_PAGES_PER_HPAGE(iter->level));
/*
- * No other thread can overwrite the removed SPTE as they
- * must either wait on the MMU lock or use
- * tdp_mmu_set_spte_atomic which will not overwrite the
- * special removed SPTE value. No bookkeeping is needed
- * here since the SPTE is going from non-present
- * to non-present.
+ * No other thread can overwrite the removed SPTE as they must either
+ * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+ * overwrite the special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present to non-present. Use
+ * the raw write helper to avoid an unnecessary check on volatile bits.
*/
- kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, 0);
return 0;
}
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* unless performing certain dirty logging operations.
* Leaving record_dirty_log unset in that case prevents page
* writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had voldatile bits.
*/
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level,
- bool record_acc_track, bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+ bool record_acc_track, bool record_dirty_log)
{
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
*/
WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
- kvm_tdp_mmu_write_spte(sptep, new_spte);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
if (record_dirty_log)
handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
new_spte, level);
+ return old_spte;
}
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
{
WARN_ON_ONCE(iter->yielded);
- __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
- new_spte, iter->gfn, iter->level,
- record_acc_track, record_dirty_log);
+ iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level,
+ record_acc_track, record_dirty_log);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -815,14 +839,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return iter->yielded;
}
-static inline gfn_t tdp_mmu_max_gfn_host(void)
+static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
/*
- * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
- * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
- * and so KVM will never install a SPTE for such addresses.
+ * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
+ * a gpa range that would exceed the max gfn, and KVM does not create
+ * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
+ * the slow emulation path every time.
*/
- return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ return kvm_mmu_max_gfn() + 1;
}
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -830,7 +855,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- gfn_t end = tdp_mmu_max_gfn_host();
+ gfn_t end = tdp_mmu_max_gfn_exclusive();
gfn_t start = 0;
for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
@@ -923,7 +948,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- end = min(end, tdp_mmu_max_gfn_host());
+ end = min(end, tdp_mmu_max_gfn_exclusive());
lockdep_assert_held_write(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 9e66fba1d6a3..22992b049d38 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -138,6 +138,15 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
return sample_period;
}
+static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event || pmc->is_paused)
+ return;
+
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter));
+}
+
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index a1cf9c31273b..421619540ff9 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -837,7 +837,8 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
BIT(APICV_INHIBIT_REASON_IRQWIN) |
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
BIT(APICV_INHIBIT_REASON_X2APIC) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_SEV);
return supported & BIT(reason);
}
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 24eb935b6f85..16a5ebb420cf 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
};
+/* duplicated from amd_f17h_perfmon_event_map. */
+static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+/* amd_pmc_perf_hw_id depends on these being the same size */
+static_assert(ARRAY_SIZE(amd_event_mapping) ==
+ ARRAY_SIZE(amd_f17h_event_mapping));
+
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_event_hw_type_mapping *event_mapping;
u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
@@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
if (WARN_ON(pmc_is_fixed(pmc)))
return PERF_COUNT_HW_MAX;
+ if (guest_cpuid_family(pmc->vcpu) >= 0x17)
+ event_mapping = amd_f17h_event_mapping;
+ else
+ event_mapping = amd_event_mapping;
+
for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
- if (amd_event_mapping[i].eventsel == event_select
- && amd_event_mapping[i].unit_mask == unit_mask)
+ if (event_mapping[i].eventsel == event_select
+ && event_mapping[i].unit_mask == unit_mask)
break;
if (i == ARRAY_SIZE(amd_event_mapping))
return PERF_COUNT_HW_MAX;
- return amd_event_mapping[i].event_type;
+ return event_mapping[i].event_type;
}
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
@@ -257,6 +279,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc->counter += data - pmc_read_counter(pmc);
+ pmc_update_sample_period(pmc);
return 0;
}
/* MSR_EVNTSELn */
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 75fa6dd268f0..7c392873626f 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -260,6 +260,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
+
return 0;
e_free:
@@ -465,6 +467,7 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
page_virtual = kmap_atomic(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
kunmap_atomic(page_virtual);
+ cond_resched();
}
}
@@ -1591,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
atomic_set_release(&src_sev->migration_in_progress, 0);
}
+/* vCPU mutex subclasses. */
+enum sev_migration_role {
+ SEV_MIGRATION_SOURCE = 0,
+ SEV_MIGRATION_TARGET,
+ SEV_NR_MIGRATION_ROLES,
+};
-static int sev_lock_vcpus_for_migration(struct kvm *kvm)
+static int sev_lock_vcpus_for_migration(struct kvm *kvm,
+ enum sev_migration_role role)
{
struct kvm_vcpu *vcpu;
unsigned long i, j;
+ bool first = true;
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (mutex_lock_killable(&vcpu->mutex))
+ if (mutex_lock_killable_nested(&vcpu->mutex, role))
goto out_unlock;
+
+ if (first) {
+ /*
+ * Reset the role to one that avoids colliding with
+ * the role used for the first vcpu mutex.
+ */
+ role = SEV_NR_MIGRATION_ROLES;
+ first = false;
+ } else {
+ mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
+ }
}
return 0;
out_unlock:
+
+ first = true;
kvm_for_each_vcpu(j, vcpu, kvm) {
if (i == j)
break;
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
+
+
mutex_unlock(&vcpu->mutex);
}
return -EINTR;
@@ -1618,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
+ bool first = true;
kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map,
+ SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
+
mutex_unlock(&vcpu->mutex);
}
}
@@ -1745,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
charged = true;
}
- ret = sev_lock_vcpus_for_migration(kvm);
+ ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
if (ret)
goto out_dst_cgroup;
- ret = sev_lock_vcpus_for_migration(source_kvm);
+ ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
if (ret)
goto out_dst_vcpu;
@@ -2223,51 +2260,47 @@ int sev_cpu_init(struct svm_cpu_data *sd)
* Pages used by hardware to hold guest encrypted state must be flushed before
* returning them to the system.
*/
-static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
- unsigned long len)
+static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
+ int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
+
/*
- * If hardware enforced cache coherency for encrypted mappings of the
- * same physical page is supported, nothing to do.
+ * Note! The address must be a kernel address, as regular page walk
+ * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+ * address is non-deterministic and unsafe. This function deliberately
+ * takes a pointer to deter passing in a user address.
*/
- if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
- return;
+ unsigned long addr = (unsigned long)va;
/*
- * If the VM Page Flush MSR is supported, use it to flush the page
- * (using the page virtual address and the guest ASID).
+ * If CPU enforced cache coherency for encrypted mappings of the
+ * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
+ * flush is still needed in order to work properly with DMA devices.
*/
- if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
- struct kvm_sev_info *sev;
- unsigned long va_start;
- u64 start, stop;
-
- /* Align start and stop to page boundaries. */
- va_start = (unsigned long)va;
- start = (u64)va_start & PAGE_MASK;
- stop = PAGE_ALIGN((u64)va_start + len);
-
- if (start < stop) {
- sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+ clflush_cache_range(va, PAGE_SIZE);
+ return;
+ }
- while (start < stop) {
- wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
- start | sev->asid);
+ /*
+ * VM Page Flush takes a host virtual address and a guest ASID. Fall
+ * back to WBINVD if this faults so as not to make any problems worse
+ * by leaving stale encrypted data in the cache.
+ */
+ if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+ goto do_wbinvd;
- start += PAGE_SIZE;
- }
+ return;
- return;
- }
+do_wbinvd:
+ wbinvd_on_all_cpus();
+}
- WARN(1, "Address overflow, using WBINVD\n");
- }
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (!sev_guest(kvm))
+ return;
- /*
- * Hardware should always have one of the above features,
- * but if not, use WBINVD and issue a warning.
- */
- WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
wbinvd_on_all_cpus();
}
@@ -2281,7 +2314,8 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
if (vcpu->arch.guest_state_protected)
- sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE);
+ sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
+
__free_page(virt_to_page(svm->sev_es.vmsa));
if (svm->sev_es.ghcb_sa_free)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index bd4c64b362d2..7e45d03cd018 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4620,6 +4620,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
.mem_enc_unregister_region = sev_mem_enc_unregister_region,
+ .guest_memory_reclaimed = sev_guest_memory_reclaimed,
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f77a7d2d39dd..f76deff71002 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -609,6 +609,8 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
struct kvm_enc_region *range);
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f18744f7ff82..856c87563883 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4618,6 +4618,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
}
+ if (vmx->nested.update_vmcs01_apicv_status) {
+ vmx->nested.update_vmcs01_apicv_status = false;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
if ((vm_exit_reason != -1) &&
(enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bc3f8512bb64..b82b6709d7a8 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -431,15 +431,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event && !pmc->is_paused)
- perf_event_period(pmc->perf_event,
- get_sample_period(pmc, data));
+ pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event && !pmc->is_paused)
- perf_event_period(pmc->perf_event,
- get_sample_period(pmc, data));
+ pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 04d170c4b61e..610355b9ccce 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4174,6 +4174,11 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_apicv_status = true;
+ return;
+ }
+
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
if (cpu_has_secondary_exec_ctrls()) {
if (kvm_vcpu_apicv_active(vcpu))
@@ -5467,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
return vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending;
+ (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
}
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9c6bfcd84008..b98c7e96697a 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -183,6 +183,7 @@ struct nested_vmx {
bool change_vmcs01_virtual_apic_mode;
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
+ bool update_vmcs01_apicv_status;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c0ca599a353..4790f0d7d40b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2901,7 +2901,7 @@ static void kvm_end_pvclock_update(struct kvm *kvm)
static void kvm_update_masterclock(struct kvm *kvm)
{
- kvm_hv_invalidate_tsc_page(kvm);
+ kvm_hv_request_tsc_page_update(kvm);
kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
kvm_end_pvclock_update(kvm);
@@ -3113,8 +3113,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_set)
kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
- if (!v->vcpu_idx)
- kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -6241,7 +6240,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
return -EINVAL;
- kvm_hv_invalidate_tsc_page(kvm);
+ kvm_hv_request_tsc_page_update(kvm);
kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
@@ -8926,7 +8925,7 @@ int kvm_arch_init(void *opaque)
}
kvm_nr_uret_msrs = 0;
- r = kvm_mmu_module_init();
+ r = kvm_mmu_vendor_module_init();
if (r)
goto out_free_percpu;
@@ -8974,7 +8973,7 @@ void kvm_arch_exit(void)
cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops.hardware_enable = NULL;
- kvm_mmu_module_exit();
+ kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
#ifdef CONFIG_KVM_XEN
@@ -9112,7 +9111,7 @@ static void kvm_apicv_init(struct kvm *kvm)
if (!enable_apicv)
set_or_clear_apicv_inhibit(inhibits,
- APICV_INHIBIT_REASON_ABSENT, true);
+ APICV_INHIBIT_REASON_DISABLE, true);
}
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@@ -9890,6 +9889,11 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+ static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+}
+
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
@@ -10016,12 +10020,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
@@ -10098,7 +10104,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Store vcpu->apicv_active before vcpu->mode. */
smp_store_release(&vcpu->mode, IN_GUEST_MODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
/*
* 1) We should set ->mode before checking ->requests. Please see
@@ -10129,7 +10135,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_wmb();
local_irq_enable();
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
r = 1;
goto cancel_injection;
}
@@ -10255,7 +10261,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
local_irq_enable();
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
/*
* Profile KVM exit RIPs:
@@ -10285,7 +10291,7 @@ out:
}
/* Called within kvm->srcu read side. */
-static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+static inline int vcpu_block(struct kvm_vcpu *vcpu)
{
bool hv_timer;
@@ -10301,12 +10307,12 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
if (hv_timer)
kvm_lapic_switch_to_sw_timer(vcpu);
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
kvm_vcpu_halt(vcpu);
else
kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
@@ -10348,7 +10354,6 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
- struct kvm *kvm = vcpu->kvm;
vcpu->arch.l1tf_flush_l1d = true;
@@ -10356,7 +10361,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
- r = vcpu_block(kvm, vcpu);
+ r = vcpu_block(vcpu);
}
if (r <= 0)
@@ -10375,9 +10380,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
}
if (__xfer_to_guest_mode_work_pending()) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
r = xfer_to_guest_mode_handle_work(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (r)
return r;
}
@@ -10388,12 +10393,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
- int r;
-
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- return r;
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
}
static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@ -10485,7 +10485,6 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- struct kvm *kvm = vcpu->kvm;
int r;
vcpu_load(vcpu);
@@ -10493,7 +10492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
@@ -10505,9 +10504,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
@@ -10568,7 +10567,7 @@ out:
if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
kvm_sigset_deactivate(vcpu);
vcpu_put(vcpu);
@@ -10986,6 +10985,9 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
+ if (!enable_apicv)
+ return;
+
down_write(&kvm->arch.apicv_update_lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -11197,8 +11199,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
- if (kvm_apicv_activated(vcpu->kvm))
+
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as
+ * this vCPU will not get notified of any changes until this
+ * vCPU is visible to other vCPUs (marked online and added to
+ * the set of vCPUs). Opportunistically mark APICv active as
+ * VMX in particularly is highly unlikely to have inhibits.
+ * Ignore the current per-VM APICv state so that vCPU creation
+ * is guaranteed to run with a deterministic value, the request
+ * will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
vcpu->arch.apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
} else
static_branch_inc(&kvm_has_noapic_vcpu);
@@ -11996,8 +12011,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
+ if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
+ return -EINVAL;
+
return kvm_alloc_memslot_metadata(kvm, new);
+ }
if (change == KVM_MR_FLAGS_ONLY)
memcpy(&new->arch, &old->arch, sizeof(old->arch));
@@ -12986,3 +13005,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+
+static int __init kvm_x86_init(void)
+{
+ kvm_mmu_x86_module_init();
+ return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+ /*
+ * If module_init() is implemented, module_exit() must also be
+ * implemented to allow module unload.
+ */
+}
+module_exit(kvm_x86_exit);
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 8ca5ecf16dc4..9dec1b38a98f 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -53,12 +53,12 @@
SYM_FUNC_START(copy_user_generic_unrolled)
ASM_STAC
cmpl $8,%edx
- jb 20f /* less then 8 bytes, go to byte copy loop */
+ jb .Lcopy_user_short_string_bytes
ALIGN_DESTINATION
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
- jz .L_copy_short_string
+ jz copy_user_short_string
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
@@ -79,37 +79,11 @@ SYM_FUNC_START(copy_user_generic_unrolled)
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
-.L_copy_short_string:
- movl %edx,%ecx
- andl $7,%edx
- shrl $3,%ecx
- jz 20f
-18: movq (%rsi),%r8
-19: movq %r8,(%rdi)
- leaq 8(%rsi),%rsi
- leaq 8(%rdi),%rdi
- decl %ecx
- jnz 18b
-20: andl %edx,%edx
- jz 23f
- movl %edx,%ecx
-21: movb (%rsi),%al
-22: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 21b
-23: xor %eax,%eax
- ASM_CLAC
- RET
+ jmp copy_user_short_string
30: shll $6,%ecx
addl %ecx,%edx
- jmp 60f
-40: leal (%rdx,%rcx,8),%edx
- jmp 60f
-50: movl %ecx,%edx
-60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */
+ jmp .Lcopy_user_handle_tail
_ASM_EXTABLE_CPY(1b, 30b)
_ASM_EXTABLE_CPY(2b, 30b)
@@ -127,10 +101,6 @@ SYM_FUNC_START(copy_user_generic_unrolled)
_ASM_EXTABLE_CPY(14b, 30b)
_ASM_EXTABLE_CPY(15b, 30b)
_ASM_EXTABLE_CPY(16b, 30b)
- _ASM_EXTABLE_CPY(18b, 40b)
- _ASM_EXTABLE_CPY(19b, 40b)
- _ASM_EXTABLE_CPY(21b, 50b)
- _ASM_EXTABLE_CPY(22b, 50b)
SYM_FUNC_END(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)
@@ -191,7 +161,7 @@ EXPORT_SYMBOL(copy_user_generic_string)
SYM_FUNC_START(copy_user_enhanced_fast_string)
ASM_STAC
/* CPUs without FSRM should avoid rep movsb for short copies */
- ALTERNATIVE "cmpl $64, %edx; jb .L_copy_short_string", "", X86_FEATURE_FSRM
+ ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
movl %edx,%ecx
1: rep movsb
xorl %eax,%eax
@@ -244,6 +214,53 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
SYM_CODE_END(.Lcopy_user_handle_tail)
/*
+ * Finish memcpy of less than 64 bytes. #AC should already be set.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count (< 64)
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(copy_user_short_string)
+ movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz .Lcopy_user_short_string_bytes
+18: movq (%rsi),%r8
+19: movq %r8,(%rdi)
+ leaq 8(%rsi),%rsi
+ leaq 8(%rdi),%rdi
+ decl %ecx
+ jnz 18b
+.Lcopy_user_short_string_bytes:
+ andl %edx,%edx
+ jz 23f
+ movl %edx,%ecx
+21: movb (%rsi),%al
+22: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 21b
+23: xor %eax,%eax
+ ASM_CLAC
+ RET
+
+40: leal (%rdx,%rcx,8),%edx
+ jmp 60f
+50: movl %ecx,%edx /* ecx is zerorest also */
+60: jmp .Lcopy_user_handle_tail
+
+ _ASM_EXTABLE_CPY(18b, 40b)
+ _ASM_EXTABLE_CPY(19b, 40b)
+ _ASM_EXTABLE_CPY(21b, 50b)
+ _ASM_EXTABLE_CPY(22b, 50b)
+SYM_CODE_END(copy_user_short_string)
+
+/*
* copy_user_nocache - Uncached memory copy with exception handling
* This will force destination out of cache for more performance.
*
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index ecb2049c1273..b7dfd60243b7 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -48,6 +48,7 @@ SYM_FUNC_START(__put_user_1)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -62,6 +63,7 @@ SYM_FUNC_START(__put_user_2)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
2: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
@@ -76,6 +78,7 @@ SYM_FUNC_START(__put_user_4)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
3: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
@@ -90,6 +93,7 @@ SYM_FUNC_START(__put_user_8)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
4: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 5f87bab4fb8d..b2b2366885a2 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -31,6 +31,7 @@
.align RETPOLINE_THUNK_SIZE
SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
@@ -55,7 +56,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
.align RETPOLINE_THUNK_SIZE
SYM_CODE_START(__x86_indirect_thunk_array)
- ANNOTATE_NOENDBR // apply_retpolines
#define GEN(reg) THUNK reg
#include <asm/GEN-for-each-reg.h>
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0402a749f3a0..0ae6cf804197 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -119,7 +119,7 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
/* cache copy and flush to align dest */
if (!IS_ALIGNED(dest, 8)) {
- unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
+ size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest);
memcpy((void *) dest, (void *) source, len);
clean_cache_range((void *) dest, len);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 96d34ebb20a9..e2942335d143 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -902,6 +902,8 @@ static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end
static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
+ const unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
vmemmap_flush_unused_pmd();
/*
@@ -914,8 +916,7 @@ static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long
* Mark with PAGE_UNUSED the unused parts of the new memmap range
*/
if (!IS_ALIGNED(start, PMD_SIZE))
- memset((void *)start, PAGE_UNUSED,
- start - ALIGN_DOWN(start, PMD_SIZE));
+ memset((void *)page, PAGE_UNUSED, start - page);
/*
* We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index abf5ed76e4b7..0656db33574d 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -638,17 +638,6 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
}
EXPORT_SYMBOL_GPL(lookup_address);
-/*
- * Lookup the page table entry for a virtual address in a given mm. Return a
- * pointer to the entry and the level of the mapping.
- */
-pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- unsigned int *level)
-{
- return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
-}
-EXPORT_SYMBOL_GPL(lookup_address_in_mm);
-
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb4d91d5365..d400b6d9d246 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -855,13 +855,11 @@ done:
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu)
+static bool tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
}
-static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
-
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
@@ -890,36 +888,11 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables) {
+ if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
- } else {
- /*
- * Although we could have used on_each_cpu_cond_mask(),
- * open-coding it has performance advantages, as it eliminates
- * the need for indirect calls or retpolines. In addition, it
- * allows to use a designated cpumask for evaluating the
- * condition, instead of allocating one.
- *
- * This code works under the assumption that there are no nested
- * TLB flushes, an assumption that is already made in
- * flush_tlb_mm_range().
- *
- * cond_cpumask is logically a stack-local variable, but it is
- * more efficient to have it off the stack and not to allocate
- * it on demand. Preemption is disabled and this code is
- * non-reentrant.
- */
- struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
- int cpu;
-
- cpumask_clear(cond_cpumask);
-
- for_each_cpu(cpu, cpumask) {
- if (tlb_is_not_lazy(cpu))
- __cpumask_set_cpu(cpu, cond_cpumask);
- }
- on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
- }
+ else
+ on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ (void *)info, 1, cpumask);
}
void flush_tlb_multi(const struct cpumask *cpumask,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8fe35ed11fd6..16b6efacf7c6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -412,6 +412,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
EMIT_LFENCE();
EMIT2(0xFF, 0xE0 + reg);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
+ OPTIMIZER_HIDE_VAR(reg);
emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
} else
#endif
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 9bb1e2941179..b94f727251b6 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -467,7 +467,6 @@ static __init void xen_setup_pci_msi(void)
else
xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs;
xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs;
- pci_msi_ignore_mask = 1;
} else if (xen_hvm_domain()) {
xen_msi_ops.setup_msi_irqs = xen_hvm_setup_msi_irqs;
xen_msi_ops.teardown_msi_irqs = xen_teardown_msi_irqs;
@@ -481,6 +480,11 @@ static __init void xen_setup_pci_msi(void)
* in allocating the native domain and never use it.
*/
x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
+ /*
+ * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
+ * controlled by the hypervisor.
+ */
+ pci_msi_ignore_mask = 1;
}
#else /* CONFIG_PCI_MSI */
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 72c1e42d121d..7fe564eaf228 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -50,6 +50,7 @@
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
SYM_CODE_START_LOCAL(pvh_start_xen)
+ UNWIND_HINT_EMPTY
cld
lgdt (_pa(gdt))
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 9f2b251e83c5..bb176c72891c 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -25,6 +25,7 @@
#include <asm/cpu.h>
#include <asm/mmu_context.h>
#include <asm/cpu_device_id.h>
+#include <asm/microcode.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -40,7 +41,8 @@ static void msr_save_context(struct saved_context *ctxt)
struct saved_msr *end = msr + ctxt->saved_msrs.num;
while (msr < end) {
- msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q);
+ if (msr->valid)
+ rdmsrl(msr->info.msr_no, msr->info.reg.q);
msr++;
}
}
@@ -261,11 +263,18 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
- msr_restore_context(ctxt);
c = &cpu_data(smp_processor_id());
if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL))
init_ia32_feat_ctl(c);
+
+ microcode_bsp_resume();
+
+ /*
+ * This needs to happen after the microcode has been updated upon resume
+ * because some of the MSRs are "emulated" in microcode.
+ */
+ msr_restore_context(ctxt);
}
/* Needed by apm.c */
@@ -424,8 +433,10 @@ static int msr_build_context(const u32 *msr_id, const int num)
}
for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
+ u64 dummy;
+
msr_array[i].info.msr_no = msr_id[j];
- msr_array[i].valid = false;
+ msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy);
msr_array[i].info.reg.q = 0;
}
saved_msrs->num = total_num;
@@ -500,10 +511,24 @@ static int pm_cpu_check(const struct x86_cpu_id *c)
return ret;
}
+static void pm_save_spec_msr(void)
+{
+ u32 spec_msr_id[] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_TSX_CTRL,
+ MSR_TSX_FORCE_ABORT,
+ MSR_IA32_MCU_OPT_CTRL,
+ MSR_AMD64_LS_CFG,
+ };
+
+ msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+}
+
static int pm_check_save_msr(void)
{
dmi_check_system(msr_save_dmi_table);
pm_cpu_check(msr_save_cpu_table);
+ pm_save_spec_msr();
return 0;
}
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ac17196e2518..3a2cd93bf059 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -45,6 +45,7 @@ SYM_CODE_END(hypercall_page)
__INIT
SYM_CODE_START(startup_xen)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
cld
/* Clear .bss */