summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/alternative.c22
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c11
-rw-r--r--arch/x86/kernel/callthunks.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c36
-rw-r--r--arch/x86/kernel/cpu/bugs.c17
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c10
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c30
-rw-r--r--arch/x86/kernel/fpu/core.c5
-rw-r--r--arch/x86/kernel/fpu/xstate.c12
-rw-r--r--arch/x86/kernel/fpu/xstate.h3
-rw-r--r--arch/x86/kernel/kgdb.c1
-rw-r--r--arch/x86/kernel/paravirt.c67
-rw-r--r--arch/x86/kernel/process.c7
-rw-r--r--arch/x86/kernel/setup.c8
-rw-r--r--arch/x86/kernel/sev-shared.c122
-rw-r--r--arch/x86/kernel/sev.c33
-rw-r--r--arch/x86/kernel/shstk.c33
-rw-r--r--arch/x86/kernel/smp.c39
-rw-r--r--arch/x86/kernel/smpboot.c39
-rw-r--r--arch/x86/kernel/topology.c2
21 files changed, 262 insertions, 238 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a5ead6a6d233..73be3931e4f0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -403,6 +403,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
u8 insn_buff[MAX_PATCH_LEN];
DPRINTK(ALT, "alt table %px, -> %px", start, end);
+
+ /*
+ * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bound report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
+
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -452,6 +463,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
text_poke_early(instr, insn_buff, insn_buff_sz);
}
+
+ kasan_enable_current();
}
static inline bool is_jcc32(struct insn *insn)
@@ -720,13 +733,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
- /*
- * Do not patch out the default return thunks if those needed are the
- * ones generated by the compiler.
- */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
- (x86_return_thunk == __x86_return_thunk))
- return;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ static_call_force_reinit();
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d9f5d7492f83..205cee567629 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -1533,7 +1533,7 @@ static void __init build_socket_tables(void)
{
struct uv_gam_range_entry *gre = uv_gre_table;
int nums, numn, nump;
- int cpu, i, lnid;
+ int i, lnid, apicid;
int minsock = _min_socket;
int maxsock = _max_socket;
int minpnode = _min_pnode;
@@ -1584,15 +1584,14 @@ static void __init build_socket_tables(void)
/* Set socket -> node values: */
lnid = NUMA_NO_NODE;
- for_each_possible_cpu(cpu) {
- int nid = cpu_to_node(cpu);
- int apicid, sockid;
+ for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) {
+ int nid = __apicid_to_node[apicid];
+ int sockid;
- if (lnid == nid)
+ if ((nid == NUMA_NO_NODE) || (lnid == nid))
continue;
lnid = nid;
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
sockid = apicid >> uv_cpuid.socketid_shift;
if (_socket_to_node[sockid - minsock] == SOCK_EMPTY)
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index c06bfc086565..faa9f2299848 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -272,7 +272,6 @@ void __init callthunks_patch_builtin_calls(void)
pr_info("Setting up call depth tracking\n");
mutex_lock(&text_mutex);
callthunks_setup(&cs, &builtin_coretext);
- static_call_force_reinit();
thunks_initialized = true;
mutex_unlock(&text_mutex);
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd8379d84445..ece2b5b7b0fe 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -80,6 +80,10 @@ static const int amd_div0[] =
AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
+static const int amd_erratum_1485[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf),
+ AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf));
+
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
int osvw_id = *erratum++;
@@ -766,6 +770,15 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_TOPOEXT))
smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ }
+ }
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -1140,6 +1153,10 @@ static void init_amd(struct cpuinfo_x86 *c)
pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
setup_force_cpu_bug(X86_BUG_DIV0);
}
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ cpu_has_amd_erratum(c, amd_erratum_1485))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
}
#ifdef CONFIG_X86_32
@@ -1301,25 +1318,6 @@ void amd_check_microcode(void)
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
-bool cpu_has_ibpb_brtype_microcode(void)
-{
- switch (boot_cpu_data.x86) {
- /* Zen1/2 IBPB flushes branch type predictions too. */
- case 0x17:
- return boot_cpu_has(X86_FEATURE_AMD_IBPB);
- case 0x19:
- /* Poke the MSR bit on Zen3/4 to check its presence. */
- if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
- setup_force_cpu_cap(X86_FEATURE_SBPB);
- return true;
- } else {
- return false;
- }
- default:
- return false;
- }
-}
-
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
* operations.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f081d26616ac..10499bcd4e39 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2404,27 +2404,16 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode;
+ bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
goto pred_cmd;
- /*
- * The first check is for the kernel running as a guest in order
- * for guests to verify whether IBPB is a viable mitigation.
- */
- has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode();
if (!has_microcode) {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
} else {
/*
- * Enable the synthetic (even if in a real CPUID leaf)
- * flags for guests.
- */
- setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
-
- /*
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
*/
@@ -2444,7 +2433,7 @@ static void __init srso_select_mitigation(void)
switch (srso_cmd) {
case SRSO_CMD_OFF:
- return;
+ goto pred_cmd;
case SRSO_CMD_MICROCODE:
if (has_microcode) {
@@ -2717,7 +2706,7 @@ static ssize_t srso_show_state(char *buf)
return sysfs_emit(buf, "%s%s\n",
srso_strings[srso_mitigation],
- (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode");
}
static ssize_t gds_show_state(char *buf)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 382d4e6b848d..4e5ffc8b0e46 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1303,7 +1303,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
- VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
VULNBL_AMD(0x19, SRSO),
{}
};
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index ded1fc7cb7cb..f136ac046851 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -30,15 +30,15 @@ struct rmid_entry {
struct list_head list;
};
-/**
- * @rmid_free_lru A least recently used list of free RMIDs
+/*
+ * @rmid_free_lru - A least recently used list of free RMIDs
* These RMIDs are guaranteed to have an occupancy less than the
* threshold occupancy
*/
static LIST_HEAD(rmid_free_lru);
-/**
- * @rmid_limbo_count count of currently unused but (potentially)
+/*
+ * @rmid_limbo_count - count of currently unused but (potentially)
* dirty RMIDs.
* This counts RMIDs that no one is currently using but that
* may have a occupancy value > resctrl_rmid_realloc_threshold. User can
@@ -46,7 +46,7 @@ static LIST_HEAD(rmid_free_lru);
*/
static unsigned int rmid_limbo_count;
-/**
+/*
* @rmid_entry - The entry in the limbo and free lists.
*/
static struct rmid_entry *rmid_ptrs;
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 91fa70e51004..279148e72459 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
return epc_page;
}
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * to protect the enclave states including SECS and ensure the SECS page is
+ * not swapped out again while being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
struct sgx_encl_page *entry)
{
@@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
return entry;
}
- if (!(encl->secs.epc_page)) {
- epc_page = sgx_encl_eldu(&encl->secs, NULL);
- if (IS_ERR(epc_page))
- return ERR_CAST(epc_page);
- }
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
if (IS_ERR(epc_page))
@@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
epc_page = sgx_alloc_epc_page(encl_page, false);
if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index a86d37052a64..a21a4d0ecc34 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -369,14 +369,15 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
- unsigned int size, u32 pkru)
+ unsigned int size, u64 xfeatures, u32 pkru)
{
struct fpstate *kstate = gfpu->fpstate;
union fpregs_state *ustate = buf;
struct membuf mb = { .p = buf, .left = size };
if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
- __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE);
+ __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru,
+ XSTATE_COPY_XSAVE);
} else {
memcpy(&ustate->fxsave, &kstate->regs.fxsave,
sizeof(ustate->fxsave));
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index cadf68737e6b..ef6906107c54 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1049,6 +1049,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
* __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
* @to: membuf descriptor
* @fpstate: The fpstate buffer from which to copy
+ * @xfeatures: The mask of xfeatures to save (XSAVE mode only)
* @pkru_val: The PKRU value to store in the PKRU component
* @copy_mode: The requested copy mode
*
@@ -1059,7 +1060,8 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
* It supports partial copy but @to.pos always starts from zero.
*/
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
- u32 pkru_val, enum xstate_copy_mode copy_mode)
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode)
{
const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
struct xregs_state *xinit = &init_fpstate.regs.xsave;
@@ -1083,7 +1085,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
break;
case XSTATE_COPY_XSAVE:
- header.xfeatures &= fpstate->user_xfeatures;
+ header.xfeatures &= fpstate->user_xfeatures & xfeatures;
break;
}
@@ -1185,6 +1187,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode copy_mode)
{
__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
+ tsk->thread.fpu.fpstate->user_xfeatures,
tsk->thread.pkru, copy_mode);
}
@@ -1536,10 +1539,7 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
fpregs_restore_userregs();
newfps->xfeatures = curfps->xfeatures | xfeatures;
-
- if (!guest_fpu)
- newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
-
+ newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
newfps->xfd = curfps->xfd & ~xfeatures;
/* Do the final updates within the locked region */
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index a4ecb04d8d64..3518fb26d06b 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -43,7 +43,8 @@ enum xstate_copy_mode {
struct membuf;
extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
- u32 pkru_val, enum xstate_copy_mode copy_mode);
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode mode);
extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 3a43a2dee658..9c9faa1634fb 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -695,7 +695,6 @@ void kgdb_arch_exit(void)
}
/**
- *
* kgdb_skipexception - Bail out of KGDB when we've been triggered.
* @exception: Exception vector number
* @regs: Current &struct pt_regs.
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 975f98d5eee5..97f1436c1a20 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -143,66 +143,7 @@ int paravirt_disable_iospace(void)
return request_resource(&ioport_resource, &reserve_ioports);
}
-static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
-
-static inline void enter_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-
- this_cpu_write(paravirt_lazy_mode, mode);
-}
-
-static void leave_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
-
- this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
-}
-
-void paravirt_enter_lazy_mmu(void)
-{
- enter_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_leave_lazy_mmu(void)
-{
- leave_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_flush_lazy_mmu(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
-
- preempt_enable();
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-void paravirt_start_context_switch(struct task_struct *prev)
-{
- BUG_ON(preemptible());
-
- if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
- }
- enter_lazy(PARAVIRT_LAZY_CPU);
-}
-
-void paravirt_end_context_switch(struct task_struct *next)
-{
- BUG_ON(preemptible());
-
- leave_lazy(PARAVIRT_LAZY_CPU);
-
- if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
- arch_enter_lazy_mmu_mode();
-}
-
static noinstr void pv_native_write_cr2(unsigned long val)
{
native_write_cr2(val);
@@ -229,14 +170,6 @@ static noinstr void pv_native_safe_halt(void)
}
#endif
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
-{
- if (in_interrupt())
- return PARAVIRT_LAZY_NONE;
-
- return this_cpu_read(paravirt_lazy_mode);
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
#ifdef CONFIG_PARAVIRT_XXL
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f0909142a0a..b6f4e8399fca 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -257,13 +257,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
- /*
- * If copy_thread() if failing, don't leak the shadow stack possibly
- * allocated in shstk_alloc_thread_stack() above.
- */
- if (ret)
- shstk_free(p);
-
return ret;
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b9145a63da77..b098b1fa2470 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -358,15 +358,11 @@ static void __init add_early_ima_buffer(u64 phys_addr)
#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
int __init ima_free_kexec_buffer(void)
{
- int rc;
-
if (!ima_kexec_buffer_size)
return -ENOENT;
- rc = memblock_phys_free(ima_kexec_buffer_phys,
- ima_kexec_buffer_size);
- if (rc)
- return rc;
+ memblock_free_late(ima_kexec_buffer_phys,
+ ima_kexec_buffer_size);
ima_kexec_buffer_phys = 0;
ima_kexec_buffer_size = 0;
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 2eabccde94fb..ccb0915e84e1 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -256,7 +256,7 @@ static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
return 0;
}
-static int sev_cpuid_hv(struct cpuid_leaf *leaf)
+static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf)
{
int ret;
@@ -279,6 +279,45 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf)
return ret;
}
+static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ u32 cr4 = native_read_cr4();
+ int ret;
+
+ ghcb_set_rax(ghcb, leaf->fn);
+ ghcb_set_rcx(ghcb, leaf->subfn);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #UD - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ leaf->eax = ghcb->save.rax;
+ leaf->ebx = ghcb->save.rbx;
+ leaf->ecx = ghcb->save.rcx;
+ leaf->edx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf)
+ : __sev_cpuid_hv_msr(leaf);
+}
+
/*
* This may be called early while still running on the initial identity
* mapping. Use RIP-relative addressing to obtain the correct address
@@ -388,19 +427,20 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
return false;
}
-static void snp_cpuid_hv(struct cpuid_leaf *leaf)
+static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
- if (sev_cpuid_hv(leaf))
+ if (sev_cpuid_hv(ghcb, ctxt, leaf))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
}
-static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
+static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ struct cpuid_leaf *leaf)
{
struct cpuid_leaf leaf_hv = *leaf;
switch (leaf->fn) {
case 0x1:
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* initial APIC ID */
leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
@@ -419,7 +459,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
break;
case 0xB:
leaf_hv.subfn = 0;
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* extended APIC ID */
leaf->edx = leaf_hv.edx;
@@ -467,7 +507,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
}
break;
case 0x8000001E:
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* extended APIC ID */
leaf->eax = leaf_hv.eax;
@@ -488,7 +528,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-static int snp_cpuid(struct cpuid_leaf *leaf)
+static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -522,7 +562,7 @@ static int snp_cpuid(struct cpuid_leaf *leaf)
return 0;
}
- return snp_cpuid_postprocess(leaf);
+ return snp_cpuid_postprocess(ghcb, ctxt, leaf);
}
/*
@@ -544,14 +584,14 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
leaf.fn = fn;
leaf.subfn = subfn;
- ret = snp_cpuid(&leaf);
+ ret = snp_cpuid(NULL, NULL, &leaf);
if (!ret)
goto cpuid_done;
if (ret != -EOPNOTSUPP)
goto fail;
- if (sev_cpuid_hv(&leaf))
+ if (__sev_cpuid_hv_msr(&leaf))
goto fail;
cpuid_done:
@@ -592,6 +632,23 @@ fail:
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
}
+static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt,
+ unsigned long address,
+ bool write)
+{
+ if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_USER;
+ ctxt->fi.cr2 = address;
+ if (write)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return ES_EXCEPTION;
+ }
+
+ return ES_OK;
+}
+
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
void *src, char *buf,
unsigned int data_size,
@@ -599,7 +656,12 @@ static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
bool backwards)
{
int i, b = backwards ? -1 : 1;
- enum es_result ret = ES_OK;
+ unsigned long address = (unsigned long)src;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, false);
+ if (ret != ES_OK)
+ return ret;
for (i = 0; i < count; i++) {
void *s = src + (i * data_size * b);
@@ -620,7 +682,12 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
bool backwards)
{
int i, s = backwards ? -1 : 1;
- enum es_result ret = ES_OK;
+ unsigned long address = (unsigned long)dst;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, true);
+ if (ret != ES_OK)
+ return ret;
for (i = 0; i < count; i++) {
void *d = dst + (i * data_size * s);
@@ -656,6 +723,9 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
{
struct insn *insn = &ctxt->insn;
+ size_t size;
+ u64 port;
+
*exitinfo = 0;
switch (insn->opcode.bytes[0]) {
@@ -664,7 +734,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0x6d:
*exitinfo |= IOIO_TYPE_INS;
*exitinfo |= IOIO_SEG_ES;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
/* OUTS opcodes */
@@ -672,41 +742,43 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0x6f:
*exitinfo |= IOIO_TYPE_OUTS;
*exitinfo |= IOIO_SEG_DS;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
/* IN immediate opcodes */
case 0xe4:
case 0xe5:
*exitinfo |= IOIO_TYPE_IN;
- *exitinfo |= (u8)insn->immediate.value << 16;
+ port = (u8)insn->immediate.value & 0xffff;
break;
/* OUT immediate opcodes */
case 0xe6:
case 0xe7:
*exitinfo |= IOIO_TYPE_OUT;
- *exitinfo |= (u8)insn->immediate.value << 16;
+ port = (u8)insn->immediate.value & 0xffff;
break;
/* IN register opcodes */
case 0xec:
case 0xed:
*exitinfo |= IOIO_TYPE_IN;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
/* OUT register opcodes */
case 0xee:
case 0xef:
*exitinfo |= IOIO_TYPE_OUT;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ port = ctxt->regs->dx & 0xffff;
break;
default:
return ES_DECODE_FAILED;
}
+ *exitinfo |= port << 16;
+
switch (insn->opcode.bytes[0]) {
case 0x6c:
case 0x6e:
@@ -716,12 +788,15 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0xee:
/* Single byte opcodes */
*exitinfo |= IOIO_DATA_8;
+ size = 1;
break;
default:
/* Length determined by instruction parsing */
*exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
: IOIO_DATA_32;
+ size = (insn->opnd_bytes == 2) ? 2 : 4;
}
+
switch (insn->addr_bytes) {
case 2:
*exitinfo |= IOIO_ADDR_16;
@@ -737,7 +812,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
if (insn_has_rep_prefix(insn))
*exitinfo |= IOIO_REP;
- return ES_OK;
+ return vc_ioio_check(ctxt, (u16)port, size);
}
static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
@@ -848,14 +923,15 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ret;
}
-static int vc_handle_cpuid_snp(struct pt_regs *regs)
+static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
+ struct pt_regs *regs = ctxt->regs;
struct cpuid_leaf leaf;
int ret;
leaf.fn = regs->ax;
leaf.subfn = regs->cx;
- ret = snp_cpuid(&leaf);
+ ret = snp_cpuid(ghcb, ctxt, &leaf);
if (!ret) {
regs->ax = leaf.eax;
regs->bx = leaf.ebx;
@@ -874,7 +950,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
enum es_result ret;
int snp_cpuid_ret;
- snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+ snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
if (!snp_cpuid_ret)
return ES_OK;
if (snp_cpuid_ret != -EOPNOTSUPP)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 2787826d9f60..6395bfd87b68 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -524,6 +524,33 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
return ES_OK;
}
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ BUG_ON(size > 4);
+
+ if (user_mode(ctxt->regs)) {
+ struct thread_struct *t = &current->thread;
+ struct io_bitmap *iobm = t->io_bitmap;
+ size_t idx;
+
+ if (!iobm)
+ goto fault;
+
+ for (idx = port; idx < port + size; ++idx) {
+ if (test_bit(idx, iobm->bitmap))
+ goto fault;
+ }
+ }
+
+ return ES_OK;
+
+fault:
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+
+ return ES_EXCEPTION;
+}
+
/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"
@@ -868,8 +895,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
void snp_accept_memory(phys_addr_t start, phys_addr_t end)
{
- unsigned long vaddr;
- unsigned int npages;
+ unsigned long vaddr, npages;
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return;
@@ -1509,6 +1535,9 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ES_DECODE_FAILED;
}
+ if (user_mode(ctxt->regs))
+ return ES_UNSUPPORTED;
+
switch (mmio) {
case INSN_MMIO_WRITE:
memcpy(ghcb->shared_buffer, reg_data, bytes);
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index fd689921a1db..59e15dd8d0f8 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -205,10 +205,21 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl
return 0;
/*
- * For CLONE_VM, except vfork, the child needs a separate shadow
+ * For CLONE_VFORK the child will share the parents shadow stack.
+ * Make sure to clear the internal tracking of the thread shadow
+ * stack so the freeing logic run for child knows to leave it alone.
+ */
+ if (clone_flags & CLONE_VFORK) {
+ shstk->base = 0;
+ shstk->size = 0;
+ return 0;
+ }
+
+ /*
+ * For !CLONE_VM the child will use a copy of the parents shadow
* stack.
*/
- if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+ if (!(clone_flags & CLONE_VM))
return 0;
size = adjust_shstk_size(stack_size);
@@ -408,7 +419,25 @@ void shstk_free(struct task_struct *tsk)
if (!tsk->mm || tsk->mm != current->mm)
return;
+ /*
+ * If shstk->base is NULL, then this task is not managing its
+ * own shadow stack (CLONE_VFORK). So skip freeing it.
+ */
+ if (!shstk->base)
+ return;
+
+ /*
+ * shstk->base is NULL for CLONE_VFORK child tasks, and so is
+ * normal. But size = 0 on a shstk->base is not normal and
+ * indicated an attempt to free the thread shadow stack twice.
+ * Warn about it.
+ */
+ if (WARN_ON(!shstk->size))
+ return;
+
unmap_shadow_stack(shstk->base, shstk->size);
+
+ shstk->size = 0;
}
static int wrss_control(bool enable)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 6eb06d001bcc..96a771f9f930 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -131,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
}
/*
- * Disable virtualization, APIC etc. and park the CPU in a HLT loop
+ * this function calls the 'stop' function on all other CPUs in the system.
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
@@ -172,17 +172,13 @@ static void native_stop_other_cpus(int wait)
* 2) Wait for all other CPUs to report that they reached the
* HLT loop in stop_this_cpu()
*
- * 3) If the system uses INIT/STARTUP for CPU bringup, then
- * send all present CPUs an INIT vector, which brings them
- * completely out of the way.
+ * 3) If #2 timed out send an NMI to the CPUs which did not
+ * yet report
*
- * 4) If #3 is not possible and #2 timed out send an NMI to the
- * CPUs which did not yet report
- *
- * 5) Wait for all other CPUs to report that they reached the
+ * 4) Wait for all other CPUs to report that they reached the
* HLT loop in stop_this_cpu()
*
- * #4 can obviously race against a CPU reaching the HLT loop late.
+ * #3 can obviously race against a CPU reaching the HLT loop late.
* That CPU will have reported already and the "have all CPUs
* reached HLT" condition will be true despite the fact that the
* other CPU is still handling the NMI. Again, there is no
@@ -198,7 +194,7 @@ static void native_stop_other_cpus(int wait)
/*
* Don't wait longer than a second for IPI completion. The
* wait request is not checked here because that would
- * prevent an NMI/INIT shutdown in case that not all
+ * prevent an NMI shutdown attempt in case that not all
* CPUs reach shutdown state.
*/
timeout = USEC_PER_SEC;
@@ -206,27 +202,7 @@ static void native_stop_other_cpus(int wait)
udelay(1);
}
- /*
- * Park all other CPUs in INIT including "offline" CPUs, if
- * possible. That's a safe place where they can't resume execution
- * of HLT and then execute the HLT loop from overwritten text or
- * page tables.
- *
- * The only downside is a broadcast MCE, but up to the point where
- * the kexec() kernel brought all APs online again an MCE will just
- * make HLT resume and handle the MCE. The machine crashes and burns
- * due to overwritten text, page tables and data. So there is a
- * choice between fire and frying pan. The result is pretty much
- * the same. Chose frying pan until x86 provides a sane mechanism
- * to park a CPU.
- */
- if (smp_park_other_cpus_in_init())
- goto done;
-
- /*
- * If park with INIT was not possible and the REBOOT_VECTOR didn't
- * take all secondary CPUs offline, try with the NMI.
- */
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
if (!cpumask_empty(&cpus_stop_mask)) {
/*
* If NMI IPI is enabled, try to register the stop handler
@@ -249,7 +225,6 @@ static void native_stop_other_cpus(int wait)
udelay(1);
}
-done:
local_irq_save(flags);
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4e45ff44aa07..2a187c0cbd5b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,7 +579,6 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -603,7 +602,14 @@ static int x86_cluster_flags(void)
return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif
-#endif
+
+static int x86_die_flags(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return x86_sched_itmt_flags();
+
+ return 0;
+}
/*
* Set if a package/die has multiple NUMA nodes inside.
@@ -640,7 +646,7 @@ static void __init build_sched_topology(void)
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, SD_INIT_NAME(DIE)
+ cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
};
}
@@ -1234,33 +1240,6 @@ void arch_thaw_secondary_cpus_end(void)
cache_aps_init();
}
-bool smp_park_other_cpus_in_init(void)
-{
- unsigned int cpu, this_cpu = smp_processor_id();
- unsigned int apicid;
-
- if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu)
- return false;
-
- /*
- * If this is a crash stop which does not execute on the boot CPU,
- * then this cannot use the INIT mechanism because INIT to the boot
- * CPU will reset the machine.
- */
- if (this_cpu)
- return false;
-
- for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) {
- if (cpu == this_cpu)
- continue;
- apicid = apic->cpu_present_to_apicid(cpu);
- if (apicid == BAD_APICID)
- continue;
- send_init_sequence(apicid);
- }
- return true;
-}
-
/*
* Early setup to make printk work.
*/
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index ca004e2e4469..0bab03130033 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -54,7 +54,7 @@ void arch_unregister_cpu(int num)
EXPORT_SYMBOL(arch_unregister_cpu);
#else /* CONFIG_HOTPLUG_CPU */
-static int __init arch_register_cpu(int num)
+int __init arch_register_cpu(int num)
{
return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
}