Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 6
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 9
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 15
-rw-r--r--  arch/x86/kernel/alternative.c | 191
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 27
-rw-r--r--  arch/x86/kernel/audit_64.c | 10
-rw-r--r--  arch/x86/kernel/cc_platform.c | 69
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 13
-rw-r--r--  arch/x86/kernel/cpu/cacheinfo.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 52
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/cpuid-deps.c | 2
-rw-r--r--  arch/x86/kernel/cpu/hygon.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 13
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 292
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h | 71
-rw-r--r--  arch/x86/kernel/cpu/mce/p5.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mce/severity.c | 33
-rw-r--r--  arch/x86/kernel/cpu/mce/winchip.c | 6
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 14
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 17
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 5
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 6
-rw-r--r--  arch/x86/kernel/cpu/sgx/virt.c | 65
-rw-r--r--  arch/x86/kernel/cpu/vortex.c | 39
-rw-r--r--  arch/x86/kernel/crash_dump_64.c | 4
-rw-r--r--  arch/x86/kernel/devicetree.c | 10
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 6
-rw-r--r--  arch/x86/kernel/early-quirks.c | 6
-rw-r--r--  arch/x86/kernel/fpu/bugs.c | 2
-rw-r--r--  arch/x86/kernel/fpu/context.h | 83
-rw-r--r--  arch/x86/kernel/fpu/core.c | 392
-rw-r--r--  arch/x86/kernel/fpu/init.c | 76
-rw-r--r--  arch/x86/kernel/fpu/internal.h | 28
-rw-r--r--  arch/x86/kernel/fpu/legacy.h | 115
-rw-r--r--  arch/x86/kernel/fpu/regset.c | 36
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 292
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 898
-rw-r--r--  arch/x86/kernel/fpu/xstate.h | 278
-rw-r--r--  arch/x86/kernel/ftrace.c | 76
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 30
-rw-r--r--  arch/x86/kernel/head64.c | 9
-rw-r--r--  arch/x86/kernel/hpet.c | 81
-rw-r--r--  arch/x86/kernel/irq.c | 4
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irqflags.S | 2
-rw-r--r--  arch/x86/kernel/itmt.c | 2
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 71
-rw-r--r--  arch/x86/kernel/kprobes/ftrace.c | 2
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 6
-rw-r--r--  arch/x86/kernel/kvm.c | 3
-rw-r--r--  arch/x86/kernel/kvmclock.c | 17
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 19
-rw-r--r--  arch/x86/kernel/module.c | 9
-rw-r--r--  arch/x86/kernel/paravirt.c | 45
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 9
-rw-r--r--  arch/x86/kernel/probe_roms.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 93
-rw-r--r--  arch/x86/kernel/process_32.c | 5
-rw-r--r--  arch/x86/kernel/process_64.c | 5
-rw-r--r--  arch/x86/kernel/ptrace.c | 2
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 2
-rw-r--r--  arch/x86/kernel/setup.c | 30
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 2
-rw-r--r--  arch/x86/kernel/sev-shared.c | 68
-rw-r--r--  arch/x86/kernel/sev.c | 74
-rw-r--r--  arch/x86/kernel/signal.c | 83
-rw-r--r--  arch/x86/kernel/smpboot.c | 48
-rw-r--r--  arch/x86/kernel/trace.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 100
-rw-r--r--  arch/x86/kernel/umip.c | 8
-rw-r--r--  arch/x86/kernel/unwind_frame.c | 3
-rw-r--r--  arch/x86/kernel/unwind_guess.c | 3
-rw-r--r--  arch/x86/kernel/unwind_orc.c | 21
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 14
78 files changed, 3054 insertions, 1101 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f4e8fa6ed75..2ff3e600f426 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
CFLAGS_REMOVE_sev.o = -pg
+CFLAGS_REMOVE_cc_platform.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -29,6 +30,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
KASAN_SANITIZE_sev.o := n
+KASAN_SANITIZE_cc_platform.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
@@ -47,6 +49,7 @@ endif
KCOV_INSTRUMENT := n
CFLAGS_head$(BITS).o += -fno-stack-protector
+CFLAGS_cc_platform.o += -fno-stack-protector
CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
@@ -147,6 +150,9 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
+
+obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 14bcd59bcdee..5b6d1a95776f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -62,6 +62,7 @@ int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool acpi_support_online_capable;
#endif
#ifdef CONFIG_X86_IO_APIC
@@ -138,6 +139,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->header.revision >= 5)
+ acpi_support_online_capable = true;
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
@@ -239,6 +242,12 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
if (processor->id == 0xff)
return 0;
+ /* don't register processors that cannot be onlined */
+ if (acpi_support_online_capable &&
+ !(processor->lapic_flags & ACPI_MADT_ENABLED) &&
+ !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 7de599eba7f0..7945eae5b315 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -79,6 +79,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
*/
flags->bm_control = 0;
}
+ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) {
+ /*
+ * For all AMD Zen or newer CPUs that support C3, caches
+ * should not be flushed by software while entering C3
+ * type state. Set bm_check to 1 so that the kernel doesn't
+ * need to execute a cache flush operation.
+ */
+ flags->bm_check = 1;
+ /*
+ * In current AMD C state implementation ARB_DIS is no longer
+ * used. So set bm_control to zero to indicate ARB_DIS is not
+ * required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e9da3dc71254..23fb4d51a5da 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -29,6 +29,7 @@
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
+#include <asm/asm-prototypes.h>
int __read_mostly alternatives_patched;
@@ -113,6 +114,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
}
}
+extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -221,7 +223,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
+static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
struct insn insn;
int i = 0;
@@ -239,11 +241,11 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins
* optimized.
*/
if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- i += optimize_nops_range(instr, a->instrlen, i);
+ i += optimize_nops_range(instr, len, i);
else
i += insn.length;
- if (i >= a->instrlen)
+ if (i >= len)
return;
}
}
@@ -331,10 +333,185 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
text_poke_early(instr, insn_buff, insn_buff_sz);
next:
- optimize_nops(a, instr);
+ optimize_nops(instr, a->instrlen);
}
}
+#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION)
+
+/*
+ * CALL/JMP *%\reg
+ */
+static int emit_indirect(int op, int reg, u8 *bytes)
+{
+ int i = 0;
+ u8 modrm;
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ modrm = 0x10; /* Reg = 2; CALL r/m */
+ break;
+
+ case JMP32_INSN_OPCODE:
+ modrm = 0x20; /* Reg = 4; JMP r/m */
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+
+ modrm |= 0xc0; /* Mod = 3 */
+ modrm += reg;
+
+ bytes[i++] = 0xff; /* opcode */
+ bytes[i++] = modrm;
+
+ return i;
+}
+
+/*
+ * Rewrite the compiler generated retpoline thunk calls.
+ *
+ * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
+ * indirect instructions, avoiding the extra indirection.
+ *
+ * For example, convert:
+ *
+ * CALL __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * CALL *%\reg
+ *
+ * It also tries to inline spectre_v2=retpoline,amd when size permits.
+ */
+static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
+{
+ retpoline_thunk_t *target;
+ int reg, ret, i = 0;
+ u8 op, cc;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = target - __x86_indirect_thunk_array;
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
+
+ /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
+ BUG_ON(reg == 4);
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD))
+ return -1;
+
+ op = insn->opcode.bytes[0];
+
+ /*
+ * Convert:
+ *
+ * Jcc.d32 __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * Jncc.d8 1f
+ * [ LFENCE ]
+ * JMP *%\reg
+ * [ NOP ]
+ * 1:
+ */
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
+ cc = insn->opcode.bytes[1] & 0xf;
+ cc ^= 1; /* invert condition */
+
+ bytes[i++] = 0x70 + cc; /* Jcc.d8 */
+ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
+
+ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
+ op = JMP32_INSN_OPCODE;
+ }
+
+ /*
+ * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) {
+ bytes[i++] = 0x0f;
+ bytes[i++] = 0xae;
+ bytes[i++] = 0xe8; /* LFENCE */
+ }
+
+ ret = emit_indirect(op, reg, bytes + i);
+ if (ret < 0)
+ return ret;
+ i += ret;
+
+ for (; i < insn->length;)
+ bytes[i++] = BYTES_NOP1;
+
+ return i;
+}
+
+/*
+ * Generated by 'objtool --retpoline'.
+ */
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op1, op2;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op1 = insn.opcode.bytes[0];
+ op2 = insn.opcode.bytes[1];
+
+ switch (op1) {
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ break;
+
+ case 0x0f: /* escape */
+ if (op2 >= 0x80 && op2 <= 0x8f)
+ break;
+ fallthrough;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_retpoline(addr, &insn, bytes);
+ if (len == insn.length) {
+ optimize_nops(bytes, len);
+ DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+
+#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+
+#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */
+
#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
@@ -643,6 +820,12 @@ void __init alternative_instructions(void)
apply_paravirt(__parainstructions, __parainstructions_end);
/*
+ * Rewrite the retpolines, must be done before alternatives since
+ * those can rewrite the retpoline thunks.
+ */
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+
+ /*
* Then patch alternatives, such that those paravirt calls that are in
* alternatives can be overwritten by their immediate fragments.
*/
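
For illustration only (not part of the patch): a minimal user-space sketch of the "FF /2" (CALL r/m64) / "FF /4" (JMP r/m64) encoding that emit_indirect() above generates, assuming the kernel's register numbering (0 = rax ... 15 = r15); emit_indirect_demo() and main() are demo-only names.

/* Reproduce the bytes the retpoline patcher emits for CALL/JMP *%reg. */
#include <stdio.h>

static int emit_indirect_demo(int is_call, int reg, unsigned char *bytes)
{
	int i = 0;
	unsigned char modrm = is_call ? 0x10 : 0x20;	/* ModRM.reg: /2 = CALL, /4 = JMP */

	if (reg >= 8) {
		bytes[i++] = 0x41;			/* REX.B prefix for r8-r15 */
		reg -= 8;
	}

	bytes[i++] = 0xff;				/* one-byte opcode */
	bytes[i++] = 0xc0 | modrm | reg;		/* Mod = 3 (register direct) */
	return i;
}

int main(void)
{
	unsigned char buf[3];
	int i, n = emit_indirect_demo(1, 11, buf);	/* CALL *%r11 */

	for (i = 0; i < n; i++)
		printf("%02x ", buf[i]);		/* prints: 41 ff d3 */
	printf("\n");
	return 0;
}
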
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb69a88..e696e22d0531 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,15 @@ struct cluster_mask {
struct cpumask mask;
};
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online CPUs in a sequential way.
+ * Using a per-CPU variable would cost one cache line per CPU.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
+
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
static struct cluster_mask *cluster_hotplug_mask;
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
static void x2apic_send_IPI(int cpu, int vector)
{
- u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ u32 dest = x86_cpu_to_logical_apicid[cpu];
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
dest = 0;
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
- dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+ dest |= x86_cpu_to_logical_apicid[clustercpu];
if (!dest)
continue;
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)
static u32 x2apic_calc_apicid(unsigned int cpu)
{
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ return x86_cpu_to_logical_apicid[cpu];
}
static void init_x2apic_ldr(void)
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
u32 cluster, apicid = apic_read(APIC_LDR);
unsigned int cpu;
- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
if (cmsk)
goto update;
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
static int x2apic_cluster_probe(void)
{
+ u32 slots;
+
if (!x2apic_mode)
return 0;
+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
return 0;
}
init_x2apic_ldr();
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 83d9cad4e68b..44c3601cfdc4 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -47,14 +47,16 @@ int audit_classify_syscall(int abi, unsigned syscall)
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_execve:
case __NR_execveat:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
new file mode 100644
index 000000000000..03bb2f343ddb
--- /dev/null
+++ b/arch/x86/kernel/cc_platform.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#include <linux/export.h>
+#include <linux/cc_platform.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/processor.h>
+
+static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_INTEL_TDX_GUEST
+ return false;
+#else
+ return false;
+#endif
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * cc_platform_has() function is used for this. When a distinction isn't
+ * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+static bool amd_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return sme_me_mask;
+
+ case CC_ATTR_HOST_MEM_ENCRYPT:
+ return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED);
+
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+
+ case CC_ATTR_GUEST_STATE_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+
+ default:
+ return false;
+ }
+#else
+ return false;
+#endif
+}
+
+
+bool cc_platform_has(enum cc_attr attr)
+{
+ if (sme_me_mask)
+ return amd_cc_platform_has(attr);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
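
For illustration only (not part of the patch): a minimal sketch of how a call site would use the new helper. need_bounce_buffers() is a made-up example function; cc_platform_has() and CC_ATTR_GUEST_MEM_ENCRYPT are the interfaces added above.

/* Example caller: query the confidential-computing attribute instead of
 * checking SME/SEV state directly. */
#include <linux/cc_platform.h>

static bool need_bounce_buffers(void)
{
	/* SEV guests see their memory as encrypted, so DMA must go through
	 * unencrypted SWIOTLB bounce buffers; host-only SME does not
	 * require this. */
	return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT);
}
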
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 637b499450d1..9661e3e802be 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
+obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2131af9f2fa2..4edb6f0f628c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ecfca3bbcd96..1c1f218a701d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -22,7 +22,7 @@
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
@@ -758,11 +758,11 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
case SPECTRE_V2_USER_CMD_FORCE:
mode = SPECTRE_V2_USER_STRICT;
break;
+ case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
mode = SPECTRE_V2_USER_PRCTL;
break;
- case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_SECCOMP:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
if (IS_ENABLED(CONFIG_SECCOMP))
@@ -882,13 +882,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
- boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
- return SPECTRE_V2_CMD_AUTO;
- }
-
spec_v2_print_cond(mitigation_options[i].option,
mitigation_options[i].secure);
return cmd;
@@ -1169,7 +1162,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
return mode;
switch (cmd) {
- case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_SECCOMP:
/*
* Choose prctl+seccomp as the default mode if seccomp is
@@ -1183,6 +1175,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
case SPEC_STORE_BYPASS_CMD_ON:
mode = SPEC_STORE_BYPASS_DISABLE;
break;
+ case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_PRCTL:
mode = SPEC_STORE_BYPASS_PRCTL;
break;
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index b5e36bd0425b..fe98a1465be6 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
l2 = new_l2;
#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l2_id;
+ per_cpu(cpu_l2c_id, cpu) = l2_id;
#endif
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0f8885949e8c..0083464de5e3 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -42,7 +42,7 @@
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/mtrr.h>
#include <asm/hwcap2.h>
#include <linux/numa.h>
@@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu)
}
EXPORT_SYMBOL_GPL(get_llc_id);
+/* L2 cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
@@ -326,6 +329,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_SMAP
cr4_set_bits(X86_CR4_SMAP);
#else
+ clear_cpu_cap(c, X86_FEATURE_SMAP);
cr4_clear_bits(X86_CR4_SMAP);
#endif
}
@@ -1044,6 +1048,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
@@ -1395,9 +1401,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1418,10 +1423,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as an HV guest, then the HV has decided not to advertise
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1457,8 +1495,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 95521302630d..ee6f23f7587d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index defda61f372d..cb2fdd130aae 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -75,6 +75,8 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
{ X86_FEATURE_SGX1, X86_FEATURE_SGX },
{ X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_XFD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{}
};
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6d50136f7ab9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 08831acc1d03..fc85eb17cb6d 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -119,7 +119,7 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);
-static enum smca_bank_types smca_get_bank_type(unsigned int bank)
+enum smca_bank_types smca_get_bank_type(unsigned int bank)
{
struct smca_bank *b;
@@ -132,6 +132,7 @@ static enum smca_bank_types smca_get_bank_type(unsigned int bank)
return b->hwid->bank_type;
}
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype } */
@@ -526,7 +527,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
/* Fall back to method we used for older processors: */
switch (block) {
case 0:
- addr = msr_ops.misc(bank);
+ addr = mca_msr_reg(bank, MCA_MISC);
break;
case 1:
offset = ((low & MASK_BLKPTR_LO) >> 21);
@@ -978,8 +979,8 @@ static void log_error_deferred(unsigned int bank)
{
bool defrd;
- defrd = _log_error_bank(bank, msr_ops.status(bank),
- msr_ops.addr(bank), 0);
+ defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+ mca_msr_reg(bank, MCA_ADDR), 0);
if (!mce_flags.smca)
return;
@@ -1009,7 +1010,7 @@ static void amd_deferred_error_interrupt(void)
static void log_error_thresholding(unsigned int bank, u64 misc)
{
- _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+ _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
}
static void log_and_reset_block(struct threshold_block *block)
@@ -1397,7 +1398,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
}
}
- err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+ err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 193204aee880..6ed365337a3b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -121,8 +121,6 @@ mce_banks_t mce_banks_ce_disabled;
static struct work_struct mce_work;
static struct irq_work mce_irq_work;
-static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -176,53 +174,27 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-static inline u32 ctl_reg(int bank)
-{
- return MSR_IA32_MCx_CTL(bank);
-}
-
-static inline u32 status_reg(int bank)
-{
- return MSR_IA32_MCx_STATUS(bank);
-}
-
-static inline u32 addr_reg(int bank)
-{
- return MSR_IA32_MCx_ADDR(bank);
-}
-
-static inline u32 misc_reg(int bank)
+u32 mca_msr_reg(int bank, enum mca_msr reg)
{
- return MSR_IA32_MCx_MISC(bank);
-}
-
-static inline u32 smca_ctl_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_CTL(bank);
-}
-
-static inline u32 smca_status_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_STATUS(bank);
-}
+ if (mce_flags.smca) {
+ switch (reg) {
+ case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank);
+ }
+ }
-static inline u32 smca_addr_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_ADDR(bank);
-}
+ switch (reg) {
+ case MCA_CTL: return MSR_IA32_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_IA32_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank);
+ }
-static inline u32 smca_misc_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_MISC(bank);
+ return 0;
}
-struct mca_msr_regs msr_ops = {
- .ctl = ctl_reg,
- .status = status_reg,
- .addr = addr_reg,
- .misc = misc_reg
-};
-
static void __print_mce(struct mce *m)
{
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
@@ -362,24 +334,27 @@ static int msr_to_offset(u32 msr)
if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
- if (msr == msr_ops.status(bank))
+ if (msr == mca_msr_reg(bank, MCA_STATUS))
return offsetof(struct mce, status);
- if (msr == msr_ops.addr(bank))
+ if (msr == mca_msr_reg(bank, MCA_ADDR))
return offsetof(struct mce, addr);
- if (msr == msr_ops.misc(bank))
+ if (msr == mca_msr_reg(bank, MCA_MISC))
return offsetof(struct mce, misc);
if (msr == MSR_IA32_MCG_STATUS)
return offsetof(struct mce, mcgstatus);
return -1;
}
-__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{
- pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ if (wrmsr) {
+ pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
+ regs->ip, (void *)regs->ip);
+ } else {
+ pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ }
show_stack_regs(regs);
@@ -387,8 +362,6 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
while (true)
cpu_relax();
-
- return true;
}
/* MSR access wrappers used for error injection */
@@ -420,32 +393,13 @@ static noinstr u64 mce_rdmsrl(u32 msr)
*/
asm volatile("1: rdmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
: EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
}
-__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
-{
- pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
- regs->ip, (void *)regs->ip);
-
- show_stack_regs(regs);
-
- panic("MCA architectural violation!\n");
-
- while (true)
- cpu_relax();
-
- return true;
-}
-
static noinstr void mce_wrmsrl(u32 msr, u64 v)
{
u32 low, high;
@@ -470,7 +424,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
/* See comment in mce_rdmsrl() */
asm volatile("1: wrmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
@@ -685,10 +639,10 @@ static struct notifier_block mce_default_nb = {
static void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
- m->misc = mce_rdmsrl(msr_ops.misc(i));
+ m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrl(msr_ops.addr(i));
+ m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
@@ -758,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.bank = i;
barrier();
- m.status = mce_rdmsrl(msr_ops.status(i));
+ m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
@@ -826,7 +780,7 @@ clear_it:
/*
* Clear state for this bank.
*/
- mce_wrmsrl(msr_ops.status(i), 0);
+ mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
/*
@@ -841,6 +795,34 @@ clear_it:
EXPORT_SYMBOL_GPL(machine_check_poll);
/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
+/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
@@ -851,13 +833,13 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- m->status = mce_rdmsrl(msr_ops.status(i));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
__set_bit(i, validp);
- if (quirk_no_way_out)
- quirk_no_way_out(i, m, regs);
+ if (mce_flags.snb_ifu_quirk)
+ quirk_sandybridge_ifu(i, m, regs);
m->bank = i;
if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
@@ -1144,7 +1126,7 @@ static void mce_clear_state(unsigned long *toclear)
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (test_bit(i, toclear))
- mce_wrmsrl(msr_ops.status(i), 0);
+ mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1203,7 +1185,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
m->addr = 0;
m->bank = i;
- m->status = mce_rdmsrl(msr_ops.status(i));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -1272,7 +1254,7 @@ static void kill_me_maybe(struct callback_head *cb)
flags |= MF_MUST_KILL;
ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
- if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+ if (!ret) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
sync_core();
return;
@@ -1286,15 +1268,21 @@ static void kill_me_maybe(struct callback_head *cb)
if (ret == -EHWPOISON)
return;
- if (p->mce_vaddr != (void __user *)-1l) {
- force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
- } else {
- pr_err("Memory error not recovered");
- kill_me_now(cb);
- }
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+}
+
+static void kill_me_never(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
+ pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
}
-static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
+static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
{
int count = ++current->mce_count;
@@ -1304,11 +1292,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
current->mce_kflags = m->kflags;
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(m);
-
- if (kill_current_task)
- current->mce_kill_me.func = kill_me_now;
- else
- current->mce_kill_me.func = kill_me_maybe;
+ current->mce_kill_me.func = func;
}
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
@@ -1326,6 +1310,15 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
task_work_add(current, &current->mce_kill_me, TWA_RESUME);
}
+/* Handle unconfigured int18 (should never happen) */
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+ smp_processor_id());
+ instrumentation_end();
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1346,36 +1339,43 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
*/
noinstr void do_machine_check(struct pt_regs *regs)
{
+ int worst = 0, order, no_way_out, kill_current_task, lmce;
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
char *msg = NULL;
- int worst = 0;
+
+ if (unlikely(mce_flags.p5))
+ return pentium_machine_check(regs);
+ else if (unlikely(mce_flags.winchip))
+ return winchip_machine_check(regs);
+ else if (unlikely(!mca_cfg.initialized))
+ return unexpected_machine_check(regs);
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
*/
- int order = -1;
+ order = -1;
/*
* If no_way_out gets set, there is no safe way to recover from this
* MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
*/
- int no_way_out = 0;
+ no_way_out = 0;
/*
* If kill_current_task is not set, there might be a way to recover from this
* error.
*/
- int kill_current_task = 0;
+ kill_current_task = 0;
/*
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
* on Intel.
*/
- int lmce = 1;
+ lmce = 1;
this_cpu_inc(mce_exception_count);
@@ -1459,7 +1459,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- queue_task_work(&m, msg, kill_current_task);
+ if (kill_current_task)
+ queue_task_work(&m, msg, kill_me_now);
+ else
+ queue_task_work(&m, msg, kill_me_maybe);
} else {
/*
@@ -1477,7 +1480,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, msg, kill_current_task);
+ queue_task_work(&m, msg, kill_me_never);
}
out:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
@@ -1687,8 +1690,8 @@ static void __mcheck_cpu_init_clear_banks(void)
if (!b->init)
continue;
- wrmsrl(msr_ops.ctl(i), b->ctl);
- wrmsrl(msr_ops.status(i), 0);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1714,39 +1717,11 @@ static void __mcheck_cpu_check_banks(void)
if (!b->init)
continue;
- rdmsrl(msr_ops.ctl(i), msrval);
+ rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
b->init = !!msrval;
}
}
-/*
- * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
- * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
- * Vol 3B Table 15-20). But this confuses both the code that determines
- * whether the machine check occurred in kernel or user mode, and also
- * the severity assessment code. Pretend that EIPV was set, and take the
- * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
- */
-static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
- if (bank != 0)
- return;
- if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
- return;
- if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
- MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
- MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
- MCACOD)) !=
- (MCI_STATUS_UC|MCI_STATUS_EN|
- MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
- MCI_STATUS_AR|MCACOD_INSTR))
- return;
-
- m->mcgstatus |= MCG_STATUS_EIPV;
- m->ip = regs->ip;
- m->cs = regs->cs;
-}
-
/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
@@ -1820,7 +1795,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
cfg->bootlog = 0;
if (c->x86 == 6 && c->x86_model == 45)
- quirk_no_way_out = quirk_sandybridge_ifu;
+ mce_flags.snb_ifu_quirk = 1;
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
@@ -1850,9 +1825,11 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
+ mce_flags.p5 = 1;
return 1;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
+ mce_flags.winchip = 1;
return 1;
default:
return 0;
@@ -1871,13 +1848,6 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
mce_flags.amd_threshold = 1;
-
- if (mce_flags.smca) {
- msr_ops.ctl = smca_ctl_reg;
- msr_ops.status = smca_status_reg;
- msr_ops.addr = smca_addr_reg;
- msr_ops.misc = smca_misc_reg;
- }
}
}
@@ -2007,18 +1977,6 @@ bool filter_mce(struct mce *m)
return false;
}
-/* Handle unconfigured int18 (should never happen) */
-static noinstr void unexpected_machine_check(struct pt_regs *regs)
-{
- instrumentation_begin();
- pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
- smp_processor_id());
- instrumentation_end();
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
-
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
irqentry_state_t irq_state;
@@ -2029,31 +1987,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
* Only required when from kernel mode. See
* mce_check_crashing_cpu() for details.
*/
- if (machine_check_vector == do_machine_check &&
- mce_check_crashing_cpu())
+ if (mca_cfg.initialized && mce_check_crashing_cpu())
return;
irq_state = irqentry_nmi_enter(regs);
- /*
- * The call targets are marked noinstr, but objtool can't figure
- * that out because it's an indirect call. Annotate it.
- */
- instrumentation_begin();
- machine_check_vector(regs);
+ do_machine_check(regs);
- instrumentation_end();
irqentry_nmi_exit(regs, irq_state);
}
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
irqentry_enter_from_user_mode(regs);
- instrumentation_begin();
- machine_check_vector(regs);
+ do_machine_check(regs);
- instrumentation_end();
irqentry_exit_to_user_mode(regs);
}
@@ -2120,7 +2069,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
}
- machine_check_vector = do_machine_check;
+ mca_cfg.initialized = 1;
__mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic();
@@ -2228,7 +2177,6 @@ int __init mcheck_init(void)
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
- mcheck_vendor_init_severity();
INIT_WORK(&mce_work, mce_gen_pool_process);
init_irq_work(&mce_irq_work, mce_irq_work_cb);
@@ -2253,7 +2201,7 @@ static void mce_disable_error_reporting(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(msr_ops.ctl(i), 0);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
}
return;
}
@@ -2605,7 +2553,7 @@ static void mce_reenable_cpu(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(msr_ops.ctl(i), b->ctl);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
}
}
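
For illustration only (not part of the patch): a stand-alone sketch of the bank-to-MSR mapping that mca_msr_reg() implements, with the MSR base addresses written out (legacy banks are 4 MSRs apart starting at 0x400, SMCA banks 16 MSRs apart starting at 0xc0002000); mca_msr_reg_demo() and main() are demo-only names.

#include <stdio.h>
#include <stdint.h>

enum mca_msr { MCA_CTL, MCA_STATUS, MCA_ADDR, MCA_MISC };

static uint32_t mca_msr_reg_demo(int bank, enum mca_msr reg, int smca)
{
	/* For both register files the CTL/STATUS/ADDR/MISC order matches
	 * the enum, so the enum value doubles as the offset in this demo
	 * (the kernel spells it out with a switch instead). */
	if (smca)
		return 0xc0002000 + 0x10 * bank + reg;
	return 0x400 + 4 * bank + reg;
}

int main(void)
{
	printf("legacy bank 1 STATUS: 0x%x\n", mca_msr_reg_demo(1, MCA_STATUS, 0));	/* 0x405 */
	printf("SMCA   bank 1 STATUS: 0x%x\n", mca_msr_reg_demo(1, MCA_STATUS, 1));	/* 0xc0002011 */
	return 0;
}
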
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 88dcc79cfb07..acd61c41846c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -8,9 +8,6 @@
#include <linux/device.h>
#include <asm/mce.h>
-/* Pointer to the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *);
-
enum severity_level {
MCE_NO_SEVERITY,
MCE_DEFERRED_SEVERITY,
@@ -38,8 +35,7 @@ int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
-extern int (*mce_severity)(struct mce *a, struct pt_regs *regs,
- int tolerant, char **msg, bool is_excp);
+int mce_severity(struct mce *a, struct pt_regs *regs, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
@@ -61,7 +57,7 @@ static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
-static inline bool intel_filter_mce(struct mce *m) { return false; };
+static inline bool intel_filter_mce(struct mce *m) { return false; }
#endif
void mce_timer_kick(unsigned long interval);
@@ -117,23 +113,25 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif
struct mca_config {
- bool dont_log_ce;
- bool cmci_disabled;
- bool ignore_ce;
- bool print_all;
-
__u64 lmce_disabled : 1,
disabled : 1,
ser : 1,
recovery : 1,
bios_cmci_threshold : 1,
- __reserved : 59;
+ /* Proper #MC exception handler is set */
+ initialized : 1,
+ __reserved : 58;
+
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+ bool print_all;
- s8 bootlog;
int tolerant;
int monarch_timeout;
int panic_timeout;
u32 rip_msr;
+ s8 bootlog;
};
extern struct mca_config mca_cfg;
@@ -163,19 +161,28 @@ struct mce_vendor_flags {
/* AMD-style error thresholding banks present. */
amd_threshold : 1,
- __reserved_0 : 60;
+ /* Pentium, family 5-style MCA */
+ p5 : 1,
+
+ /* Centaur Winchip C6-style MCA */
+ winchip : 1,
+
+ /* SandyBridge IFU quirk */
+ snb_ifu_quirk : 1,
+
+ __reserved_0 : 57;
};
extern struct mce_vendor_flags mce_flags;
-struct mca_msr_regs {
- u32 (*ctl) (int bank);
- u32 (*status) (int bank);
- u32 (*addr) (int bank);
- u32 (*misc) (int bank);
+enum mca_msr {
+ MCA_CTL,
+ MCA_STATUS,
+ MCA_ADDR,
+ MCA_MISC,
};
-extern struct mca_msr_regs msr_ops;
+u32 mca_msr_reg(int bank, enum mca_msr reg);
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
@@ -183,17 +190,21 @@ extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
#else
-static inline bool amd_filter_mce(struct mce *m) { return false; };
+static inline bool amd_filter_mce(struct mce *m) { return false; }
#endif
-__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr);
-
-__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr);
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+noinstr void pentium_machine_check(struct pt_regs *regs);
+noinstr void winchip_machine_check(struct pt_regs *regs);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void enable_p5_mce(void) {}
+static inline void pentium_machine_check(struct pt_regs *regs) {}
+static inline void winchip_machine_check(struct pt_regs *regs) {}
+#endif
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 19e90cae8e97..2272ad53fc33 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -21,7 +21,7 @@
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static noinstr void pentium_machine_check(struct pt_regs *regs)
+noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
@@ -54,10 +54,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_MCE))
return;
- machine_check_vector = pentium_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 17e631443116..bb019a594a2c 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -265,25 +265,25 @@ static bool is_copy_from_user(struct pt_regs *regs)
*/
static int error_context(struct mce *m, struct pt_regs *regs)
{
- enum handler_type t;
-
if ((m->cs & 3) == 3)
return IN_USER;
if (!mc_recoverable(m->mcgstatus))
return IN_KERNEL;
- t = ex_get_fault_handler_type(m->ip);
- if (t == EX_HANDLER_FAULT) {
- m->kflags |= MCE_IN_KERNEL_RECOV;
- return IN_KERNEL_RECOV;
- }
- if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) {
- m->kflags |= MCE_IN_KERNEL_RECOV;
+ switch (ex_get_fixup_type(m->ip)) {
+ case EX_TYPE_UACCESS:
+ case EX_TYPE_COPY:
+ if (!regs || !is_copy_from_user(regs))
+ return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
+ fallthrough;
+ case EX_TYPE_FAULT_MCE_SAFE:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
+ default:
+ return IN_KERNEL;
}
-
- return IN_KERNEL;
}
static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
@@ -407,15 +407,14 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
}
}
-/* Default to mce_severity_intel */
-int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) =
- mce_severity_intel;
-
-void __init mcheck_vendor_init_severity(void)
+int mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg,
+ bool is_excp)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
- mce_severity = mce_severity_amd;
+ return mce_severity_amd(m, regs, tolerant, msg, is_excp);
+ else
+ return mce_severity_intel(m, regs, tolerant, msg, is_excp);
}
#ifdef CONFIG_DEBUG_FS
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index 9c9f0abd2d7f..6c99f2941909 100644
--- a/arch/x86/kernel/cpu/mce/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -17,7 +17,7 @@
#include "internal.h"
/* Machine check handler for WinChip C6: */
-static noinstr void winchip_machine_check(struct pt_regs *regs)
+noinstr void winchip_machine_check(struct pt_regs *regs)
{
instrumentation_begin();
pr_emerg("CPU0: Machine Check Exception.\n");
@@ -30,10 +30,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3d4a48336084..8b2fcdfa6d31 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -456,17 +456,23 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p
static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
{
-#ifdef CONFIG_X86_64
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct firmware fw;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
if (family >= 0x15)
snprintf(fw_name, sizeof(fw_name),
"amd-ucode/microcode_amd_fam%.2xh.bin", family);
- return get_builtin_firmware(cp, fw_name);
-#else
+ if (firmware_request_builtin(&fw, fw_name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
return false;
-#endif
}
static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index efb69be41ab1..f955d25076ba 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -140,23 +140,6 @@ static bool __init check_loader_disabled_bsp(void)
return *res;
}
-extern struct builtin_fw __start_builtin_fw[];
-extern struct builtin_fw __end_builtin_fw[];
-
-bool get_builtin_firmware(struct cpio_data *cd, const char *name)
-{
- struct builtin_fw *b_fw;
-
- for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
- if (!strcmp(name, b_fw->name)) {
- cd->size = b_fw->size;
- cd->data = b_fw->data;
- return true;
- }
- }
- return false;
-}
-
void __init load_ucode_bsp(void)
{
unsigned int cpuid_1_eax;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 7e8e07bddd5f..d28a9f8f3fec 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -456,6 +456,7 @@ static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int s
static bool load_builtin_intel_microcode(struct cpio_data *cp)
{
unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
char name[30];
if (IS_ENABLED(CONFIG_X86_32))
@@ -466,7 +467,13 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp)
sprintf(name, "intel-ucode/%02x-%02x-%02x",
x86_family(eax), x86_model(eax), x86_stepping(eax));
- return get_builtin_firmware(cp, name);
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
+ return false;
}
/*
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e095c28d27ae..4794b716ec79 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -313,9 +313,14 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
+ static_branch_enable(&isolation_type_snp);
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 4b8813bafffd..bb1c3f5f60c8 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
rdt_domain_reconfigure_cdp(r);
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- kfree(d);
+ kfree(hw_dom);
return;
}
if (r->mon_capable && domain_setup_mon_state(r, d)) {
- kfree(d);
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom->mbps_val);
+ kfree(hw_dom);
return;
}
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 64511c4a5200..6a77a14eee38 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
- int ret;
-
/*
* Take a previously guest-owned EPC page and return it to the
* general EPC page pool.
@@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
* case that a guest properly EREMOVE'd this page, a superfluous
* EREMOVE is harmless.
*/
- ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+ return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret = sgx_vepc_remove_page(epc_page);
if (ret) {
/*
* Only SGX_CHILD_PRESENT is expected, which is because of
@@ -144,10 +147,44 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
}
sgx_free_epc_page(epc_page);
-
return 0;
}
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+ struct sgx_epc_page *entry;
+ unsigned long index;
+ long failures = 0;
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ int ret = sgx_vepc_remove_page(entry);
+ if (ret) {
+ if (ret == SGX_CHILD_PRESENT) {
+ /* The page is a SECS, userspace will retry. */
+ failures++;
+ } else {
+ /*
+ * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+ * WARN, as userspace can induce said failures by
+ * calling the ioctl concurrently on multiple vEPCs or
+ * while one or more CPUs are running the enclave. Only
+ * a #PF on EREMOVE indicates a kernel/hardware issue.
+ */
+ WARN_ON_ONCE(encls_faulted(ret) &&
+ ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+ return -EBUSY;
+ }
+ }
+ cond_resched();
+ }
+
+ /*
+ * Return the number of SECS pages that failed to be removed, so
+ * userspace knows that it has to retry.
+ */
+ return failures;
+}
+
static int sgx_vepc_release(struct inode *inode, struct file *file)
{
struct sgx_vepc *vepc = file->private_data;
@@ -233,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
return 0;
}
+static long sgx_vepc_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ switch (cmd) {
+ case SGX_IOC_VEPC_REMOVE_ALL:
+ if (arg)
+ return -EINVAL;
+ return sgx_vepc_remove_all(vepc);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
static const struct file_operations sgx_vepc_fops = {
.owner = THIS_MODULE,
.open = sgx_vepc_open,
+ .unlocked_ioctl = sgx_vepc_ioctl,
+ .compat_ioctl = sgx_vepc_ioctl,
.release = sgx_vepc_release,
.mmap = sgx_vepc_mmap,
};
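The comments above spell out the userspace contract for SGX_IOC_VEPC_REMOVE_ALL: a positive return value is the number of SECS pages that could not yet be removed (so the caller retries), while -EBUSY means another vEPC or a still-running enclave interfered. A hedged sketch of the retry loop, assuming the UAPI header that defines the ioctl (<asm/sgx.h>) and an already-open vEPC file descriptor:

#include <sys/ioctl.h>
#include <errno.h>
#include <asm/sgx.h>	/* SGX_IOC_VEPC_REMOVE_ALL (UAPI header, assumed available) */

/* Drain a vEPC fd before reusing it; returns 0 or a negative errno. */
static int vepc_drain(int vepc_fd)
{
	int left;

	do {
		/* Positive return: SECS pages still holding child pages. */
		left = ioctl(vepc_fd, SGX_IOC_VEPC_REMOVE_ALL, 0);
		if (left < 0)
			return -errno;
	} while (left > 0);

	return 0;
}

A real caller would likely bound the number of passes, since a SECS page can keep failing for as long as the enclave is still in use elsewhere.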
diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c
new file mode 100644
index 000000000000..e2685470ba94
--- /dev/null
+++ b/arch/x86/kernel/cpu/vortex.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * No special init required for Vortex processors.
+ */
+
+static const struct cpu_dev vortex_cpu_dev = {
+ .c_vendor = "Vortex",
+ .c_ident = { "Vortex86 SoC" },
+ .legacy_models = {
+ {
+ .family = 5,
+ .model_names = {
+ [2] = "Vortex86DX",
+ [8] = "Vortex86MX",
+ },
+ },
+ {
+ .family = 6,
+ .model_names = {
+ /*
+ * Both the Vortex86EX and the Vortex86EX2
+ * have the same family and model id.
+ *
+ * However, the -EX2 supports the product name
+ * CPUID call, so this name will only be used
+ * for the -EX, which does not.
+ */
+ [0] = "Vortex86EX",
+ },
+ },
+ },
+ .c_x86_vendor = X86_VENDOR_VORTEX,
+};
+
+cpu_dev_register(vortex_cpu_dev);
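The family-6 comment above relies on the legacy model names only being consulted when the CPU reports no product name. My reading of "product name CPUID call" is the processor brand-string leaves 0x80000002..0x80000004; a hedged userspace check of whether those leaves are implemented could look like:

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int regs[12];
	char brand[49] = { 0 };
	unsigned int i;

	/* Leaf 0x80000000 reports the highest extended leaf implemented. */
	if (__get_cpuid_max(0x80000000, NULL) < 0x80000004) {
		puts("no brand string; legacy model names would be used");
		return 0;
	}

	for (i = 0; i < 3; i++)
		__get_cpuid(0x80000002 + i, &regs[i * 4], &regs[i * 4 + 1],
			    &regs[i * 4 + 2], &regs[i * 4 + 3]);

	memcpy(brand, regs, sizeof(regs));
	printf("brand string: \"%s\"\n", brand);
	return 0;
}

A CPU that does report a brand string (such as the -EX2, per the comment) never falls back to the shared family-6 table entry.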
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045e82e8945b..a7f617a3981d 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,6 +10,7 @@
#include <linux/crash_dump.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/cc_platform.h>
static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf,
@@ -73,5 +74,6 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, sev_active());
+ return read_from_oldmem(buf, count, ppos, 0,
+ cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 6a4cb71c2498..5cd51f25f446 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE];
int __initdata of_ioapic;
-void __init early_init_dt_scan_chosen_arch(unsigned long node)
-{
- BUG();
-}
-
void __init early_init_dt_add_memory_arch(u64 base, u64 size)
{
BUG();
@@ -139,12 +134,11 @@ static void __init dtb_cpu_setup(void)
{
struct device_node *dn;
u32 apic_id, version;
- int ret;
version = GET_APIC_VERSION(apic_read(APIC_LVR));
for_each_of_cpu_node(dn) {
- ret = of_property_read_u32(dn, "reg", &apic_id);
- if (ret < 0) {
+ apic_id = of_get_cpu_hwid(dn, 0);
+ if (apic_id == ~0U) {
pr_warn("%pOF: missing local APIC ID\n", dn);
continue;
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5601b95944fa..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ if (type == STACK_TYPE_TASK)
+ return "TASK";
+
if (type == STACK_TYPE_IRQ)
return "IRQ";
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
if (type == STACK_TYPE_ENTRY) {
/*
* On 64-bit, we have a generic entry stack that we
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 38837dad46e6..391a4e2b8604 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -714,12 +714,6 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3e20,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3ec4,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x8a12,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{ PCI_VENDOR_ID_BROADCOM, 0x4331,
PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index 2954fab15e51..794e70151203 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,7 +2,7 @@
/*
* x86 FPU bug checks:
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
/*
* Boot time CPU/FPU FDIV bug detection code:
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
new file mode 100644
index 000000000000..958accf2ccf0
--- /dev/null
+++ b/arch/x86/kernel/fpu/context.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_CONTEXT_H
+#define __X86_KERNEL_FPU_CONTEXT_H
+
+#include <asm/fpu/xstate.h>
+#include <asm/trace/fpu.h>
+
+/* Functions related to FPU context tracking */
+
+/*
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
+ *
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task.
+ *
+ * Either one of these invalidation functions is enough. Invalidate
+ * a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
+ */
+static inline void __cpu_invalidate_fpregs_state(void)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
+{
+ fpu->last_cpu = -1;
+}
+
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
+{
+ return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ trace_x86_fpu_regs_deactivated(fpu);
+}
+
+static inline void fpregs_activate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+ trace_x86_fpu_regs_activated(fpu);
+}
+
+/* Internal helper for switch_fpu_return() and signal frame setup */
+static inline void fpregs_restore_userregs(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ int cpu = smp_processor_id();
+
+ if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+ return;
+
+ if (!fpregs_state_valid(fpu, cpu)) {
+ /*
+ * This restores _all_ xstate which has not been
+ * established yet.
+ *
+ * If PKRU is enabled, then the PKRU value is already
+ * correct because it was either set in switch_to() or in
+ * flush_thread(). So it is excluded because it might not
+ * be up to date in current->thread.fpu.xsave state.
+ *
+ * XFD state is handled in restore_fpregs_from_fpstate().
+ */
+ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);
+
+ fpregs_activate(fpu);
+ fpu->last_cpu = cpu;
+ }
+ clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+#endif
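The comment block at the top of this header describes the two-sided validity check (fpu_fpregs_owner_ctx on the CPU side, fpu->last_cpu on the task side) and the rule that whoever clobbers either side must invalidate it. A purely illustrative fragment of that pairing — not a real kernel path, it only shows which helper guards which resource:

/*
 * Illustrative fragment (not kernel code): 'task' is assumed to be a
 * stopped task whose in-memory FPU image is about to be modified.
 */
static void example_invalidate(struct task_struct *task)
{
	/* About to clobber this CPU's FPU registers for another purpose
	 * (preemption disabled): the CPU no longer holds any task's state. */
	__cpu_invalidate_fpregs_state();

	/* Just modified the task's in-memory FPU image: any register
	 * contents some CPU may still cache for it are now stale. */
	__fpu_invalidate_fpregs_state(&task->thread.fpu);

	/* Either step makes fpregs_state_valid(fpu, cpu) return false,
	 * forcing the next restore to reload from fpu->fpstate. */
}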
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 7ada7bd03a32..8ea306b1bf8e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -6,8 +6,9 @@
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/sched.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
@@ -15,15 +16,30 @@
#include <linux/hardirq.h>
#include <linux/pkeys.h>
+#include <linux/vmalloc.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+DEFINE_PER_CPU(u64, xfd_state);
+#endif
+
+/* The FPU state configuration data for kernel and user space */
+struct fpu_state_config fpu_kernel_cfg __ro_after_init;
+struct fpu_state_config fpu_user_cfg __ro_after_init;
+
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
* depending on the FPU hardware format:
*/
-union fpregs_state init_fpstate __ro_after_init;
+struct fpstate init_fpstate __ro_after_init;
/*
* Track whether the kernel is using the FPU state
@@ -83,7 +99,7 @@ bool irq_fpu_usable(void)
EXPORT_SYMBOL(irq_fpu_usable);
/*
- * Save the FPU register state in fpu->state. The register state is
+ * Save the FPU register state in fpu->fpstate->regs. The register state is
* preserved.
*
* Must be called with fpregs_lock() held.
@@ -99,19 +115,19 @@ EXPORT_SYMBOL(irq_fpu_usable);
void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
- os_xsave(&fpu->state.xsave);
+ os_xsave(fpu->fpstate);
/*
* AVX512 state is tracked here because its use is
* known to slow the max clock speed of the core.
*/
- if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
+ if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
fpu->avx512_timestamp = jiffies;
return;
}
if (likely(use_fxsr())) {
- fxsave(&fpu->state.fxsave);
+ fxsave(&fpu->fpstate->regs.fxsave);
return;
}
@@ -119,12 +135,11 @@ void save_fpregs_to_fpstate(struct fpu *fpu)
* Legacy FPU register saving, FNSAVE always clears FPU registers,
* so we have to reload them from the memory state.
*/
- asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
- frstor(&fpu->state.fsave);
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
+ frstor(&fpu->fpstate->regs.fsave);
}
-EXPORT_SYMBOL(save_fpregs_to_fpstate);
-void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
+void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
{
/*
* AMD K7/K8 and later CPUs up to Zen don't save/restore
@@ -141,15 +156,181 @@ void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
}
if (use_xsave()) {
- os_xrstor(&fpstate->xsave, mask);
+ /*
+ * Dynamically enabled features are enabled in XCR0, but
+ * usage also requires that the corresponding bits in XFD
+ * are cleared. If the bits are set then using a related
+ * instruction will raise #NM. This allows the larger FPU
+ * buffer to be allocated lazily from #NM, or the task to be
+ * killed if it has no permission, which would otherwise
+ * happen via #UD if the feature were disabled in XCR0.
+ *
+ * XFD state follows the same lifetime rules as XSTATE; to
+ * restore state correctly, XFD has to be updated before
+ * XRSTORS, otherwise the component would stay in or go into
+ * init state even if the bits are set in
+ * fpstate::regs::xsave::xfeatures.
+ */
+ xfd_update_state(fpstate);
+
+ /*
+ * Restoring state always needs to modify all features
+ * which are in @mask even if the current task cannot use
+ * extended features.
+ *
+ * So fpstate->xfeatures cannot be used here, because then
+ * a feature for which the task has no permission but was
+ * used by the previous task would not go into init state.
+ */
+ mask = fpu_kernel_cfg.max_features & mask;
+
+ os_xrstor(fpstate, mask);
} else {
if (use_fxsr())
- fxrstor(&fpstate->fxsave);
+ fxrstor(&fpstate->regs.fxsave);
else
- frstor(&fpstate->fsave);
+ frstor(&fpstate->regs.fsave);
}
}
-EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate);
+
+void fpu_reset_from_exception_fixup(void)
+{
+ restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+static void __fpstate_reset(struct fpstate *fpstate);
+
+bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fpstate;
+ unsigned int size;
+
+ size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ fpstate = vzalloc(size);
+ if (!fpstate)
+ return false;
+
+ __fpstate_reset(fpstate);
+ fpstate_init_user(fpstate);
+ fpstate->is_valloc = true;
+ fpstate->is_guest = true;
+
+ gfpu->fpstate = fpstate;
+ return true;
+}
+EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
+
+void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fps = gfpu->fpstate;
+
+ if (!fps)
+ return;
+
+ if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use))
+ return;
+
+ gfpu->fpstate = NULL;
+ vfree(fps);
+}
+EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
+
+int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
+{
+ struct fpstate *guest_fps = guest_fpu->fpstate;
+ struct fpu *fpu = &current->thread.fpu;
+ struct fpstate *cur_fps = fpu->fpstate;
+
+ fpregs_lock();
+ if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
+
+ /* Swap fpstate */
+ if (enter_guest) {
+ fpu->__task_fpstate = cur_fps;
+ fpu->fpstate = guest_fps;
+ guest_fps->in_use = true;
+ } else {
+ guest_fps->in_use = false;
+ fpu->fpstate = fpu->__task_fpstate;
+ fpu->__task_fpstate = NULL;
+ }
+
+ cur_fps = fpu->fpstate;
+
+ if (!cur_fps->is_confidential) {
+ /* Includes XFD update */
+ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
+ } else {
+ /*
+ * XSTATE is restored by firmware from encrypted
+ * memory. Make sure XFD state is correct while
+ * running with the guest fpstate.
+ */
+ xfd_update_state(cur_fps);
+ }
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
+
+void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
+ unsigned int size, u32 pkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ union fpregs_state *ustate = buf;
+ struct membuf mb = { .p = buf, .left = size };
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE);
+ } else {
+ memcpy(&ustate->fxsave, &kstate->regs.fxsave,
+ sizeof(ustate->fxsave));
+ /* Make it restorable on a XSAVE enabled host */
+ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
+ }
+}
+EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);
+
+int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
+ u64 xcr0, u32 *vpkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ const union fpregs_state *ustate = buf;
+ struct pkru_state *xpkru;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
+ return -EINVAL;
+ if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave));
+ return 0;
+ }
+
+ if (ustate->xsave.header.xfeatures & ~xcr0)
+ return -EINVAL;
+
+ ret = copy_uabi_from_kernel_to_xstate(kstate, ustate);
+ if (ret)
+ return ret;
+
+ /* Retrieve PKRU if not in init state */
+ if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
+ xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
+ *vpkru = xpkru->pkru;
+ }
+
+ /* Ensure that XCOMP_BV is set up for XSAVES */
+ xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
+#endif /* CONFIG_KVM */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
@@ -203,52 +384,88 @@ void fpu_sync_fpstate(struct fpu *fpu)
fpregs_unlock();
}
-static inline void fpstate_init_xstate(struct xregs_state *xsave)
+static inline unsigned int init_fpstate_copy_size(void)
{
- /*
- * XRSTORS requires these bits set in xcomp_bv, or it will
- * trigger #GP:
- */
- xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
+ if (!use_xsave())
+ return fpu_kernel_cfg.default_size;
+
+ /* XSAVE(S) just needs the legacy and the xstate header part */
+ return sizeof(init_fpstate.regs.xsave);
}
-static inline void fpstate_init_fxstate(struct fxregs_state *fx)
+static inline void fpstate_init_fxstate(struct fpstate *fpstate)
{
- fx->cwd = 0x37f;
- fx->mxcsr = MXCSR_DEFAULT;
+ fpstate->regs.fxsave.cwd = 0x37f;
+ fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
}
/*
* Legacy x87 fpstate state init:
*/
-static inline void fpstate_init_fstate(struct fregs_state *fp)
+static inline void fpstate_init_fstate(struct fpstate *fpstate)
{
- fp->cwd = 0xffff037fu;
- fp->swd = 0xffff0000u;
- fp->twd = 0xffffffffu;
- fp->fos = 0xffff0000u;
+ fpstate->regs.fsave.cwd = 0xffff037fu;
+ fpstate->regs.fsave.swd = 0xffff0000u;
+ fpstate->regs.fsave.twd = 0xffffffffu;
+ fpstate->regs.fsave.fos = 0xffff0000u;
}
-void fpstate_init(union fpregs_state *state)
+/*
+ * Used in two places:
+ * 1) Early boot, to set up init_fpstate for non-XSAVE systems
+ * 2) fpu_init_fpstate_user(), which is invoked from KVM
+ */
+void fpstate_init_user(struct fpstate *fpstate)
{
- if (!static_cpu_has(X86_FEATURE_FPU)) {
- fpstate_init_soft(&state->soft);
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpstate_init_soft(&fpstate->regs.soft);
return;
}
- memset(state, 0, fpu_kernel_xstate_size);
+ xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);
- if (static_cpu_has(X86_FEATURE_XSAVES))
- fpstate_init_xstate(&state->xsave);
- if (static_cpu_has(X86_FEATURE_FXSR))
- fpstate_init_fxstate(&state->fxsave);
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ fpstate_init_fxstate(fpstate);
else
- fpstate_init_fstate(&state->fsave);
+ fpstate_init_fstate(fpstate);
+}
+
+static void __fpstate_reset(struct fpstate *fpstate)
+{
+ /* Initialize sizes and feature masks */
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->user_size = fpu_user_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->user_xfeatures = fpu_user_cfg.default_features;
+ fpstate->xfd = init_fpstate.xfd;
+}
+
+void fpstate_reset(struct fpu *fpu)
+{
+ /* Set the fpstate pointer to the default fpstate */
+ fpu->fpstate = &fpu->__fpstate;
+ __fpstate_reset(fpu->fpstate);
+
+ /* Initialize the permission related info in fpu */
+ fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
+ fpu->perm.__state_size = fpu_kernel_cfg.default_size;
+ fpu->perm.__user_state_size = fpu_user_cfg.default_size;
+}
+
+static inline void fpu_inherit_perms(struct fpu *dst_fpu)
+{
+ if (fpu_state_size_dynamic()) {
+ struct fpu *src_fpu = &current->group_leader->thread.fpu;
+
+ spin_lock_irq(&current->sighand->siglock);
+ /* Fork also inherits the permissions of the parent */
+ dst_fpu->perm = src_fpu->perm;
+ spin_unlock_irq(&current->sighand->siglock);
+ }
}
-EXPORT_SYMBOL_GPL(fpstate_init);
/* Clone current's FPU state on fork */
-int fpu_clone(struct task_struct *dst)
+int fpu_clone(struct task_struct *dst, unsigned long clone_flags)
{
struct fpu *src_fpu = &current->thread.fpu;
struct fpu *dst_fpu = &dst->thread.fpu;
@@ -256,30 +473,51 @@ int fpu_clone(struct task_struct *dst)
/* The new task's FPU state cannot be valid in the hardware. */
dst_fpu->last_cpu = -1;
+ fpstate_reset(dst_fpu);
+
if (!cpu_feature_enabled(X86_FEATURE_FPU))
return 0;
/*
- * Don't let 'init optimized' areas of the XSAVE area
- * leak into the child task:
+ * Enforce reload for user space tasks and prevent kernel threads
+ * from trying to save the FPU registers on context switch.
+ */
+ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
+
+ /*
+ * No FPU state inheritance for kernel threads and IO
+ * worker threads.
+ */
+ if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) {
+ /* Clear out the minimal state */
+ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
+ init_fpstate_copy_size());
+ return 0;
+ }
+
+ /*
+ * If a new feature is added, ensure all dynamic features are
+ * caller-saved from here!
*/
- memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
/*
- * If the FPU registers are not owned by current just memcpy() the
- * state. Otherwise save the FPU registers directly into the
- * child's FPU context, without any memory-to-memory copying.
+ * Save the default portion of the current FPU state into the
+ * clone. Assume all dynamic features to be defined as caller-
+ * saved, which enables skipping both the expansion of fpstate
+ * and the copying of any dynamic state.
+ *
+ * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
+ * copying is not valid when current uses non-default states.
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
- memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);
-
- else
- save_fpregs_to_fpstate(dst_fpu);
+ fpregs_restore_userregs();
+ save_fpregs_to_fpstate(dst_fpu);
+ if (!(clone_flags & CLONE_THREAD))
+ fpu_inherit_perms(dst_fpu);
fpregs_unlock();
- set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
-
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
@@ -287,6 +525,16 @@ int fpu_clone(struct task_struct *dst)
}
/*
+ * Whitelist the FPU register state embedded into task_struct for hardened
+ * usercopy.
+ */
+void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+ *offset = offsetof(struct thread_struct, fpu.__fpstate.regs);
+ *size = fpu_kernel_cfg.default_size;
+}
+
+/*
* Drops current FPU state: deactivates the fpregs and
* the fpstate. NOTE: it still leaves previous contents
* in the fpregs in the eager-FPU case.
@@ -319,28 +567,19 @@ void fpu__drop(struct fpu *fpu)
static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
if (use_xsave())
- os_xrstor(&init_fpstate.xsave, features_mask);
+ os_xrstor(&init_fpstate, features_mask);
else if (use_fxsr())
- fxrstor(&init_fpstate.fxsave);
+ fxrstor(&init_fpstate.regs.fxsave);
else
- frstor(&init_fpstate.fsave);
+ frstor(&init_fpstate.regs.fsave);
pkru_write_default();
}
-static inline unsigned int init_fpstate_copy_size(void)
-{
- if (!use_xsave())
- return fpu_kernel_xstate_size;
-
- /* XSAVE(S) just needs the legacy and the xstate header part */
- return sizeof(init_fpstate.xsave);
-}
-
/*
* Reset current->fpu memory state to the init values.
*/
-static void fpu_reset_fpstate(void)
+static void fpu_reset_fpregs(void)
{
struct fpu *fpu = &current->thread.fpu;
@@ -359,7 +598,7 @@ static void fpu_reset_fpstate(void)
* user space as PKRU is eagerly written in switch_to() and
* flush_thread().
*/
- memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size());
+ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
set_thread_flag(TIF_NEED_FPU_LOAD);
fpregs_unlock();
}
@@ -375,7 +614,7 @@ void fpu__clear_user_states(struct fpu *fpu)
fpregs_lock();
if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
- fpu_reset_fpstate();
+ fpu_reset_fpregs();
fpregs_unlock();
return;
}
@@ -385,12 +624,11 @@ void fpu__clear_user_states(struct fpu *fpu)
* corresponding registers.
*/
if (xfeatures_mask_supervisor() &&
- !fpregs_state_valid(fpu, smp_processor_id())) {
- os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
- }
+ !fpregs_state_valid(fpu, smp_processor_id()))
+ os_xrstor_supervisor(fpu->fpstate);
/* Reset user states in registers. */
- restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user());
+ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
/*
* Now all FPU registers have their desired values. Inform the FPU
@@ -405,7 +643,8 @@ void fpu__clear_user_states(struct fpu *fpu)
void fpu_flush_thread(void)
{
- fpu_reset_fpstate();
+ fpstate_reset(&current->thread.fpu);
+ fpu_reset_fpregs();
}
/*
* Load FPU context before returning to userspace.
@@ -445,7 +684,6 @@ void fpregs_mark_activate(void)
fpu->last_cpu = smp_processor_id();
clear_thread_flag(TIF_NEED_FPU_LOAD);
}
-EXPORT_SYMBOL_GPL(fpregs_mark_activate);
/*
* x87 math exception handling:
@@ -468,11 +706,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
* fully reproduce the context of the exception.
*/
if (boot_cpu_has(X86_FEATURE_FXSR)) {
- cwd = fpu->state.fxsave.cwd;
- swd = fpu->state.fxsave.swd;
+ cwd = fpu->fpstate->regs.fxsave.cwd;
+ swd = fpu->fpstate->regs.fxsave.swd;
} else {
- cwd = (unsigned short)fpu->state.fsave.cwd;
- swd = (unsigned short)fpu->state.fsave.swd;
+ cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
+ swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
}
err = swd & ~cwd;
@@ -486,7 +724,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
unsigned short mxcsr = MXCSR_DEFAULT;
if (boot_cpu_has(X86_FEATURE_XMM))
- mxcsr = fpu->state.fxsave.mxcsr;
+ mxcsr = fpu->fpstate->regs.fxsave.mxcsr;
err = ~(mxcsr >> 7) & mxcsr;
}
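fpu_swap_kvm_fpstate(), added above, is symmetric: enter_guest=true parks the host fpstate in fpu->__task_fpstate and installs the guest one, enter_guest=false undoes it, and both directions either restore registers or at least re-sync XFD. A hedged sketch of how a hypervisor-style caller might bracket guest execution with it (the run function and its shape are assumptions, not KVM's actual code):

/* Illustrative bracket around guest execution. */
static int run_vcpu_fpu(struct fpu_guest *gfpu)
{
	int ret;

	/* Install the guest fpstate; the host state is parked by
	 * fpu_swap_kvm_fpstate() for the return path. */
	ret = fpu_swap_kvm_fpstate(gfpu, true);
	if (ret)
		return ret;

	/* ... VM entry / guest execution / VM exit ... */

	/* Restore the host fpstate (and XFD) before returning to it. */
	return fpu_swap_kvm_fpstate(gfpu, false);
}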
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 64e29927cc32..621f4b6cac4a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -2,7 +2,7 @@
/*
* x86 FPU boot time init code:
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -10,6 +10,10 @@
#include <linux/sched/task.h>
#include <linux/init.h>
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
/*
* Initialize the registers found in all CPUs, CR0 and CR4:
*/
@@ -34,7 +38,7 @@ static void fpu__init_cpu_generic(void)
/* Flush out any pending x87 state: */
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU))
- fpstate_init_soft(&current->thread.fpu.state.soft);
+ fpstate_init_soft(&current->thread.fpu.fpstate->regs.soft);
else
#endif
asm volatile ("fninit");
@@ -121,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void)
static void __init fpu__init_system_generic(void)
{
/*
- * Set up the legacy init FPU context. (xstate init might overwrite this
- * with a more modern format, if the CPU supports it.)
+ * Set up the legacy init FPU context. Will be updated when the
+ * CPU supports XSAVE[S].
*/
- fpstate_init(&init_fpstate);
+ fpstate_init_user(&init_fpstate);
fpu__init_system_mxcsr();
}
-/*
- * Size of the FPU context state. All tasks in the system use the
- * same context size, regardless of what portion they use.
- * This is inherent to the XSAVE architecture which puts all state
- * components into a single, continuous memory block:
- */
-unsigned int fpu_kernel_xstate_size __ro_after_init;
-EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
-
/* Get alignment of the TYPE. */
#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
@@ -162,13 +157,13 @@ static void __init fpu__init_task_struct_size(void)
* Subtract off the static size of the register state.
* It potentially has a bunch of padding.
*/
- task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+ task_size -= sizeof(current->thread.fpu.__fpstate.regs);
/*
* Add back the dynamically-calculated register state
* size.
*/
- task_size += fpu_kernel_xstate_size;
+ task_size += fpu_kernel_cfg.default_size;
/*
* We dynamically size 'struct fpu', so we require that
@@ -177,7 +172,7 @@ static void __init fpu__init_task_struct_size(void)
* you hit a compile error here, check the structure to
* see if something got added to the end.
*/
- CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate);
CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
@@ -192,37 +187,34 @@ static void __init fpu__init_task_struct_size(void)
*/
static void __init fpu__init_system_xstate_size_legacy(void)
{
- static int on_boot_cpu __initdata = 1;
-
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
+ unsigned int size;
/*
- * Note that xstate sizes might be overwritten later during
- * fpu__init_system_xstate().
+ * Note that the size configuration might be overwritten later
+ * during fpu__init_system_xstate().
*/
-
- if (!boot_cpu_has(X86_FEATURE_FPU)) {
- fpu_kernel_xstate_size = sizeof(struct swregs_state);
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ size = sizeof(struct swregs_state);
+ } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ size = sizeof(struct fxregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE;
} else {
- if (boot_cpu_has(X86_FEATURE_FXSR))
- fpu_kernel_xstate_size =
- sizeof(struct fxregs_state);
- else
- fpu_kernel_xstate_size =
- sizeof(struct fregs_state);
+ size = sizeof(struct fregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FP;
}
- fpu_user_xstate_size = fpu_kernel_xstate_size;
+ fpu_kernel_cfg.max_size = size;
+ fpu_kernel_cfg.default_size = size;
+ fpu_user_cfg.max_size = size;
+ fpu_user_cfg.default_size = size;
+ fpstate_reset(&current->thread.fpu);
}
-/* Legacy code to initialize eager fpu mode. */
-static void __init fpu__init_system_ctx_switch(void)
+static void __init fpu__init_init_fpstate(void)
{
- static bool on_boot_cpu __initdata = 1;
-
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
+ /* Bring init_fpstate size and features up to date */
+ init_fpstate.size = fpu_kernel_cfg.max_size;
+ init_fpstate.xfeatures = fpu_kernel_cfg.max_features;
}
/*
@@ -231,6 +223,7 @@ static void __init fpu__init_system_ctx_switch(void)
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
+ fpstate_reset(&current->thread.fpu);
fpu__init_system_early_generic(c);
/*
@@ -241,8 +234,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
- fpu__init_system_xstate();
+ fpu__init_system_xstate(fpu_kernel_cfg.max_size);
fpu__init_task_struct_size();
-
- fpu__init_system_ctx_switch();
+ fpu__init_init_fpstate();
}
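fpu__init_task_struct_size(), adjusted above, sizes task_struct by stripping the statically declared register area and adding back the runtime default size, which is why struct fpu must stay the last member of thread_struct. A worked sketch of that arithmetic with placeholder numbers (all three values are assumptions for the example, not measured sizes):

#include <stdio.h>

int main(void)
{
	/* Placeholder values for the example only. */
	unsigned long task_struct_static = 9600; /* sizeof(struct task_struct) */
	unsigned long static_regs        = 4096; /* sizeof(fpu.__fpstate.regs) */
	unsigned long default_size       = 2696; /* fpu_kernel_cfg.default_size */

	/* Same shape as fpu__init_task_struct_size(): drop the statically
	 * declared (and padded) register area, add the runtime size back. */
	unsigned long arch_task_struct_size =
		task_struct_static - static_regs + default_size;

	printf("arch_task_struct_size = %lu\n", arch_task_struct_size);
	return 0;
}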
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
new file mode 100644
index 000000000000..dbdb31f55fc7
--- /dev/null
+++ b/arch/x86/kernel/fpu/internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_INTERNAL_H
+#define __X86_KERNEL_FPU_INTERNAL_H
+
+extern struct fpstate init_fpstate;
+
+/* CPU feature check wrappers */
+static __always_inline __pure bool use_xsave(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_FXSR);
+}
+
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+#endif
+
+/* Used in init.c */
+extern void fpstate_init_user(struct fpstate *fpstate);
+extern void fpstate_reset(struct fpu *fpu);
+
+#endif
diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h
new file mode 100644
index 000000000000..17c26b164c63
--- /dev/null
+++ b/arch/x86/kernel/fpu/legacy.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_LEGACY_H
+#define __X86_KERNEL_FPU_LEGACY_H
+
+#include <asm/fpu/types.h>
+
+extern unsigned int mxcsr_feature_mask;
+
+static inline void ldmxcsr(u32 mxcsr)
+{
+ asm volatile("ldmxcsr %0" :: "m" (mxcsr));
+}
+
+/*
+ * Returns 0 on success or the trap number when the operation raises an
+ * exception.
+ */
+#define user_insn(insn, output, input...) \
+({ \
+ int err; \
+ \
+ might_fault(); \
+ \
+ asm volatile(ASM_STAC "\n" \
+ "1: " #insn "\n" \
+ "2: " ASM_CLAC "\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn_err(insn, output, input...) \
+({ \
+ int err; \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: movl $-1,%[err]\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [err] "=r" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn(insn, output, input...) \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \
+ : output : input)
+
+static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+ else
+ return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+}
+
+static inline void fxrstor(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_safe(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void frstor(struct fregs_state *fx)
+{
+ kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_safe(struct fregs_state *fx)
+{
+ return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
+#endif
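user_insn() above encodes failure as the trap number rather than a generic -EFAULT, which is what lets callers distinguish a page fault (worth retrying after faulting the memory in) from anything else. A hedged fragment of that caller pattern, mirroring the signal-restore path reworked later in this diff (X86_TRAP_PF comes from <asm/trapnr.h>; this is an illustration, not the actual caller):

/* Illustrative consumer of the trap-number convention. */
static bool restore_fx_from_user(struct fxregs_state __user *buf, size_t size)
{
	int ret;

retry:
	pagefault_disable();
	ret = fxrstor_from_user_sigframe(buf);	/* 0 or a trap number */
	pagefault_enable();

	if (!ret)
		return true;

	/* Only a page fault is worth retrying; #GP etc. are fatal. */
	if (ret != X86_TRAP_PF)
		return false;

	if (fault_in_readable((const char __user *)buf, size))
		return false;
	goto retry;
}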
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 66ed317ebc0d..437d7c930c0b 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -5,10 +5,14 @@
#include <linux/sched/task_stack.h>
#include <linux/vmalloc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
-#include <asm/fpu/xstate.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
@@ -74,8 +78,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
sync_fpstate(fpu);
if (!use_xsave()) {
- return membuf_write(&to, &fpu->state.fxsave,
- sizeof(fpu->state.fxsave));
+ return membuf_write(&to, &fpu->fpstate->regs.fxsave,
+ sizeof(fpu->fpstate->regs.fxsave));
}
copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX);
@@ -110,15 +114,15 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
fpu_force_restore(fpu);
/* Copy the state */
- memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate));
+ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate));
/* Clear xmm8..15 */
- BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16);
- memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16);
+ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16);
+ memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16);
/* Mark FP and SSE as in use when XSAVE is enabled */
if (use_xsave())
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
return 0;
}
@@ -149,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
/*
* A whole standard-format XSAVE buffer is needed:
*/
- if (pos != 0 || count != fpu_user_xstate_size)
+ if (pos != 0 || count != fpu_user_cfg.max_size)
return -EFAULT;
if (!kbuf) {
@@ -164,7 +168,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
}
fpu_force_restore(fpu);
- ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf);
out:
vfree(tmpbuf);
@@ -283,7 +287,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env,
void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
- __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave);
+ __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave);
}
void convert_to_fxsr(struct fxregs_state *fxsave,
@@ -326,7 +330,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
return fpregs_soft_get(target, regset, to);
if (!cpu_feature_enabled(X86_FEATURE_FXSR)) {
- return membuf_write(&to, &fpu->state.fsave,
+ return membuf_write(&to, &fpu->fpstate->regs.fsave,
sizeof(struct fregs_state));
}
@@ -337,7 +341,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP);
fx = &fxsave;
} else {
- fx = &fpu->state.fxsave;
+ fx = &fpu->fpstate->regs.fxsave;
}
__convert_from_fxsr(&env, target, fx);
@@ -366,16 +370,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
fpu_force_restore(fpu);
if (cpu_feature_enabled(X86_FEATURE_FXSR))
- convert_to_fxsr(&fpu->state.fxsave, &env);
+ convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env);
else
- memcpy(&fpu->state.fsave, &env, sizeof(env));
+ memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env));
/*
* Update the header bit in the xsave header, indicating the
* presence of FP.
*/
if (cpu_feature_enabled(X86_FEATURE_XSAVE))
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP;
return 0;
}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 445c57c9c539..d5958278eba6 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -7,23 +7,25 @@
#include <linux/cpu.h>
#include <linux/pagemap.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>
#include <asm/sigframe.h>
+#include <asm/trapnr.h>
#include <asm/trace/fpu.h>
-static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init;
-static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init;
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
-static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
- struct _fpx_sw_bytes *fx_sw)
+static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct fxregs_state) +
sizeof(struct xstate_header);
@@ -31,12 +33,12 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
unsigned int magic2;
if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
- return -EFAULT;
+ return false;
/* Check for the first magic field and other error scenarios. */
if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
fx_sw->xstate_size < min_xstate_size ||
- fx_sw->xstate_size > fpu_user_xstate_size ||
+ fx_sw->xstate_size > current->thread.fpu.fpstate->user_size ||
fx_sw->xstate_size > fx_sw->extended_size)
goto setfx;
@@ -47,10 +49,10 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
* in the memory layout.
*/
if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)))
- return -EFAULT;
+ return false;
if (likely(magic2 == FP_XSTATE_MAGIC2))
- return 0;
+ return true;
setfx:
trace_x86_fpu_xstate_check_failed(&current->thread.fpu);
@@ -58,22 +60,22 @@ setfx:
fx_sw->magic1 = 0;
fx_sw->xstate_size = sizeof(struct fxregs_state);
fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
- return 0;
+ return true;
}
/*
* Signal frame handlers.
*/
-static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
{
if (use_fxsr()) {
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave;
struct user_i387_ia32_struct env;
struct _fpstate_32 __user *fp = buf;
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- fxsave(&tsk->thread.fpu.state.fxsave);
+ fxsave(&tsk->thread.fpu.fpstate->regs.fxsave);
fpregs_unlock();
convert_from_fxsr(&env, tsk);
@@ -81,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
if (__copy_to_user(buf, &env, sizeof(env)) ||
__put_user(xsave->i387.swd, &fp->status) ||
__put_user(X86_FXSR_MAGIC, &fp->magic))
- return -1;
+ return false;
} else {
struct fregs_state __user *fp = buf;
u32 swd;
+
if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
- return -1;
+ return false;
}
- return 0;
+ return true;
+}
+
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ * This is saved whenever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame,
+ struct fpstate *fpstate)
+{
+ sw_bytes->magic1 = FP_XSTATE_MAGIC1;
+ sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE;
+ sw_bytes->xfeatures = fpstate->user_xfeatures;
+ sw_bytes->xstate_size = fpstate->user_size;
+
+ if (ia32_frame)
+ sw_bytes->extended_size += sizeof(struct fregs_state);
}
-static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
+ struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
- struct _fpx_sw_bytes *sw_bytes;
+ struct _fpx_sw_bytes sw_bytes;
u32 xfeatures;
int err;
/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
- sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
- err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+ save_sw_bytes(&sw_bytes, ia32_frame, fpstate);
+ err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes));
if (!use_xsave())
- return err;
+ return !err;
err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *)(buf + fpu_user_xstate_size));
+ (__u32 __user *)(buf + fpstate->user_size));
/*
* Read the xfeatures which we copied (directly from the cpu or
@@ -130,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
- return err;
+ return !err;
}
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
{
- int err;
-
if (use_xsave())
- err = xsave_to_user_sigframe(buf);
- else if (use_fxsr())
- err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
+ return xsave_to_user_sigframe(buf);
+ if (use_fxsr())
+ return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
- err = fnsave_to_user_sigframe((struct fregs_state __user *) buf);
-
- if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size))
- err = -EFAULT;
- return err;
+ return fnsave_to_user_sigframe((struct fregs_state __user *) buf);
}
/*
@@ -159,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* buf == buf_fx for 64-bit frames and 32-bit fsave frame.
* buf != buf_fx for 32-bit frames with fxstate.
*
- * Try to save it directly to the user frame with disabled page fault handler.
- * If this fails then do the slow path where the FPU state is first saved to
- * task's fpu->state and then copy it to the user frame pointed to by the
- * aligned pointer 'buf_fx'.
+ * Save it directly to the user frame with the page fault handler disabled. If
+ * that faults, clear the frame (which handles the fault) and retry.
*
* If this is a 32-bit frame with fxstate, put a fsave header before
* the aligned state at 'buf_fx'.
@@ -170,10 +185,11 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
-int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
struct task_struct *tsk = current;
- int ia32_fxstate = (buf != buf_fx);
+ struct fpstate *fpstate = tsk->thread.fpu.fpstate;
+ bool ia32_fxstate = (buf != buf_fx);
int ret;
ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
@@ -181,13 +197,25 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
if (!static_cpu_has(X86_FEATURE_FPU)) {
struct user_i387_ia32_struct fp;
+
fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
.left = sizeof(fp)});
- return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0;
+ return !copy_to_user(buf, &fp, sizeof(fp));
}
if (!access_ok(buf, size))
- return -EACCES;
+ return false;
+
+ if (use_xsave()) {
+ struct xregs_state __user *xbuf = buf_fx;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ if (__clear_user(&xbuf->header, sizeof(xbuf->header)))
+ return false;
+ }
retry:
/*
* Load the FPU registers if they are not valid for the current task.
@@ -205,26 +233,26 @@ retry:
fpregs_unlock();
if (ret) {
- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
+ if (!__clear_user(buf_fx, fpstate->user_size))
goto retry;
- return -EFAULT;
+ return false;
}
/* Save the fsave header for the 32-bit frames. */
- if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
- return -1;
+ if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf))
+ return false;
- if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
- return -1;
+ if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate))
+ return false;
- return 0;
+ return true;
}
-static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
- bool fx_only)
+static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
+ u64 xrestore, bool fx_only)
{
if (use_xsave()) {
- u64 init_bv = xfeatures_mask_uabi() & ~xrestore;
+ u64 init_bv = ufeatures & ~xrestore;
int ret;
if (likely(!fx_only))
@@ -233,7 +261,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
ret = fxrstor_from_user_sigframe(buf);
if (!ret && unlikely(init_bv))
- os_xrstor(&init_fpstate.xsave, init_bv);
+ os_xrstor(&init_fpstate, init_bv);
return ret;
} else if (use_fxsr()) {
return fxrstor_from_user_sigframe(buf);
@@ -246,16 +274,19 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
* Attempt to restore the FPU registers directly from user memory.
* Pagefaults are handled and any errors returned are fatal.
*/
-static int restore_fpregs_from_user(void __user *buf, u64 xrestore,
- bool fx_only, unsigned int size)
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore,
+ bool fx_only, unsigned int size)
{
struct fpu *fpu = &current->thread.fpu;
int ret;
retry:
fpregs_lock();
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpu->fpstate);
pagefault_disable();
- ret = __restore_fpregs_from_user(buf, xrestore, fx_only);
+ ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures,
+ xrestore, fx_only);
pagefault_enable();
if (unlikely(ret)) {
@@ -275,13 +306,12 @@ retry:
fpregs_unlock();
/* Try to handle #PF, but anything else is fatal. */
- if (ret != -EFAULT)
- return -EINVAL;
+ if (ret != X86_TRAP_PF)
+ return false;
- ret = fault_in_pages_readable(buf, size);
- if (!ret)
+ if (!fault_in_readable(buf, size))
goto retry;
- return ret;
+ return false;
}
/*
@@ -294,45 +324,40 @@ retry:
* been restored from a user buffer directly.
*/
if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
- os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
+ os_xrstor_supervisor(fpu->fpstate);
fpregs_mark_activate();
fpregs_unlock();
- return 0;
+ return true;
}
-static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
- bool ia32_fxstate)
+static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
+ bool ia32_fxstate)
{
- int state_size = fpu_kernel_xstate_size;
struct task_struct *tsk = current;
struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
+ bool success, fx_only = false;
+ union fpregs_state *fpregs;
+ unsigned int state_size;
u64 user_xfeatures = 0;
- bool fx_only = false;
- int ret;
if (use_xsave()) {
struct _fpx_sw_bytes fx_sw_user;
- ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user);
- if (unlikely(ret))
- return ret;
+ if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user))
+ return false;
fx_only = !fx_sw_user.magic1;
state_size = fx_sw_user.xstate_size;
user_xfeatures = fx_sw_user.xfeatures;
} else {
user_xfeatures = XFEATURE_MASK_FPSSE;
+ state_size = fpu->fpstate->user_size;
}
if (likely(!ia32_fxstate)) {
- /*
- * Attempt to restore the FPU registers directly from user
- * memory. For that to succeed, the user access cannot cause page
- * faults. If it does, fall back to the slow path below, going
- * through the kernel buffer with the enabled pagefault handler.
- */
+ /* Restore the FPU registers directly from user memory. */
return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
state_size);
}
@@ -342,9 +367,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
* to be ignored for histerical raisins. The legacy state is folded
* in once the larger state has been copied.
*/
- ret = __copy_from_user(&env, buf, sizeof(env));
- if (ret)
- return ret;
+ if (__copy_from_user(&env, buf, sizeof(env)))
+ return false;
/*
* By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
@@ -363,33 +387,38 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
* the right place in memory. It's ia32 mode. Shrug.
*/
if (xfeatures_mask_supervisor())
- os_xsave(&fpu->state.xsave);
+ os_xsave(fpu->fpstate);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
__cpu_invalidate_fpregs_state();
fpregs_unlock();
+ fpregs = &fpu->fpstate->regs;
if (use_xsave() && !fx_only) {
- ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx);
- if (ret)
- return ret;
+ if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx))
+ return false;
} else {
- if (__copy_from_user(&fpu->state.fxsave, buf_fx,
- sizeof(fpu->state.fxsave)))
- return -EFAULT;
-
- /* Reject invalid MXCSR values. */
- if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
+ if (__copy_from_user(&fpregs->fxsave, buf_fx,
+ sizeof(fpregs->fxsave)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Reject invalid MXCSR values. */
+ if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return false;
+ } else {
+ /* Mask invalid bits out for historical reasons (broken hardware). */
+ fpregs->fxsave.mxcsr &= mxcsr_feature_mask;
+ }
/* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
if (use_xsave())
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
}
/* Fold the legacy FP storage */
- convert_to_fxsr(&fpu->state.fxsave, &env);
+ convert_to_fxsr(&fpregs->fxsave, &env);
fpregs_lock();
if (use_xsave()) {
@@ -404,40 +433,45 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
*/
u64 mask = user_xfeatures | xfeatures_mask_supervisor();
- fpu->state.xsave.header.xfeatures &= mask;
- ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all);
+ fpregs->xsave.header.xfeatures &= mask;
+ success = !os_xrstor_safe(fpu->fpstate,
+ fpu_kernel_cfg.max_features);
} else {
- ret = fxrstor_safe(&fpu->state.fxsave);
+ success = !fxrstor_safe(&fpregs->fxsave);
}
- if (likely(!ret))
+ if (likely(success))
fpregs_mark_activate();
fpregs_unlock();
- return ret;
+ return success;
}
-static inline int xstate_sigframe_size(void)
+
+static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
{
- return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
- fpu_user_xstate_size;
+ unsigned int size = fpstate->user_size;
+
+ return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size;
}
/*
* Restore FPU state from a sigframe:
*/
-int fpu__restore_sig(void __user *buf, int ia32_frame)
+bool fpu__restore_sig(void __user *buf, int ia32_frame)
{
- unsigned int size = xstate_sigframe_size();
struct fpu *fpu = &current->thread.fpu;
void __user *buf_fx = buf;
bool ia32_fxstate = false;
- int ret;
+ bool success = false;
+ unsigned int size;
if (unlikely(!buf)) {
fpu__clear_user_states(fpu);
- return 0;
+ return true;
}
+ size = xstate_sigframe_size(fpu->fpstate);
+
ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -451,30 +485,28 @@ int fpu__restore_sig(void __user *buf, int ia32_frame)
ia32_fxstate = true;
}
- if (!access_ok(buf, size)) {
- ret = -EACCES;
+ if (!access_ok(buf, size))
goto out;
- }
if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) {
- ret = fpregs_soft_set(current, NULL, 0,
- sizeof(struct user_i387_ia32_struct),
- NULL, buf);
+ success = !fpregs_soft_set(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct),
+ NULL, buf);
} else {
- ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
+ success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
}
out:
- if (unlikely(ret))
+ if (unlikely(!success))
fpu__clear_user_states(fpu);
- return ret;
+ return success;
}
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size)
{
- unsigned long frame_size = xstate_sigframe_size();
+ unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate);
*buf_fx = sp = round_down(sp - frame_size, 64);
if (ia32_frame && use_fxsr()) {
@@ -487,9 +519,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
return sp;
}
-unsigned long fpu__get_fpstate_size(void)
+unsigned long __init fpu__get_fpstate_size(void)
{
- unsigned long ret = xstate_sigframe_size();
+ unsigned long ret = fpu_user_cfg.max_size;
+
+ if (use_xsave())
+ ret += FP_XSTATE_MAGIC2_SIZE;
/*
* This space is needed on (most) 32-bit kernels, or when a 32-bit
@@ -505,28 +540,3 @@ unsigned long fpu__get_fpstate_size(void)
return ret;
}
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed by the fpstate pointer in the sigcontext.
- * This will be saved when ever the FP and extended state context is
- * saved on the user stack during the signal handler delivery to the user.
- */
-void fpu__init_prepare_fx_sw_frame(void)
-{
- int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
-
- fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
- fx_sw_reserved.extended_size = size;
- fx_sw_reserved.xfeatures = xfeatures_mask_uabi();
- fx_sw_reserved.xstate_size = fpu_user_xstate_size;
-
- if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
- IS_ENABLED(CONFIG_X86_32)) {
- int fsave_header_size = sizeof(struct fregs_state);
-
- fx_sw_reserved_ia32 = fx_sw_reserved;
- fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
- }
-}
-
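save_sw_bytes() and xstate_sigframe_size() above fix the sigframe layout: the user-visible xstate image of fpstate->user_size bytes, FP_XSTATE_MAGIC2 appended right after it, and, for ia32 frames, an extra classic fsave header in front. A worked sketch of the resulting size arithmetic (the sizes are illustrative assumptions; FP_XSTATE_MAGIC2_SIZE is the 4-byte magic word):

/* Illustrative frame-size arithmetic for the signal path above. */
#include <stdio.h>

#define FP_XSTATE_MAGIC2_SIZE	4u	/* sizeof(FP_XSTATE_MAGIC2) */

int main(void)
{
	unsigned int user_size = 832;	/* assumed fpstate->user_size with XSAVE */
	unsigned int fsave_hdr = 112;	/* assumed sizeof(struct fregs_state) */
	unsigned long sp       = 0x7ffd1234abc0UL;	/* arbitrary example stack */

	/* xstate_sigframe_size(): xstate image plus the trailing magic2 word. */
	unsigned int frame = user_size + FP_XSTATE_MAGIC2_SIZE;

	/* fpu__alloc_mathframe(): align the fxstate area down to 64 bytes;
	 * an ia32 frame additionally reserves the classic fsave header. */
	unsigned long buf_fx = (sp - frame) & ~63UL;
	unsigned long buf    = buf_fx - fsave_hdr;	/* ia32 case only */

	printf("frame=%u buf_fx=%#lx buf(ia32)=%#lx\n", frame, buf_fx, buf);
	return 0;
}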
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c8def1b7f8fb..d28829403ed0 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -4,21 +4,33 @@
*
* Author: Suresh Siddha <suresh.b.siddha@intel.com>
*/
+#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
+#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
#include <asm/fpu/api.h>
-#include <asm/fpu/internal.h>
-#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
-#include <asm/fpu/xstate.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/xcr.h>
#include <asm/tlbflush.h>
-#include <asm/cpufeature.h>
+#include <asm/prctl.h>
+#include <asm/elf.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+#define for_each_extended_xfeature(bit, mask) \
+ (bit) = FIRST_EXTENDED_XFEATURE; \
+ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
/*
* Although we spell it out in here, the Processor Trace
@@ -39,29 +51,32 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "AMX Tile config" ,
+ "AMX Tile data" ,
+ "unknown xstate feature" ,
};
-static short xsave_cpuid_features[] __initdata = {
- X86_FEATURE_FPU,
- X86_FEATURE_XMM,
- X86_FEATURE_AVX,
- X86_FEATURE_MPX,
- X86_FEATURE_MPX,
- X86_FEATURE_AVX512F,
- X86_FEATURE_AVX512F,
- X86_FEATURE_AVX512F,
- X86_FEATURE_INTEL_PT,
- X86_FEATURE_PKU,
- X86_FEATURE_ENQCMD,
+static unsigned short xsave_cpuid_features[] __initdata = {
+ [XFEATURE_FP] = X86_FEATURE_FPU,
+ [XFEATURE_SSE] = X86_FEATURE_XMM,
+ [XFEATURE_YMM] = X86_FEATURE_AVX,
+ [XFEATURE_BNDREGS] = X86_FEATURE_MPX,
+ [XFEATURE_BNDCSR] = X86_FEATURE_MPX,
+ [XFEATURE_OPMASK] = X86_FEATURE_AVX512F,
+ [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
+ [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
+ [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
+ [XFEATURE_PKRU] = X86_FEATURE_PKU,
+ [XFEATURE_PASID] = X86_FEATURE_ENQCMD,
+ [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
};
-/*
- * This represents the full set of bits that should ever be set in a kernel
- * XSAVE buffer, both supervisor and user xstates.
- */
-u64 xfeatures_mask_all __ro_after_init;
-EXPORT_SYMBOL_GPL(xfeatures_mask_all);
-
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
@@ -72,20 +87,13 @@ static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init
{ [ 0 ... XFEATURE_MAX - 1] = -1};
/*
- * The XSAVE area of kernel can be in standard or compacted format;
- * it is always in standard format for user mode. This is the user
- * mode standard format size used for signal and ptrace frames.
- */
-unsigned int fpu_user_xstate_size __ro_after_init;
-
-/*
* Return whether the system supports a given xfeature.
*
* Also return the name of the (most advanced) feature that the caller requested:
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
- u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all;
+ u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -135,17 +143,26 @@ static bool xfeature_is_supervisor(int xfeature_nr)
*/
void fpu__init_cpu_xstate(void)
{
- if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
return;
cr4_set_bits(X86_CR4_OSXSAVE);
/*
+ * Must happen after CR4 setup and before xsetbv() to allow KVM
+ * lazy passthrough. Write independent of the dynamic state static
+ * key as that does not work on the boot CPU. This also ensures
+ * that any stale state is wiped out from XFD.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XFD))
+ wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+
+ /*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
* managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
* states can be set here.
*/
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi());
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
@@ -158,7 +175,7 @@ void fpu__init_cpu_xstate(void)
static bool xfeature_enabled(enum xfeature xfeature)
{
- return xfeatures_mask_all & BIT_ULL(xfeature);
+ return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}
/*
@@ -184,10 +201,7 @@ static void __init setup_xstate_features(void)
xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
xmm_space);
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
xstate_sizes[i] = eax;
@@ -236,6 +250,8 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
print_xstate_feature(XFEATURE_MASK_PASID);
+ print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
+ print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
}
/*
@@ -291,20 +307,15 @@ static void __init setup_xstate_comp_offsets(void)
xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state,
xmm_space);
- if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (xfeature_enabled(i))
- xstate_comp_offsets[i] = xstate_offsets[i];
- }
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) {
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features)
+ xstate_comp_offsets[i] = xstate_offsets[i];
return;
}
next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
if (xfeature_is_aligned(i))
next_offset = ALIGN(next_offset, 64);
@@ -328,8 +339,8 @@ static void __init setup_supervisor_only_offsets(void)
next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i) || !xfeature_is_supervisor(i))
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ if (!xfeature_is_supervisor(i))
continue;
if (xfeature_is_aligned(i))
@@ -347,15 +358,36 @@ static void __init print_xstate_offset_size(void)
{
int i;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
i, xstate_comp_offsets[i], i, xstate_sizes[i]);
}
}
/*
 * This function is called only during boot, when the x86 caps are not yet
 * set up and alternatives cannot be used.
+ */
+static __init void os_xrstor_booting(struct xregs_state *xstate)
+{
+ u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ /*
+ * We should never fault when copying from a kernel buffer, and the FPU
+ * state we set at boot time should be valid.
+ */
+ WARN_ON_FPU(err);
+}
+
+/*
* All supported features have either init state all zeros or are
* handled in setup_init_fpu() individually. This is an explicit
* feature list and does not use XFEATURE_MASK*SUPPORTED to catch
@@ -372,36 +404,30 @@ static void __init print_xstate_offset_size(void)
XFEATURE_MASK_PKRU | \
XFEATURE_MASK_BNDREGS | \
XFEATURE_MASK_BNDCSR | \
- XFEATURE_MASK_PASID)
+ XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_XTILE)
/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
{
- static int on_boot_cpu __initdata = 1;
-
BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
XFEATURES_INIT_FPSTATE_HANDLED);
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
-
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return;
setup_xstate_features();
print_xstate_features();
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
- xfeatures_mask_all;
+ xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features);
/*
* Init all the features state with header.xfeatures being 0x0
*/
- os_xrstor_booting(&init_fpstate.xsave);
+ os_xrstor_booting(&init_fpstate.regs.xsave);
/*
* All components are now in init state. Read the state back so
@@ -419,7 +445,7 @@ static void __init setup_init_fpu_buf(void)
* state is all zeroes or if not to add the necessary handling
* here.
*/
- fxsave(&init_fpstate.fxsave);
+ fxsave(&init_fpstate.regs.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
@@ -451,10 +477,11 @@ int xfeature_size(int xfeature_nr)
}
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-static int validate_user_xstate_header(const struct xstate_header *hdr)
+static int validate_user_xstate_header(const struct xstate_header *hdr,
+ struct fpstate *fpstate)
{
/* No unknown or supervisor features may be set */
- if (hdr->xfeatures & ~xfeatures_mask_uabi())
+ if (hdr->xfeatures & ~fpstate->user_xfeatures)
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -474,7 +501,7 @@ static int validate_user_xstate_header(const struct xstate_header *hdr)
return 0;
}
-static void __xstate_dump_leaves(void)
+static void __init __xstate_dump_leaves(void)
{
int i;
u32 eax, ebx, ecx, edx;
@@ -509,12 +536,73 @@ static void __xstate_dump_leaves(void)
} \
} while (0)
+/**
+ * check_xtile_data_against_struct - Check tile data state size.
+ *
 * Calculate the state size by multiplying the single tile size, which is
 * recorded in a C struct, by the number of tiles that the CPU reports.
+ * Compare the provided size with the calculation.
+ *
+ * @size: The tile data state size
+ *
+ * Returns: 0 on success, -EINVAL on mismatch.
+ */
+static int __init check_xtile_data_against_struct(int size)
+{
+ u32 max_palid, palid, state_size;
+ u32 eax, ebx, ecx, edx;
+ u16 max_tile;
+
+ /*
+ * Check the maximum palette id:
+ * eax: the highest numbered palette subleaf.
+ */
+ cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
+
+ /*
+ * Cross-check each tile size and find the maximum number of
+ * supported tiles.
+ */
+ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
+ u16 tile_size, max;
+
+ /*
+ * Check the tile size info:
+		 *   eax[31:16]:  bytes per tile
+ * ebx[31:16]: the max names (or max number of tiles)
+ */
+		cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
+ tile_size = eax >> 16;
+ max = ebx >> 16;
+
+ if (tile_size != sizeof(struct xtile_data)) {
+ pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA),
+ sizeof(struct xtile_data), tile_size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+
+ if (max > max_tile)
+ max_tile = max;
+ }
+
+ state_size = sizeof(struct xtile_data) * max_tile;
+ if (size != state_size) {
+ pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA), state_size, size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+ return 0;
+}
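As a hedged, illustrative calculation (the palette values below are typical, not taken from this hunk): if the highest palette reports 1024 bytes per tile and 8 tile registers, then state_size = sizeof(struct xtile_data) * max_tile = 1024 * 8 = 8192 bytes, and the check only succeeds if CPUID reports exactly 8192 bytes for the XTILE_DATA component.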
+
/*
* We have a C struct for each 'xstate'. We need to ensure
* that our software representation matches what the CPU
* tells us about the state's size.
*/
-static void check_xstate_against_struct(int nr)
+static bool __init check_xstate_against_struct(int nr)
{
/*
* Ask the CPU for the size of the state.
@@ -532,6 +620,11 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state);
+ XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
+
+ /* The tile data size varies between implementations. */
+ if (nr == XFEATURE_XTILE_DATA)
+ check_xtile_data_against_struct(sz);
/*
* Make *SURE* to add any feature numbers in below if
@@ -541,10 +634,39 @@ static void check_xstate_against_struct(int nr)
if ((nr < XFEATURE_YMM) ||
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
- ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) {
+ ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
+ return false;
+ }
+ return true;
+}
+
+static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
+{
+ unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ int i;
+
+ for_each_extended_xfeature(i, xfeatures) {
+ /* Align from the end of the previous feature */
+ if (xfeature_is_aligned(i))
+ size = ALIGN(size, 64);
+ /*
+ * In compacted format the enabled features are packed,
+ * i.e. disabled features do not occupy space.
+ *
+ * In non-compacted format the offsets are fixed and
+ * disabled states still occupy space in the memory buffer.
+ */
+ if (!compacted)
+ size = xfeature_uncompacted_offset(i);
+ /*
+ * Add the feature size even for non-compacted format
+ * to make the end result correct
+ */
+ size += xfeature_size(i);
}
+ return size;
}
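A hedged worked example with hypothetical offsets: the legacy area plus header is FXSAVE_SIZE + XSAVE_HDR_SIZE = 512 + 64 = 576 bytes. Take feature A at uncompacted offset 576 with size 256, and 64-byte-aligned feature B at uncompacted offset 960 with size 64. Non-compacted, the running size is reset to each fixed offset, so the result is 960 + 64 = 1024 bytes. Compacted, the components are packed back to back: 576 + 256 = 832, which is already 64-byte aligned, plus 64 gives 896 bytes.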
/*
@@ -556,44 +678,29 @@ static void check_xstate_against_struct(int nr)
* covered by these checks. Only the size of the buffer for task->fpu
* is checked here.
*/
-static void do_extra_xstate_size_checks(void)
+static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
- int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
int i;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
- check_xstate_against_struct(i);
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ if (!check_xstate_against_struct(i))
+ return false;
/*
* Supervisor state components can be managed only by
* XSAVES.
*/
- if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
- XSTATE_WARN_ON(xfeature_is_supervisor(i));
-
- /* Align from the end of the previous feature */
- if (xfeature_is_aligned(i))
- paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
- /*
- * The offset of a given state in the non-compacted
- * format is given to us in a CPUID leaf. We check
- * them for being ordered (increasing offsets) in
- * setup_xstate_features(). XSAVES uses compacted format.
- */
- if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
- paranoid_xstate_size = xfeature_uncompacted_offset(i);
- /*
- * The compacted-format offset always depends on where
- * the previous state ended.
- */
- paranoid_xstate_size += xfeature_size(i);
+ if (!compacted && xfeature_is_supervisor(i)) {
+ XSTATE_WARN_ON(1);
+ return false;
+ }
}
- XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
+ size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
+ XSTATE_WARN_ON(size != kernel_size);
+ return size == kernel_size;
}
-
/*
* Get total size of enabled xstates in XCR0 | IA32_XSS.
*
@@ -644,7 +751,7 @@ static unsigned int __init get_xsaves_size_no_independent(void)
return size;
}
-static unsigned int __init get_xsave_size(void)
+static unsigned int __init get_xsave_size_user(void)
{
unsigned int eax, ebx, ecx, edx;
/*
@@ -662,44 +769,54 @@ static unsigned int __init get_xsave_size(void)
* Will the runtime-enumerated 'xstate_size' fit in the init
* task's statically-allocated buffer?
*/
-static bool is_supported_xstate_size(unsigned int test_xstate_size)
+static bool __init is_supported_xstate_size(unsigned int test_xstate_size)
{
- if (test_xstate_size <= sizeof(union fpregs_state))
+ if (test_xstate_size <= sizeof(init_fpstate.regs))
return true;
pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
- sizeof(union fpregs_state), test_xstate_size);
+ sizeof(init_fpstate.regs), test_xstate_size);
return false;
}
static int __init init_xstate_size(void)
{
/* Recompute the context size for enabled features: */
- unsigned int possible_xstate_size;
- unsigned int xsave_size;
+ unsigned int user_size, kernel_size, kernel_default_size;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
- xsave_size = get_xsave_size();
+ /* Uncompacted user space size */
+ user_size = get_xsave_size_user();
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- possible_xstate_size = get_xsaves_size_no_independent();
+ /*
+ * XSAVES kernel size includes supervisor states and
+ * uses compacted format when available.
+ *
+ * XSAVE does not support supervisor states so
+ * kernel and user size is identical.
+ */
+ if (compacted)
+ kernel_size = get_xsaves_size_no_independent();
else
- possible_xstate_size = xsave_size;
+ kernel_size = user_size;
+
+ kernel_default_size =
+ xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
- /* Ensure we have the space to store all enabled: */
- if (!is_supported_xstate_size(possible_xstate_size))
+ /* Ensure we have the space to store all default enabled features. */
+ if (!is_supported_xstate_size(kernel_default_size))
return -EINVAL;
- /*
- * The size is OK, we are definitely going to use xsave,
- * make it known to the world that we need more space.
- */
- fpu_kernel_xstate_size = possible_xstate_size;
- do_extra_xstate_size_checks();
+ if (!paranoid_xstate_size_valid(kernel_size))
+ return -EINVAL;
+
+ fpu_kernel_cfg.max_size = kernel_size;
+ fpu_user_cfg.max_size = user_size;
+
+ fpu_kernel_cfg.default_size = kernel_default_size;
+ fpu_user_cfg.default_size =
+ xstate_calculate_size(fpu_user_cfg.default_features, false);
- /*
- * User space is always in standard format.
- */
- fpu_user_xstate_size = xsave_size;
return 0;
}
@@ -707,28 +824,38 @@ static int __init init_xstate_size(void)
* We enabled the XSAVE hardware, but something went wrong and
* we can not use it. Disable it.
*/
-static void fpu__init_disable_system_xstate(void)
+static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
- xfeatures_mask_all = 0;
+ fpu_kernel_cfg.max_features = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+	/* Restore the legacy size. */
+ fpu_kernel_cfg.max_size = legacy_size;
+ fpu_kernel_cfg.default_size = legacy_size;
+ fpu_user_cfg.max_size = legacy_size;
+ fpu_user_cfg.default_size = legacy_size;
+
+ /*
+ * Prevent enabling the static branch which enables writes to the
+ * XFD MSR.
+ */
+ init_fpstate.xfd = 0;
+
+ fpstate_reset(&current->thread.fpu);
}
/*
* Enable and initialize the xsave feature.
* Called once per system bootup.
*/
-void __init fpu__init_system_xstate(void)
+void __init fpu__init_system_xstate(unsigned int legacy_size)
{
unsigned int eax, ebx, ecx, edx;
- static int on_boot_cpu __initdata = 1;
u64 xfeatures;
int err;
int i;
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
-
if (!boot_cpu_has(X86_FEATURE_FPU)) {
pr_info("x86/fpu: No FPU detected\n");
return;
@@ -749,22 +876,22 @@ void __init fpu__init_system_xstate(void)
* Find user xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xfeatures_mask_all = eax + ((u64)edx << 32);
+ fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
/*
* Find supervisor xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
- xfeatures_mask_all |= ecx + ((u64)edx << 32);
+ fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
- if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
* booting without it. This is too early to BUG().
*/
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
- xfeatures_mask_all);
+ fpu_kernel_cfg.max_features);
goto out_disable;
}
@@ -772,15 +899,39 @@ void __init fpu__init_system_xstate(void)
* Clear XSAVE features that are disabled in the normal CPUID.
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
- if (!boot_cpu_has(xsave_cpuid_features[i]))
- xfeatures_mask_all &= ~BIT_ULL(i);
+ unsigned short cid = xsave_cpuid_features[i];
+
+ /* Careful: X86_FEATURE_FPU is 0! */
+ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
+ fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
}
- xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED |
+ if (!cpu_feature_enabled(X86_FEATURE_XFD))
+ fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+ fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
+ fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+
+ /* Clean out dynamic features from default */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
+ fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features;
+ fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
/* Store it for paranoia check at the end */
- xfeatures = xfeatures_mask_all;
+ xfeatures = fpu_kernel_cfg.max_features;
+
+ /*
+	 * Initialize the default XFD state in init_fpstate and enable the
+ * dynamic sizing mechanism if dynamic states are available. The
+ * static key cannot be enabled here because this runs before
+ * jump_label_init(). This is delayed to an initcall.
+ */
+ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -788,13 +939,16 @@ void __init fpu__init_system_xstate(void)
if (err)
goto out_disable;
+ /* Reset the state for the current task */
+ fpstate_reset(&current->thread.fpu);
+
/*
* Update info used for ptrace frames; use standard-format size and no
* supervisor xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi());
+ update_regset_xstate_info(fpu_user_cfg.max_size,
+ fpu_user_cfg.max_features);
- fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
setup_xstate_comp_offsets();
setup_supervisor_only_offsets();
@@ -803,22 +957,22 @@ void __init fpu__init_system_xstate(void)
* Paranoia check whether something in the setup modified the
* xfeatures mask.
*/
- if (xfeatures != xfeatures_mask_all) {
+ if (xfeatures != fpu_kernel_cfg.max_features) {
pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
- xfeatures, xfeatures_mask_all);
+ xfeatures, fpu_kernel_cfg.max_features);
goto out_disable;
}
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
- xfeatures_mask_all,
- fpu_kernel_xstate_size,
+ fpu_kernel_cfg.max_features,
+ fpu_kernel_cfg.max_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
return;
out_disable:
/* something went wrong, try to boot without any XSAVE support */
- fpu__init_disable_system_xstate();
+ fpu__init_disable_system_xstate(legacy_size);
}
/*
@@ -830,7 +984,7 @@ void fpu__resume_cpu(void)
* Restore XCR0 on xsave capable CPUs:
*/
if (cpu_feature_enabled(X86_FEATURE_XSAVE))
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi());
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
/*
* Restore IA32_XSS. The same CPUID bit enumerates support
@@ -840,6 +994,9 @@ void fpu__resume_cpu(void)
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
xfeatures_mask_independent());
}
+
+ if (fpu_state_size_dynamic())
+ wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}
/*
@@ -886,7 +1043,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
* We should not ever be requesting features that we
* have not enabled.
*/
- WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)),
+ WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)),
"get of unsupported state");
/*
* This assumes the last 'xsave*' instruction to
@@ -904,7 +1061,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -961,9 +1117,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
}
/**
- * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
* @to: membuf descriptor
- * @tsk: The task from which to copy the saved xstate
+ * @fpstate: The fpstate buffer from which to copy
+ * @pkru_val: The PKRU value to store in the PKRU component
* @copy_mode: The requested copy mode
*
* Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
@@ -972,14 +1129,15 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
*
* It supports partial copy but @to.pos always starts from zero.
*/
-void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
- enum xstate_copy_mode copy_mode)
+void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u32 pkru_val, enum xstate_copy_mode copy_mode)
{
const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
- struct xregs_state *xinit = &init_fpstate.xsave;
+ struct xregs_state *xinit = &init_fpstate.regs.xsave;
+ struct xregs_state *xsave = &fpstate->regs.xsave;
struct xstate_header header;
unsigned int zerofrom;
+ u64 mask;
int i;
memset(&header, 0, sizeof(header));
@@ -996,7 +1154,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
break;
case XSTATE_COPY_XSAVE:
- header.xfeatures &= xfeatures_mask_uabi();
+ header.xfeatures &= fpstate->user_xfeatures;
break;
}
@@ -1033,17 +1191,15 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
zerofrom = offsetof(struct xregs_state, extended_state_area);
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- /*
- * The ptrace buffer is in non-compacted XSAVE format.
- * In non-compacted format disabled features still occupy
- * state space, but there is no state to copy from in the
- * compacted init_fpstate. The gap tracking will zero this
- * later.
- */
- if (!(xfeatures_mask_uabi() & BIT_ULL(i)))
- continue;
+ /*
+ * The ptrace buffer is in non-compacted XSAVE format. In
+ * non-compacted format disabled features still occupy state space,
+ * but there is no state to copy from in the compacted
+ * init_fpstate. The gap tracking will zero these states.
+ */
+ mask = fpstate->user_xfeatures;
+ for_each_extended_xfeature(i, mask) {
/*
* If there was a feature or alignment gap, zero the space
* in the destination buffer.
@@ -1055,10 +1211,9 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
struct pkru_state pkru = {0};
/*
* PKRU is not necessarily up to date in the
- * thread's XSAVE buffer. Fill this part from the
- * per-thread storage.
+ * XSAVE buffer. Use the provided value.
*/
- pkru.pkru = tsk->thread.pkru;
+ pkru.pkru = pkru_val;
membuf_write(&to, &pkru, sizeof(pkru));
} else {
copy_feature(header.xfeatures & BIT_ULL(i), &to,
@@ -1078,6 +1233,25 @@ out:
membuf_zero(&to, to.left);
}
+/**
+ * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @tsk: The task from which to copy the saved xstate
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @mode. UABI XSTATE is always uncompacted!
+ *
+ * It supports partial copy but @to.pos always starts from zero.
+ */
+void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode copy_mode)
+{
+ __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
+ tsk->thread.pkru, copy_mode);
+}
+
static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
const void *kbuf, const void __user *ubuf)
{
@@ -1091,9 +1265,10 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
}
-static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf,
+static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
const void __user *ubuf)
{
+ struct xregs_state *xsave = &fpstate->regs.xsave;
unsigned int offset, size;
struct xstate_header hdr;
u64 mask;
@@ -1103,7 +1278,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf,
if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
return -EFAULT;
- if (validate_user_xstate_header(&hdr))
+ if (validate_user_xstate_header(&hdr, fpstate))
return -EINVAL;
/* Validate MXCSR when any of the related features is in use */
@@ -1156,12 +1331,11 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf,
/*
* Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
- * format and copy to the target thread. This is called from
- * xstateregs_set().
+ * format and copy to the target thread. Used by ptrace and KVM.
*/
-int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
{
- return copy_uabi_to_xstate(xsave, kbuf, NULL);
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL);
}
/*
@@ -1169,26 +1343,20 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
* XSAVE[S] format and copy to the target thread. This is called from the
* sigreturn() and rt_sigreturn() system calls.
*/
-int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave,
+int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
const void __user *ubuf)
{
- return copy_uabi_to_xstate(xsave, NULL, ubuf);
+ return copy_uabi_to_xstate(fpstate, NULL, ubuf);
}
-static bool validate_xsaves_xrstors(u64 mask)
+static bool validate_independent_components(u64 mask)
{
u64 xchk;
if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
return false;
- /*
- * Validate that this is either a task->fpstate related component
- * subset or an independent one.
- */
- if (mask & xfeatures_mask_independent())
- xchk = ~xfeatures_mask_independent();
- else
- xchk = ~xfeatures_mask_all;
+
+ xchk = ~xfeatures_mask_independent();
if (WARN_ON_ONCE(!mask || mask & xchk))
return false;
@@ -1206,14 +1374,13 @@ static bool validate_xsaves_xrstors(u64 mask)
* buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
* can #GP.
*
- * The feature mask must either be a subset of the independent features or
- * a subset of the task->fpstate related features.
+ * The feature mask must be a subset of the independent features.
*/
void xsaves(struct xregs_state *xstate, u64 mask)
{
int err;
- if (!validate_xsaves_xrstors(mask))
+ if (!validate_independent_components(mask))
return;
XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
@@ -1231,20 +1398,379 @@ void xsaves(struct xregs_state *xstate, u64 mask)
* Proper usage is to restore the state which was saved with
* xsaves() into @xstate.
*
- * The feature mask must either be a subset of the independent features or
- * a subset of the task->fpstate related features.
+ * The feature mask must be a subset of the independent features.
*/
void xrstors(struct xregs_state *xstate, u64 mask)
{
int err;
- if (!validate_xsaves_xrstors(mask))
+ if (!validate_independent_components(mask))
return;
XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
WARN_ON_ONCE(err);
}
+#if IS_ENABLED(CONFIG_KVM)
+void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
+{
+ void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
+
+ if (addr)
+ memset(addr, 0, xstate_sizes[xfeature]);
+}
+EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
+#endif
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
+ * can safely operate on the @fpstate buffer.
+ */
+static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ u64 xfd = __this_cpu_read(xfd_state);
+
+ if (fpstate->xfd == xfd)
+ return true;
+
+ /*
+ * The XFD MSR does not match fpstate->xfd. That's invalid when
+ * the passed in fpstate is current's fpstate.
+ */
+ if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
+ return false;
+
+ /*
+ * XRSTOR(S) from init_fpstate are always correct as it will just
+ * bring all components into init state and not read from the
+ * buffer. XSAVE(S) raises #PF after init.
+ */
+ if (fpstate == &init_fpstate)
+ return rstor;
+
+ /*
+ * XSAVE(S): clone(), fpu_swap_kvm_fpu()
+	 * XRSTOR(S): fpu_swap_kvm_fpu()
+ */
+
+ /*
+ * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
+ * the buffer area for XFD-disabled state components.
+ */
+ mask &= ~xfd;
+
+ /*
+ * Remove features which are valid in fpstate. They
+ * have space allocated in fpstate.
+ */
+ mask &= ~fpstate->xfeatures;
+
+ /*
+ * Any remaining state components in 'mask' might be written
+	 * by XSAVE/XRSTOR. Fail validation if any are found.
+ */
+ return !mask;
+}
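For instance (a hypothetical scenario, not taken from this hunk): an XSAVE with a mask that includes XTILE_DATA into a default-sized fpstate passes this check while the CPU's XFD has the tile-data bit armed, because mask &= ~xfd drops the component that the hardware will suppress anyway; with XFD clear, the bit survives both mask operations and validation fails, since the buffer has no room for the tile state.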
+
+void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
+}
+#endif /* CONFIG_X86_DEBUG_FPU */
+
+static int __init xfd_update_static_branch(void)
+{
+ /*
+ * If init_fpstate.xfd has bits set then dynamic features are
+ * available and the dynamic sizing must be enabled.
+ */
+ if (init_fpstate.xfd)
+ static_branch_enable(&__fpu_state_size_dynamic);
+ return 0;
+}
+arch_initcall(xfd_update_static_branch)
+
+void fpstate_free(struct fpu *fpu)
+{
+ if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
+ vfree(fpu->fpstate);
+}
+
+/**
+ * fpu_install_fpstate - Update the active fpstate in the FPU
+ *
+ * @fpu: A struct fpu * pointer
+ * @newfps: A struct fpstate * pointer
+ *
+ * Returns: A null pointer if the last active fpstate is the embedded
+ * one or the new fpstate is already installed;
+ * otherwise, a pointer to the old fpstate which has to
+ * be freed by the caller.
+ */
+static struct fpstate *fpu_install_fpstate(struct fpu *fpu,
+ struct fpstate *newfps)
+{
+ struct fpstate *oldfps = fpu->fpstate;
+
+ if (fpu->fpstate == newfps)
+ return NULL;
+
+ fpu->fpstate = newfps;
+ return oldfps != &fpu->__fpstate ? oldfps : NULL;
+}
+
+/**
+ * fpstate_realloc - Reallocate struct fpstate for the requested new features
+ *
+ * @xfeatures: A bitmap of xstate features which extend the enabled features
+ * of that task
+ * @ksize: The required size for the kernel buffer
+ * @usize: The required size for user space buffers
+ *
+ * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
+ * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
+ * with large states are likely to live longer.
+ *
+ * Returns: 0 on success, -ENOMEM on allocation error.
+ */
+static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
+ unsigned int usize)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ struct fpstate *curfps, *newfps = NULL;
+ unsigned int fpsize;
+
+ curfps = fpu->fpstate;
+ fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
+
+ newfps = vzalloc(fpsize);
+ if (!newfps)
+ return -ENOMEM;
+ newfps->size = ksize;
+ newfps->user_size = usize;
+ newfps->is_valloc = true;
+
+ fpregs_lock();
+ /*
+ * Ensure that the current state is in the registers before
+ * swapping fpstate as that might invalidate it due to layout
+ * changes.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+
+ newfps->xfeatures = curfps->xfeatures | xfeatures;
+ newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
+ newfps->xfd = curfps->xfd & ~xfeatures;
+
+ curfps = fpu_install_fpstate(fpu, newfps);
+
+ /* Do the final updates within the locked region */
+ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
+ xfd_update_state(newfps);
+
+ fpregs_unlock();
+
+ vfree(curfps);
+ return 0;
+}
+
+static int validate_sigaltstack(unsigned int usize)
+{
+ struct task_struct *thread, *leader = current->group_leader;
+ unsigned long framesize = get_sigframe_size();
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ /* get_sigframe_size() is based on fpu_user_cfg.max_size */
+ framesize -= fpu_user_cfg.max_size;
+ framesize += usize;
+ for_each_thread(leader, thread) {
+ if (thread->sas_ss_size && thread->sas_ss_size < framesize)
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static int __xstate_request_perm(u64 permitted, u64 requested)
+{
+ /*
+ * This deliberately does not exclude !XSAVES as we still might
+ * decide to optionally context switch XCR0 or talk the silicon
+	 * vendors into extending XFD for the pre-AMX states, especially
+ * AVX512.
+ */
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ struct fpu *fpu = &current->group_leader->thread.fpu;
+ unsigned int ksize, usize;
+ u64 mask;
+ int ret;
+
+ /* Check whether fully enabled */
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /* Calculate the resulting kernel state size */
+ mask = permitted | requested;
+ ksize = xstate_calculate_size(mask, compacted);
+
+ /* Calculate the resulting user state size */
+ mask &= XFEATURE_MASK_USER_SUPPORTED;
+ usize = xstate_calculate_size(mask, false);
+
+ ret = validate_sigaltstack(usize);
+ if (ret)
+ return ret;
+
+ /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
+ WRITE_ONCE(fpu->perm.__state_perm, requested);
+ /* Protected by sighand lock */
+ fpu->perm.__state_size = ksize;
+ fpu->perm.__user_state_size = usize;
+ return ret;
+}
+
+/*
+ * Permissions array to map facilities with more than one component
+ */
+static const u64 xstate_prctl_req[XFEATURE_MAX] = {
+ [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
+};
+
+static int xstate_request_perm(unsigned long idx)
+{
+ u64 permitted, requested;
+ int ret;
+
+ if (idx >= XFEATURE_MAX)
+ return -EINVAL;
+
+ /*
+ * Look up the facility mask which can require more than
+ * one xstate component.
+ */
+ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
+ requested = xstate_prctl_req[idx];
+ if (!requested)
+ return -EOPNOTSUPP;
+
+ if ((fpu_user_cfg.max_features & requested) != requested)
+ return -EOPNOTSUPP;
+
+ /* Lockless quick check */
+ permitted = xstate_get_host_group_perm();
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+ permitted = xstate_get_host_group_perm();
+ ret = __xstate_request_perm(permitted, requested);
+ spin_unlock_irq(&current->sighand->siglock);
+ return ret;
+}
+
+int xfd_enable_feature(u64 xfd_err)
+{
+ u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
+ unsigned int ksize, usize;
+ struct fpu *fpu;
+
+ if (!xfd_event) {
+ pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
+ return 0;
+ }
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+
+ /* If not permitted let it die */
+ if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) {
+ spin_unlock_irq(&current->sighand->siglock);
+ return -EPERM;
+ }
+
+ fpu = &current->group_leader->thread.fpu;
+ ksize = fpu->perm.__state_size;
+ usize = fpu->perm.__user_state_size;
+ /*
+ * The feature is permitted. State size is sufficient. Dropping
+ * the lock is safe here even if more features are added from
+	 * another task; the retrieved buffer sizes are valid for the
+ * currently requested feature(s).
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Try to allocate a new fpstate. If that fails there is no way
+ * out.
+ */
+ if (fpstate_realloc(xfd_event, ksize, usize))
+ return -EFAULT;
+ return 0;
+}
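A minimal caller sketch for the function above, assuming a #NM-style handler that reads MSR_IA32_XFD_ERR; the function name and the SIGSEGV fallback are illustrative, not part of this hunk:

	static void example_handle_xfd_event(void)
	{
		u64 xfd_err;

		rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
		if (!xfd_err)
			return;

		/* Clear the error MSR so a later fault is distinguishable. */
		wrmsrl(MSR_IA32_XFD_ERR, 0);

		/* Grow the fpstate; if that fails, let the task die. */
		if (xfd_enable_feature(xfd_err))
			force_sig(SIGSEGV);
	}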
+#else /* CONFIG_X86_64 */
+static inline int xstate_request_perm(unsigned long idx)
+{
+ return -EPERM;
+}
+#endif /* !CONFIG_X86_64 */
+
+/**
+ * fpu_xstate_prctl - xstate permission operations
+ * @tsk: Redundant pointer to current
+ * @option: A subfunction of arch_prctl()
+ * @arg2: option argument
+ * Return: 0 if successful; otherwise, an error code
+ *
+ * Option arguments:
+ *
+ * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
+ * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
+ * ARCH_REQ_XCOMP_PERM: Facility number requested
+ *
+ * For facilities which require more than one XSTATE component, the request
+ * must be the highest state component number related to that facility,
+ * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
+ * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
+ */
+long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2)
+{
+ u64 __user *uptr = (u64 __user *)arg2;
+ u64 permitted, supported;
+ unsigned long idx = arg2;
+
+ if (tsk != current)
+ return -EPERM;
+
+ switch (option) {
+ case ARCH_GET_XCOMP_SUPP:
+ supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
+ return put_user(supported, uptr);
+
+ case ARCH_GET_XCOMP_PERM:
+ /*
+		 * Lockless snapshot as it can also change right after
+		 * dropping the lock.
+ */
+ permitted = xstate_get_host_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_REQ_XCOMP_PERM:
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return -EOPNOTSUPP;
+
+ return xstate_request_perm(idx);
+
+ default:
+ return -EINVAL;
+ }
+}
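A user-space sketch of the permission handshake documented above, assuming the usual arch_prctl() syscall wiring and that AMX tile data is xstate component 18; the fallback constant values are assumptions in case the installed headers predate them:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef ARCH_REQ_XCOMP_PERM
	# define ARCH_GET_XCOMP_PERM	0x1022
	# define ARCH_REQ_XCOMP_PERM	0x1023
	#endif
	#define XFEATURE_XTILE_DATA	18

	int main(void)
	{
		unsigned long long perm = 0;

		/* Request AMX tile data before executing any AMX instruction. */
		if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA))
			perror("ARCH_REQ_XCOMP_PERM");

		if (!syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &perm))
			printf("permitted xfeatures: 0x%llx\n", perm);
		return 0;
	}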
+
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* Report the amount of time elapsed in millisecond since last AVX512
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
new file mode 100644
index 000000000000..e18210dff88c
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_XSTATE_H
+#define __X86_KERNEL_FPU_XSTATE_H
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/xstate.h>
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(u64, xfd_state);
+#endif
+
+static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
+{
+ /*
+ * XRSTORS requires these bits set in xcomp_bv, or it will
+ * trigger #GP:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
+}
+
+static inline u64 xstate_get_host_group_perm(void)
+{
+ /* Pairs with WRITE_ONCE() in xstate_request_perm() */
+ return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm);
+}
+
+enum xstate_copy_mode {
+ XSTATE_COPY_FP,
+ XSTATE_COPY_FX,
+ XSTATE_COPY_XSAVE,
+};
+
+struct membuf;
+extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u32 pkru_val, enum xstate_copy_mode copy_mode);
+extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode mode);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
+extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);
+
+
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system_xstate(unsigned int legacy_size);
+
+extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+
+static inline u64 xfeatures_mask_supervisor(void)
+{
+ return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+}
+
+static inline u64 xfeatures_mask_independent(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
+ return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+
+ return XFEATURE_MASK_INDEPENDENT;
+}
+
+/* XSAVE/XRSTOR wrapper functions */
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX "0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+/*
+ * After this @err contains 0 on success or the trap number when the
+ * operation raises an exception.
+ */
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n\t" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
+ * format and supervisor states in addition to modified optimization in
+ * XSAVEOPT.
+ *
+ * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
+ * supports modified optimization which is not supported by XSAVE.
+ *
+ * We use XSAVE as a fallback.
+ *
+ * The 661 label is defined in the ALTERNATIVE* macros as the address of the
+ * original instruction which gets replaced. We need to use it here as the
 * address of the instruction at which we might get an exception.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE_2(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ ".pushsection .fixup,\"ax\"\n" \
+ "4: movl $-2, %[err]\n" \
+ "jmp 3b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(661b, 4b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
+ * XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask) \
+ asm volatile(ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \
+ : \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
+extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
+#else
+static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
+#endif
+
+#ifdef CONFIG_X86_64
+static inline void xfd_update_state(struct fpstate *fpstate)
+{
+ if (fpu_state_size_dynamic()) {
+ u64 xfd = fpstate->xfd;
+
+ if (__this_cpu_read(xfd_state) != xfd) {
+ wrmsrl(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+ }
+ }
+}
+#else
+static inline void xfd_update_state(struct fpstate *fpstate) { }
+#endif
+
+/*
+ * Save processor xstate to xsave area.
+ *
+ * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
+ * and command line options. The choice is permanent until the next reboot.
+ */
+static inline void os_xsave(struct fpstate *fpstate)
+{
+ u64 mask = fpstate->xfeatures;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ WARN_ON_FPU(!alternatives_patched);
+ xfd_validate_state(fpstate, mask, false);
+
+ XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err);
+
+ /* We should never fault when copying to a kernel buffer: */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ *
+ * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
+ */
+static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ xfd_validate_state(fpstate, mask, true);
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/* Restore of supervisor state. Does not require XFD */
+static inline void os_xrstor_supervisor(struct fpstate *fpstate)
+{
+ u64 mask = xfeatures_mask_supervisor();
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for backward compatibility for
+ * old applications which don't understand the compacted format of the
+ * xsave area.
+ *
+ * The caller has to zero buf::header before calling this because XSAVE*
+ * does not touch the reserved fields in the header.
+ */
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
+{
+ /*
+ * Include the features which are not xsaved/rstored by the kernel
+ * internally, e.g. PKRU. That's user space ABI and also required
+ * to allow the signal handler to modify PKRU.
+ */
+ struct fpstate *fpstate = current->thread.fpu.fpstate;
+ u64 mask = fpstate->user_xfeatures;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ xfd_validate_state(fpstate, mask, false);
+
+ stac();
+ XSTATE_OP(XSAVE, buf, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
+{
+ struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ xfd_validate_state(current->thread.fpu.fpstate, mask, true);
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from kernel space xsave area, return an error code instead of
+ * an exception.
+ */
+static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
+{
+ struct xregs_state *xstate = &fpstate->regs.xsave;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpstate);
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ return err;
+}
+
+
+#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1b3ce3b4a2a2..c39f906cdc4e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -252,11 +252,6 @@ void arch_ftrace_update_code(int command)
ftrace_modify_all_code(command);
}
-int __init ftrace_dyn_arch_init(void)
-{
- return 0;
-}
-
/* Currently only x86_64 supports dynamic trampolines */
#ifdef CONFIG_X86_64
@@ -527,7 +522,7 @@ static void *addr_from_call(void *ptr)
return ptr + CALL_INSN_SIZE + call.disp;
}
-void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer);
/*
@@ -541,7 +536,8 @@ static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
void *ptr;
if (ops && ops->trampoline) {
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \
+ defined(CONFIG_FUNCTION_GRAPH_TRACER)
/*
* We only know about function graph tracer setting as static
* trampoline.
@@ -589,8 +585,9 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
-extern void ftrace_graph_call(void);
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+extern void ftrace_graph_call(void);
static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
@@ -618,19 +615,28 @@ int ftrace_disable_ftrace_graph_caller(void)
return ftrace_mod_jmp(ip, &ftrace_stub);
}
+#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
+int ftrace_enable_ftrace_graph_caller(void)
+{
+ return 0;
+}
+int ftrace_disable_ftrace_graph_caller(void)
+{
+ return 0;
+}
+#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
#endif /* !CONFIG_DYNAMIC_FTRACE */
/*
* Hook the return address and push it in the stack of return addrs
* in current thread info.
*/
-void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer)
{
unsigned long return_hooker = (unsigned long)&return_to_handler;
- unsigned long old;
- int faulted;
+ int bit;
/*
* When resuming from suspend-to-ram, this function can be indirectly
@@ -650,37 +656,25 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
- /*
- * Protect against fault, even if it shouldn't
- * happen. This tool is too much intrusive to
- * ignore such a protection.
- */
- asm volatile(
- "1: " _ASM_MOV " (%[parent]), %[old]\n"
- "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
- " movl $0, %[faulted]\n"
- "3:\n"
-
- ".section .fixup, \"ax\"\n"
- "4: movl $1, %[faulted]\n"
- " jmp 3b\n"
- ".previous\n"
-
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 4b)
-
- : [old] "=&r" (old), [faulted] "=r" (faulted)
- : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
- : "memory"
- );
-
- if (unlikely(faulted)) {
- ftrace_graph_stop();
- WARN_ON(1);
+ bit = ftrace_test_recursion_trylock(ip, *parent);
+ if (bit < 0)
return;
- }
- if (function_graph_enter(old, self_addr, frame_pointer, parent))
- *parent = old;
+ if (!function_graph_enter(*parent, ip, frame_pointer, parent))
+ *parent = return_hooker;
+
+ ftrace_test_recursion_unlock(bit);
}
+
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+{
+ struct pt_regs *regs = &fregs->regs;
+ unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
+
+ prepare_ftrace_return(ip, (unsigned long *)stack, 0);
+}
+#endif
+
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 7c273846c687..7a879901f103 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -174,11 +174,6 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
SYM_FUNC_END(ftrace_caller);
SYM_FUNC_START(ftrace_epilogue)
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
- jmp ftrace_stub
-#endif
-
/*
* This is weak to keep gas from relaxing the jumps.
* It is also used to copy the retq for trampolines.
@@ -251,7 +246,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
* If ORIG_RAX is anything but zero, make this a call to that.
* See arch_ftrace_set_direct_caller().
*/
- movq ORIG_RAX(%rsp), %rax
testq %rax, %rax
SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
jnz 1f
@@ -289,15 +283,6 @@ SYM_FUNC_START(__fentry__)
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
-fgraph_trace:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL)
retq
@@ -315,25 +300,12 @@ trace:
CALL_NOSPEC r8
restore_mcount_regs
- jmp fgraph_trace
+ jmp ftrace_stub
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-SYM_FUNC_START(ftrace_graph_caller)
- /* Saves rbp into %rdx and fills first parameter */
- save_mcount_regs
-
- leaq MCOUNT_REG_SIZE+8(%rsp), %rsi
- movq $0, %rdx /* No framepointers needed */
- call prepare_ftrace_return
-
- restore_mcount_regs
-
- retq
-SYM_FUNC_END(ftrace_graph_caller)
-
SYM_FUNC_START(return_to_handler)
subq $24, %rsp
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index de01903c3735..fc5371a7e9d1 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -19,7 +19,7 @@
#include <linux/start_kernel.h>
#include <linux/io.h>
#include <linux/memblock.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/pgtable.h>
#include <asm/processor.h>
@@ -284,8 +284,13 @@ unsigned long __head __startup_64(unsigned long physaddr,
* The bss section will be memset to zero later in the initialization so
* there is no need to zero it after changing the memory encryption
* attribute.
+ *
+ * This is early code, use an open coded check for SME instead of
+ * using cc_platform_has(). This eliminates worries about removing
+ * instrumentation or checking boot_cpu_data in the cc_platform_has()
+ * function.
*/
- if (mem_encrypt_active()) {
+ if (sme_get_me_mask()) {
vaddr = (unsigned long)__start_bss_decrypted;
vaddr_end = (unsigned long)__end_bss_decrypted;
for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 42fc41dd0e1f..882213df3713 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -10,6 +10,7 @@
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
+#include <asm/mwait.h>
#undef pr_fmt
#define pr_fmt(fmt) "hpet: " fmt
@@ -916,6 +917,83 @@ static bool __init hpet_counting(void)
return false;
}
+static bool __init mwait_pc10_supported(void)
+{
+ unsigned int eax, ebx, ecx, mwait_substates;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
+ return false;
+
+ if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return false;
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+
+ return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
+ (ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
+ (mwait_substates & (0xF << 28));
+}
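For example (hypothetical leaf values): ECX = 0x3 reports both the MWAIT extensions and interrupt-break capabilities, and EDX = 0x10000000 reports one C10 sub-state in bits 31:28, so the function returns true; an EDX value with bits 31:28 clear would make it return false even on an otherwise capable CPU.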
+
+/*
+ * Check whether the system supports PC10. If so force disable HPET as that
+ * stops counting in PC10. This check is overbroad as it does not take any
+ * of the following into account:
+ *
+ * - ACPI tables
+ * - Enablement of intel_idle
+ * - Command line arguments which limit intel_idle C-state support
+ *
+ * That's perfectly fine. HPET is a piece of hardware designed by committee
 * and the only reason why it is still in use on modern systems is the
 * fact that it is impossible to reliably query the TSC and CPU frequency
 * via CPUID or firmware.
+ *
+ * If HPET is functional it is useful for calibrating TSC, but this can be
 * done via the PMTIMER as well, which seems to be the last remaining timer
 * on X86/INTEL platforms that has not been completely wrecked by feature
 * creep.
+ *
+ * In theory HPET support should be removed altogether, but there are older
+ * systems out there which depend on it because TSC and APIC timer are
+ * dysfunctional in deeper C-states.
+ *
 * It's only been 20 years now that hardware people have been asked to
 * provide reliable and discoverable facilities which can be used for
 * timekeeping and per-CPU timer interrupts.
+ *
+ * The probability that this problem is going to be solved in the
 * foreseeable future is close to zero, so the kernel has to be cluttered
 * with heuristics to keep up with the ever-growing number of hardware and
+ * firmware trainwrecks. Hopefully some day hardware people will understand
+ * that the approach of "This can be fixed in software" is not sustainable.
+ * Hope dies last...
+ */
+static bool __init hpet_is_pc10_damaged(void)
+{
+ unsigned long long pcfg;
+
+ /* Check whether PC10 substates are supported */
+ if (!mwait_pc10_supported())
+ return false;
+
+ /* Check whether PC10 is enabled in PKG C-state limit */
+ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+ if ((pcfg & 0xF) < 8)
+ return false;
+
+ if (hpet_force_user) {
+ pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
+ return false;
+ }
+
+ pr_info("HPET dysfunctional in PC10. Force disabled.\n");
+ boot_hpet_disable = true;
+ return true;
+}
+
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
@@ -929,6 +1007,9 @@ int __init hpet_enable(void)
if (!is_hpet_capable())
return 0;
+ if (hpet_is_pc10_damaged())
+ return 0;
+
hpet_set_mapping();
if (!hpet_virt_address)
return 0;
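The (0xF << 28) test above looks at the highest 4-bit field of CPUID leaf 0x5 EDX, i.e. the sub-state count for the deepest MWAIT C-state, which is where PC10 support is advertised. A hedged helper illustrating the decode (field layout assumed from the CPUID.05H definition, four bits per C-state):

/*
 * Illustrative sketch: CPUID.05H:EDX reports, in 4-bit fields, how many
 * MWAIT sub-states exist for each C-state (field 0 = C0, ..., field 7 =
 * the deepest C-state). hpet_is_pc10_damaged() only cares whether the
 * deepest field is non-zero.
 */
static unsigned int mwait_substates_for(unsigned int edx, unsigned int cstate_field)
{
	return (edx >> (cstate_field * 4)) & 0xF;
}

With that helper, the check in mwait_pc10_supported() reads as mwait_substates_for(mwait_substates, 7) != 0.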
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e28f6a5d14f1..766ffe3ba313 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -291,8 +291,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
if (handler)
kvm_posted_intr_wakeup_handler = handler;
- else
+ else {
kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
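The added synchronize_rcu() closes a race when the wakeup handler is being torn down: the write switches new interrupts to dummy_handler, and the grace period guarantees that any CPU which was already running the old handler from interrupt context has finished before the caller (typically the module unload path) proceeds. A sketch of the pattern, with illustrative names rather than the exact KVM code:

/* Illustrative teardown ordering, not the exact KVM/VMX code. */
static void (*wakeup_handler)(void) = dummy_handler;

static void unregister_wakeup_handler(void)
{
	WRITE_ONCE(wakeup_handler, dummy_handler);
	/*
	 * Wait until every CPU has passed a quiescent state, so no CPU can
	 * still be executing the old handler once this returns.
	 */
	synchronize_rcu();
}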
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 044902d5a3c4..e5dd6da78713 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
}
+#ifndef CONFIG_PREEMPT_RT
void do_softirq_own_stack(void)
{
struct irq_stack *irqstk;
@@ -148,6 +149,7 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
+#endif
void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 8ef35063964b..760e1f293093 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -7,9 +7,11 @@
/*
* unsigned long native_save_fl(void)
*/
+.pushsection .noinstr.text, "ax"
SYM_FUNC_START(native_save_fl)
pushf
pop %_ASM_AX
ret
SYM_FUNC_END(native_save_fl)
+.popsection
EXPORT_SYMBOL(native_save_fl)
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 1afbdd1dd777..9ff480e94511 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu)
* of the priority chain and only used when
* all other high priority cpus are out of capacity.
*/
- smt_prio = prio * smp_num_siblings / i;
+ smt_prio = prio * smp_num_siblings / (i * i);
per_cpu(sched_core_priority, cpu) = smt_prio;
i++;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index b6e046e4b289..fce99e249d61 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -809,7 +809,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
ri->fp = sara;
/* Replace the return addr with trampoline addr */
- *sara = (unsigned long) &kretprobe_trampoline;
+ *sara = (unsigned long) &__kretprobe_trampoline;
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);
@@ -1019,52 +1019,91 @@ NOKPROBE_SYMBOL(kprobe_int3_handler);
*/
asm(
".text\n"
- ".global kretprobe_trampoline\n"
- ".type kretprobe_trampoline, @function\n"
- "kretprobe_trampoline:\n"
- /* We don't bother saving the ss register */
+ ".global __kretprobe_trampoline\n"
+ ".type __kretprobe_trampoline, @function\n"
+ "__kretprobe_trampoline:\n"
#ifdef CONFIG_X86_64
+ /* Push a fake return address to tell the unwinder it's a kretprobe. */
+ " pushq $__kretprobe_trampoline\n"
+ UNWIND_HINT_FUNC
+ /* Save the 'sp - 8', this will be fixed later. */
" pushq %rsp\n"
" pushfq\n"
SAVE_REGS_STRING
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movq %rax, 19*8(%rsp)\n"
RESTORE_REGS_STRING
+ /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */
+ " addq $8, %rsp\n"
" popfq\n"
#else
+ /* Push a fake return address to tell the unwinder it's a kretprobe. */
+ " pushl $__kretprobe_trampoline\n"
+ UNWIND_HINT_FUNC
+ /* Save the 'sp - 4', this will be fixed later. */
" pushl %esp\n"
" pushfl\n"
SAVE_REGS_STRING
" movl %esp, %eax\n"
" call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movl %eax, 15*4(%esp)\n"
RESTORE_REGS_STRING
+ /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */
+ " addl $4, %esp\n"
" popfl\n"
#endif
" ret\n"
- ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
+ ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"
);
-NOKPROBE_SYMBOL(kretprobe_trampoline);
-STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
+NOKPROBE_SYMBOL(__kretprobe_trampoline);
+/*
+ * __kretprobe_trampoline() skips updating the frame pointer. The frame pointer
+ * saved in trampoline_handler() points to the real caller function's
+ * frame pointer. Thus __kretprobe_trampoline() doesn't have a standard
+ * stack frame with CONFIG_FRAME_POINTER=y.
+ * Let's mark it as a non-standard function. Anyway, the FP unwinder can
+ * correctly unwind without the hint.
+ */
+STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline);
+
+/* This is called from kretprobe_trampoline_handler(). */
+void arch_kretprobe_fixup_return(struct pt_regs *regs,
+ kprobe_opcode_t *correct_ret_addr)
+{
+ unsigned long *frame_pointer = &regs->sp + 1;
+ /* Replace fake return address with real one. */
+ *frame_pointer = (unsigned long)correct_ret_addr;
+}
/*
- * Called from kretprobe_trampoline
+ * Called from __kretprobe_trampoline
*/
-__used __visible void *trampoline_handler(struct pt_regs *regs)
+__used __visible void trampoline_handler(struct pt_regs *regs)
{
+ unsigned long *frame_pointer;
+
/* fixup registers */
regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
regs->gs = 0;
#endif
- regs->ip = (unsigned long)&kretprobe_trampoline;
+ regs->ip = (unsigned long)&__kretprobe_trampoline;
regs->orig_ax = ~0UL;
+ regs->sp += sizeof(long);
+ frame_pointer = &regs->sp + 1;
- return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, &regs->sp);
+ /*
+ * The return address at 'frame_pointer' is recovered by the
+ * arch_kretprobe_fixup_return() which is called from the
+ * kretprobe_trampoline_handler().
+ */
+ kretprobe_trampoline_handler(regs, frame_pointer);
+
+ /*
+ * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trampoline()
+ * can do RET right after POPF.
+ */
+ regs->sp = regs->flags;
}
NOKPROBE_SYMBOL(trampoline_handler);
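For context, this trampoline is what every kretprobe return path runs through. A minimal module using the standard kprobes API (not part of this patch) that exercises it:

#include <linux/kprobes.h>
#include <linux/module.h>

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* On x86 the probed function's return value is in regs->ax. */
	pr_info("kernel_clone() returned %ld\n", (long)regs_return_value(regs));
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler	= my_ret_handler,
	.kp.symbol_name	= "kernel_clone",	/* example target */
	.maxactive	= 16,
};

static int __init rp_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit rp_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(rp_init);
module_exit(rp_exit);
MODULE_LICENSE("GPL");

When the probed function returns, control transfers to __kretprobe_trampoline(), which saves registers, calls trampoline_handler(), and then returns to the real caller via the address restored by arch_kretprobe_fixup_return().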
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 596de2f6d3a5..dd2ec14adb77 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -25,7 +25,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- preempt_disable_notrace();
p = get_kprobe((kprobe_opcode_t *)ip);
if (unlikely(!p) || kprobe_disabled(p))
goto out;
@@ -59,7 +58,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
__this_cpu_write(current_kprobe, NULL);
}
out:
- preempt_enable_notrace();
ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 71425ebba98a..b4a54a52aa59 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -367,10 +367,10 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
/* Check the addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
- unsigned long addr)
+ kprobe_opcode_t *addr)
{
- return ((unsigned long)op->kp.addr <= addr &&
- (unsigned long)op->kp.addr + op->optinsn.size > addr);
+ return (op->kp.addr <= addr &&
+ op->kp.addr + op->optinsn.size > addr);
}
/* Free optimized instruction slot */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b656456c3a94..8863d1941f1b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
#include <linux/nmi.h>
#include <linux/swait.h>
#include <linux/syscore_ops.h>
+#include <linux/cc_platform.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -418,7 +419,7 @@ static void __init sev_map_percpu_data(void)
{
int cpu;
- if (!sev_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
for_each_possible_cpu(cpu) {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ad273e5861c1..462dd8e9b03d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -16,9 +16,9 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/set_memory.h>
+#include <linux/cc_platform.h>
#include <asm/hypervisor.h>
-#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/kvmclock.h>
@@ -49,18 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
static struct pvclock_vsyscall_time_info
hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
static struct pvclock_wall_clock wall_clock __bss_decrypted;
-static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
static struct pvclock_vsyscall_time_info *hvclock_mem;
-
-static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
-{
- return &this_cpu_read(hv_clock_per_cpu)->pvti;
-}
-
-static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
-{
- return this_cpu_read(hv_clock_per_cpu);
-}
+DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
/*
* The wallclock is the time of day when we booted. Since then, some time may
@@ -232,7 +223,7 @@ static void __init kvmclock_init_mem(void)
* hvclock is shared between the guest and the hypervisor, must
* be mapped decrypted.
*/
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
r = set_memory_decrypted((unsigned long) hvclock_mem,
1UL << order);
if (r) {
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 131f30fdcfbd..f5da4a18070a 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>
+#include <linux/cc_platform.h>
#include <asm/init.h>
#include <asm/tlbflush.h>
@@ -166,7 +167,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
}
pte = pte_offset_kernel(pmd, vaddr);
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
prot = PAGE_KERNEL_EXEC;
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
@@ -206,7 +207,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
info.kernpg_flag |= _PAGE_ENC;
}
@@ -358,7 +359,7 @@ void machine_kexec(struct kimage *image)
(unsigned long)page_list,
image->start,
image->preserve_context,
- sme_active());
+ cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -569,12 +570,12 @@ void arch_kexec_unprotect_crashkres(void)
*/
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
- if (sev_active())
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return 0;
/*
- * If SME is active we need to be sure that kexec pages are
- * not encrypted because when we boot to the new kernel the
+ * If host memory encryption is active we need to be sure that kexec
+ * pages are not encrypted because when we boot to the new kernel the
* pages won't be accessed encrypted (initially).
*/
return set_memory_decrypted((unsigned long)vaddr, pages);
@@ -582,12 +583,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
- if (sev_active())
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return;
/*
- * If SME is active we need to reset the pages back to being
- * an encrypted mapping before freeing them.
+ * If host memory encryption is active we need to reset the pages back
+ * to being an encrypted mapping before freeing them.
*/
set_memory_encrypted((unsigned long)vaddr, pages);
}
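These conversions follow one pattern across the series: the SME/SEV specific helpers are replaced by confidential-computing attribute queries, roughly mem_encrypt_active() -> CC_ATTR_MEM_ENCRYPT, sme_active() -> CC_ATTR_HOST_MEM_ENCRYPT, sev_active() -> CC_ATTR_GUEST_MEM_ENCRYPT and sev_es_active() -> CC_ATTR_GUEST_STATE_ENCRYPT. A sketch of the resulting call pattern in generic code:

#include <linux/cc_platform.h>

static bool need_decrypted_mapping(void)
{
	/*
	 * True for SEV guests (guest memory encrypted) as well as for
	 * bare-metal SME hosts (host memory encrypted); pages shared with
	 * a device or hypervisor then have to be mapped decrypted.
	 */
	return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
	       cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
}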
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5e9a34b5bd74..169fb6f4cd2e 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -251,7 +251,8 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL;
+ *para = NULL, *orc = NULL, *orc_ip = NULL,
+ *retpolines = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -267,8 +268,14 @@ int module_finalize(const Elf_Ehdr *hdr,
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
orc_ip = s;
+ if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
+ retpolines = s;
}
+ if (retpolines) {
+ void *rseg = (void *)retpolines->sh_addr;
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
+ }
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 04cafc057bed..ebc45360ffd4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -218,6 +218,36 @@ void paravirt_end_context_switch(struct task_struct *next)
if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
arch_enter_lazy_mmu_mode();
}
+
+static noinstr unsigned long pv_native_read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static noinstr void pv_native_write_cr2(unsigned long val)
+{
+ native_write_cr2(val);
+}
+
+static noinstr unsigned long pv_native_get_debugreg(int regno)
+{
+ return native_get_debugreg(regno);
+}
+
+static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
+{
+ native_set_debugreg(regno, val);
+}
+
+static noinstr void pv_native_irq_enable(void)
+{
+ native_irq_enable();
+}
+
+static noinstr void pv_native_irq_disable(void)
+{
+ native_irq_disable();
+}
#endif
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
@@ -244,8 +274,8 @@ struct paravirt_patch_template pv_ops = {
#ifdef CONFIG_PARAVIRT_XXL
.cpu.cpuid = native_cpuid,
- .cpu.get_debugreg = native_get_debugreg,
- .cpu.set_debugreg = native_set_debugreg,
+ .cpu.get_debugreg = pv_native_get_debugreg,
+ .cpu.set_debugreg = pv_native_set_debugreg,
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
@@ -281,8 +311,8 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
- .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
+ .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
+ .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
.irq.safe_halt = native_safe_halt,
.irq.halt = native_halt,
#endif /* CONFIG_PARAVIRT_XXL */
@@ -298,8 +328,8 @@ struct paravirt_patch_template pv_ops = {
.mmu.exit_mmap = paravirt_nop,
#ifdef CONFIG_PARAVIRT_XXL
- .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2),
- .mmu.write_cr2 = native_write_cr2,
+ .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
+ .mmu.write_cr2 = pv_native_write_cr2,
.mmu.read_cr3 = __native_read_cr3,
.mmu.write_cr3 = native_write_cr3,
@@ -371,9 +401,6 @@ struct paravirt_patch_template pv_ops = {
};
#ifdef CONFIG_PARAVIRT_XXL
-/* At this point, native_get/set_debugreg has real function entries */
-NOKPROBE_SYMBOL(native_get_debugreg);
-NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
void (*paravirt_iret)(void) = native_iret;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index c2cfa5e7c152..814ab46a0dad 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,7 +6,7 @@
#include <linux/swiotlb.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <asm/iommu.h>
#include <asm/swiotlb.h>
@@ -45,11 +45,10 @@ int __init pci_swiotlb_detect_4gb(void)
swiotlb = 1;
/*
- * If SME is active then swiotlb will be set to 1 so that bounce
- * buffers are allocated and used for devices that do not support
- * the addressing range required for the encryption mask.
+ * Set swiotlb to 1 so that bounce buffers are allocated and used for
+ * devices that can't support DMA to encrypted memory.
*/
- if (sme_active())
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
swiotlb = 1;
return swiotlb;
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 9e1def3744f2..36e84d904260 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -80,7 +80,7 @@ static struct resource video_rom_resource = {
*/
static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
{
- struct pci_driver *drv = pdev->driver;
+ struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
const struct pci_device_id *id;
if (pdev->vendor == vendor && pdev->device == device)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d9463e3096b..e9ee8b526319 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,7 +30,9 @@
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/sched.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
@@ -43,6 +45,7 @@
#include <asm/io_bitmap.h>
#include <asm/proto.h>
#include <asm/frame.h>
+#include <asm/unwind.h>
#include "process.h"
@@ -87,9 +90,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
- return fpu_clone(dst);
+ /* Drop the copied pointer to current's fpstate */
+ dst->thread.fpu.fpstate = NULL;
+
+ return 0;
}
+#ifdef CONFIG_X86_64
+void arch_release_task_struct(struct task_struct *tsk)
+{
+ if (fpu_state_size_dynamic())
+ fpstate_free(&tsk->thread.fpu);
+}
+#endif
+
/*
* Free thread data structures etc..
*/
@@ -132,6 +146,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
@@ -154,6 +169,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->flags = X86_EFLAGS_FIXED;
#endif
+ fpu_clone(p, clone_flags);
+
/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
p->thread.pkru = pkru_get_init_value();
@@ -942,70 +959,36 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
* because the task might wake up and we might look at a stack
* changing under us.
*/
-unsigned long get_wchan(struct task_struct *p)
+unsigned long __get_wchan(struct task_struct *p)
{
- unsigned long start, bottom, top, sp, fp, ip, ret = 0;
- int count = 0;
-
- if (p == current || task_is_running(p))
- return 0;
-
- if (!try_get_task_stack(p))
- return 0;
-
- start = (unsigned long)task_stack_page(p);
- if (!start)
- goto out;
+ struct unwind_state state;
+ unsigned long addr = 0;
- /*
- * Layout of the stack page:
- *
- * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
- * PADDING
- * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
- * stack
- * ----------- bottom = start
- *
- * The tasks stack pointer points at the location where the
- * framepointer is stored. The data on the stack is:
- * ... IP FP ... IP FP
- *
- * We need to read FP and IP, so we need to adjust the upper
- * bound by another unsigned long.
- */
- top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
- top -= 2 * sizeof(unsigned long);
- bottom = start;
-
- sp = READ_ONCE(p->thread.sp);
- if (sp < bottom || sp > top)
- goto out;
-
- fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
- do {
- if (fp < bottom || fp > top)
- goto out;
- ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
- if (!in_sched_functions(ip)) {
- ret = ip;
- goto out;
- }
- fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
- } while (count++ < 16 && !task_is_running(p));
+ for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+ if (in_sched_functions(addr))
+ continue;
+ break;
+ }
-out:
- put_task_stack(p);
- return ret;
+ return addr;
}
long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long cpuid_enabled)
+ unsigned long arg2)
{
switch (option) {
case ARCH_GET_CPUID:
return get_cpuid_mode();
case ARCH_SET_CPUID:
- return set_cpuid_mode(task, cpuid_enabled);
+ return set_cpuid_mode(task, arg2);
+ case ARCH_GET_XCOMP_SUPP:
+ case ARCH_GET_XCOMP_PERM:
+ case ARCH_REQ_XCOMP_PERM:
+ return fpu_xstate_prctl(task, option, arg2);
}
return -EINVAL;
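The new ARCH_*_XCOMP_* options are the userspace entry point for dynamically enabled xstate features such as AMX tile data. A hedged userspace sketch of requesting permission before using such a feature; the numeric values are assumptions taken from the uapi headers of this series (asm/prctl.h and the XFEATURE numbering):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define ARCH_GET_XCOMP_SUPP	0x1021	/* assumed value */
#define ARCH_REQ_XCOMP_PERM	0x1023	/* assumed value */
#define XFEATURE_XTILEDATA	18	/* assumed AMX tile data bit */

int main(void)
{
	unsigned long long supported = 0;

	if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supported))
		return 1;

	if (!(supported & (1ULL << XFEATURE_XTILEDATA)))
		return 1;	/* AMX tile data not supported here */

	/* Ask the kernel to permit the dynamically sized feature. */
	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA))
		return 1;

	printf("AMX tile data permission granted\n");
	return 0;
}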
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4f2f54e1281c..26edb1cd07a4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -41,7 +41,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/sched.h>
#include <asm/desc.h>
#include <linux/err.h>
@@ -160,7 +160,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -213,7 +212,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish();
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ec0d836a13b1..3402edec236c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -42,7 +42,7 @@
#include <asm/processor.h>
#include <asm/pkru.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
@@ -559,7 +559,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
@@ -620,7 +619,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish();
/* Reload sp0. */
update_task_stack(next_p);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 4c208ea3bd9f..6d2244c94799 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -29,9 +29,9 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c53271aebb64..c8fe74a28143 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -47,7 +47,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
* %rsi page_list
* %rdx start address
* %rcx preserve_context
- * %r8 sme_active
+ * %r8 host_mem_enc_active
*/
/* Save the CPU context, used for jumping back */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 79f164141116..49b596db5631 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -322,7 +322,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+ memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
@@ -521,7 +521,7 @@ static void __init reserve_crashkernel(void)
}
if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
- memblock_free(crash_base, crash_size);
+ memblock_phys_free(crash_base, crash_size);
return;
}
@@ -830,6 +830,20 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
+ /*
+ * Do some memory reservations *before* memory is added to memblock, so
+ * memblock allocations won't overwrite it.
+ *
+ * After this point, everything still needed from the boot loader or
+ * firmware or kernel text should be early reserved or marked not RAM in
+ * e820. All other memory is fair game.
+ *
+ * This call needs to happen before e820__memory_setup() which calls the
+ * xen_memory_setup() on Xen dom0 which relies on the fact that those
+ * early reservations have happened already.
+ */
+ early_reserve_memory();
+
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
e820__memory_setup();
parse_setup_data();
@@ -876,18 +890,6 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
- /*
- * Do some memory reservations *before* memory is added to
- * memblock, so memblock allocations won't overwrite it.
- * Do it after early param, so we could get (unlikely) panic from
- * serial.
- *
- * After this point everything still needed from the boot loader or
- * firmware or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
- early_reserve_memory();
-
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5afd98559193..7b65275544b2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
- memblock_free_ptr(ptr, size);
+ memblock_free(ptr, size);
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 9f90f460a28c..787dc5f568b5 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -64,7 +64,7 @@ static bool sev_es_negotiate_protocol(void)
static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
{
ghcb->save.sw_exit_code = 0;
- memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+ __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
static bool vc_decoding_needed(unsigned long exit_code)
@@ -94,25 +94,15 @@ static void vc_finish_insn(struct es_em_ctxt *ctxt)
ctxt->regs->ip += ctxt->insn.length;
}
-static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- u64 exit_code, u64 exit_info_1,
- u64 exit_info_2)
+static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
- enum es_result ret;
-
- /* Fill in protocol and format specifiers */
- ghcb->protocol_version = GHCB_PROTOCOL_MAX;
- ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
-
- ghcb_set_sw_exit_code(ghcb, exit_code);
- ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
- ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+ u32 ret;
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
+ ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
+ if (!ret)
+ return ES_OK;
- if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) {
+ if (ret == 1) {
u64 info = ghcb->save.sw_exit_info_2;
unsigned long v;
@@ -124,17 +114,40 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
ctxt->fi.vector = v;
+
if (info & SVM_EVTINJ_VALID_ERR)
ctxt->fi.error_code = info >> 32;
- ret = ES_EXCEPTION;
- } else {
- ret = ES_VMM_ERROR;
+
+ return ES_EXCEPTION;
}
- } else {
- ret = ES_OK;
}
- return ret;
+ return ES_VMM_ERROR;
+}
+
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
+ struct es_em_ctxt *ctxt, u64 exit_code,
+ u64 exit_info_1, u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = GHCB_PROTOCOL_MAX;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ /*
+ * Hyper-V unenlightened guests use a paravisor for communicating with the
+ * hypervisor, and GHCB pages are allocated and set up by that paravisor. Linux
+ * should not change the GHCB page's physical address.
+ */
+ if (set_ghcb_msr)
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+
+ VMGEXIT();
+
+ return verify_exception_info(ghcb, ctxt);
}
/*
@@ -411,7 +424,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
*/
sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
ghcb_set_sw_scratch(ghcb, sw_scratch);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO,
exit_info_1, exit_info_2);
if (ret != ES_OK)
return ret;
@@ -453,7 +466,8 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
ghcb_set_rax(ghcb, rax);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt,
+ SVM_EXIT_IOIO, exit_info_1, 0);
if (ret != ES_OK)
return ret;
@@ -484,7 +498,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
/* xgetbv will cause #GP - use reset value for xcr0 */
ghcb_set_xcr0(ghcb, 1);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0);
if (ret != ES_OK)
return ret;
@@ -509,7 +523,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
enum es_result ret;
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0);
if (ret != ES_OK)
return ret;
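The new set_ghcb_msr argument lets callers that run under a paravisor (e.g. Hyper-V isolation VMs) reuse this helper without rewriting the GHCB MSR, since the paravisor owns the GHCB page. A hedged sketch of such a caller; the wrapper name is illustrative and not part of this patch:

/*
 * Illustrative only: a paravisor-managed caller keeps the GHCB page that
 * the paravisor set up and therefore passes set_ghcb_msr == false.
 */
static enum es_result hv_isolated_wbinvd(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
	return sev_es_ghcb_hv_call(ghcb, false, ctxt, SVM_EXIT_WBINVD, 0, 0);
}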
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a6895e440bc3..74f0ec955384 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -11,7 +11,7 @@
#include <linux/sched/debug.h> /* For show_regs() */
#include <linux/percpu-defs.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
@@ -23,7 +23,7 @@
#include <asm/stacktrace.h>
#include <asm/sev.h>
#include <asm/insn-eval.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/xcr.h>
#include <asm/processor.h>
#include <asm/realmode.h>
#include <asm/traps.h>
@@ -46,16 +46,6 @@ static struct ghcb __initdata *boot_ghcb;
struct sev_es_runtime_data {
struct ghcb ghcb_page;
- /* Physical storage for the per-CPU IST stack of the #VC handler */
- char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
- /*
- * Physical storage for the per-CPU fall-back stack of the #VC handler.
- * The fall-back stack is used when it is not safe to switch back to the
- * interrupted stack in the #VC entry code.
- */
- char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
/*
* Reserve one page per CPU as backup storage for the unencrypted GHCB.
* It is needed when an NMI happens while the #VC handler uses the real
@@ -99,27 +89,6 @@ DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
/* Needed in vc_early_forward_exception */
void do_early_exception(struct pt_regs *regs, int trapnr);
-static void __init setup_vc_stacks(int cpu)
-{
- struct sev_es_runtime_data *data;
- struct cpu_entry_area *cea;
- unsigned long vaddr;
- phys_addr_t pa;
-
- data = per_cpu(runtime_data, cpu);
- cea = get_cpu_entry_area(cpu);
-
- /* Map #VC IST stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
- pa = __pa(data->ist_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-
- /* Map VC fall-back stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
- pa = __pa(data->fallback_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-}
-
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -615,7 +584,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
int cpu;
u64 pfn;
- if (!sev_es_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return 0;
pflags = _PAGE_NX | _PAGE_RW;
@@ -648,7 +617,8 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
ghcb_set_rdx(ghcb, regs->dx);
}
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR,
+ exit_info_1, 0);
if ((ret == ES_OK) && (!exit_info_1)) {
regs->ax = ghcb->save.rax;
@@ -774,7 +744,7 @@ void __init sev_es_init_vc_handling(void)
BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
- if (!sev_es_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return;
if (!sev_es_check_cpu_features())
@@ -787,7 +757,6 @@ void __init sev_es_init_vc_handling(void)
for_each_possible_cpu(cpu) {
alloc_runtime_data(cpu);
init_ghcb(cpu);
- setup_vc_stacks(cpu);
}
sev_es_setup_play_dead();
@@ -867,7 +836,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
- return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
+ return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2);
}
static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb,
@@ -1117,7 +1086,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
/* Using a value of 0 for ExitInfo1 means RAX holds the value */
ghcb_set_rax(ghcb, val);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1147,7 +1116,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
- return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
+ return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0);
}
static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
@@ -1156,7 +1125,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt
ghcb_set_rcx(ghcb, ctxt->regs->cx);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1197,7 +1166,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
if (x86_platform.hyper.sev_es_hcall_prepare)
x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1319,13 +1288,26 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
}
}
-static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
+static __always_inline bool is_vc2_stack(unsigned long sp)
{
- unsigned long sp = (unsigned long)regs;
-
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}
+static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
+{
+ unsigned long sp, prev_sp;
+
+ sp = (unsigned long)regs;
+ prev_sp = regs->sp;
+
+ /*
+ * If the code was already executing on the VC2 stack when the #VC
+ * happened, let it proceed to the normal handling routine. This way the
+ * code executing on the VC2 stack can cause #VC exceptions to get handled.
+ */
+ return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
+}
+
static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
{
struct ghcb_state state;
@@ -1406,7 +1388,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
* But keep this here in case the noinstr annotations are violated due
* to bug elsewhere.
*/
- if (unlikely(on_vc_fallback_stack(regs))) {
+ if (unlikely(vc_from_invalid_context(regs))) {
instrumentation_begin();
panic("Can't handle #VC exception from unsupported context\n");
instrumentation_end();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index f4d21e470083..ec71e06ae364 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/tracehook.h>
@@ -30,8 +31,8 @@
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
+#include <asm/fpu/xstate.h>
#include <asm/vdso.h>
#include <asm/mce.h>
#include <asm/sighandling.h>
@@ -41,6 +42,7 @@
#include <linux/compat.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
+#include <asm/fpu/xstate.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -79,9 +81,9 @@ static void force_valid_ss(struct pt_regs *regs)
# define CONTEXT_COPY_SIZE sizeof(struct sigcontext)
#endif
-static int restore_sigcontext(struct pt_regs *regs,
- struct sigcontext __user *usc,
- unsigned long uc_flags)
+static bool restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *usc,
+ unsigned long uc_flags)
{
struct sigcontext sc;
@@ -89,7 +91,7 @@ static int restore_sigcontext(struct pt_regs *regs,
current->restart_block.fn = do_no_restart_syscall;
if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE))
- return -EFAULT;
+ return false;
#ifdef CONFIG_X86_32
set_user_gs(regs, sc.gs);
@@ -244,7 +246,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
- int ret;
/* redzone */
if (IS_ENABLED(CONFIG_X86_64))
@@ -292,8 +293,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
}
/* save i387 and extended state */
- ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
- if (ret < 0)
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
return (void __user *)-1L;
return (void __user *)sp;
@@ -643,7 +643,7 @@ SYSCALL_DEFINE0(sigreturn)
* x86_32 has no uc_flags bits relevant to restore_sigcontext.
* Save a few cycles by skipping the __get_user.
*/
- if (restore_sigcontext(regs, &frame->sc, 0))
+ if (!restore_sigcontext(regs, &frame->sc, 0))
goto badframe;
return regs->ax;
@@ -671,7 +671,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
if (restore_altstack(&frame->uc.uc_stack))
@@ -721,12 +721,15 @@ badframe:
/* max_frame_size tells userspace the worst case signal stack size. */
static unsigned long __ro_after_init max_frame_size;
+static unsigned int __ro_after_init fpu_default_state_size;
void __init init_sigframe_size(void)
{
+ fpu_default_state_size = fpu__get_fpstate_size();
+
max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
- max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING;
+ max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING;
/* Userspace expects an aligned size. */
max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
@@ -910,6 +913,62 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
force_sig(SIGSEGV);
}
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE
+static bool strict_sigaltstack_size __ro_after_init = true;
+#else
+static bool strict_sigaltstack_size __ro_after_init = false;
+#endif
+
+static int __init strict_sas_size(char *arg)
+{
+ return kstrtobool(arg, &strict_sigaltstack_size);
+}
+__setup("strict_sas_size", strict_sas_size);
+
+/*
+ * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
+ * exceeds that size already. As such programs might never actually use the
+ * sigaltstack, they just continued to work. While always checking against
+ * the real size would be correct, this might be considered a regression.
+ *
+ * Therefore avoid the sanity check, unless enforced by kernel
+ * configuration or command line option.
+ *
+ * When dynamic FPU features are supported, the check is also enforced when
+ * the task has permissions to use dynamic features. Tasks which have no
+ * permission are checked against the size of the non-dynamic feature set
+ * if strict checking is enabled. This avoids forcing all tasks on the
+ * system to allocate large sigaltstacks even if they are never going
+ * to use a dynamic feature. As this is serialized via sighand::siglock
+ * any permission request for a dynamic feature either happened already
+ * or will see the newly installed sigaltstack size in the permission checks.
+ */
+bool sigaltstack_size_valid(size_t ss_size)
+{
+ unsigned long fsize = max_frame_size - fpu_default_state_size;
+ u64 mask;
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
+ return true;
+
+ fsize += current->group_leader->thread.fpu.perm.__user_state_size;
+ if (likely(ss_size > fsize))
+ return true;
+
+ if (strict_sigaltstack_size)
+ return ss_size > fsize;
+
+ mask = current->group_leader->thread.fpu.perm.__state_perm;
+ if (mask & XFEATURE_MASK_USER_DYNAMIC)
+ return ss_size > fsize;
+
+ return true;
+}
+#endif /* CONFIG_DYNAMIC_SIGFRAME */
+
#ifdef CONFIG_X86_X32_ABI
COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
{
@@ -929,7 +988,7 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
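With dynamically sized xstate the historic MINSIGSTKSZ constant can be far too small, which is exactly what sigaltstack_size_valid() guards against. A userspace sketch of sizing an alternate signal stack from the kernel-reported minimum rather than the constant (AT_MINSIGSTKSZ needs a reasonably recent libc/elf.h):

#include <signal.h>
#include <stdlib.h>
#include <sys/auxv.h>

static int setup_altstack(void)
{
	/* The auxval covers the current FPU/xstate signal frame size. */
	size_t min = getauxval(AT_MINSIGSTKSZ);
	stack_t ss = { 0 };

	if (min < MINSIGSTKSZ)
		min = MINSIGSTKSZ;

	ss.ss_size = 4 * min;		/* headroom beyond the bare minimum */
	ss.ss_sp = malloc(ss.ss_size);
	if (!ss.ss_sp)
		return -1;

	return sigaltstack(&ss, NULL);
}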
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85f6e242b6b4..8241927addff 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -70,7 +70,7 @@
#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
@@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
/* Per CPU bogomips and other parameters */
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ /* If the arch didn't set up l2c_id, fall back to SMT */
+ if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+ return match_smt(c, o);
+
+ /* Do not match if L2 cache id does not match: */
+ if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+ return false;
+
+ return topology_sane(c, o, "l2c");
+}
+
/*
* Unlike the other levels, we do not enforce keeping a
* multicore group inside a NUMA node. If this happens, we will
@@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -541,12 +558,21 @@ static int x86_smt_flags(void)
return cpu_smt_flags() | x86_sched_itmt_flags();
}
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+ return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
#endif
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
@@ -557,6 +583,9 @@ static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
@@ -584,6 +613,7 @@ void set_cpu_sibling_map(int cpu)
if (!has_mp) {
cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
c->booted_cores = 1;
@@ -602,6 +632,9 @@ void set_cpu_sibling_map(int cpu)
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);
+ if ((i == cpu) || (has_mp && match_l2c(c, o)))
+ link_mask(cpu_l2c_shared_mask, cpu, i);
+
if ((i == cpu) || (has_mp && match_die(c, o)))
link_mask(topology_die_cpumask, cpu, i);
}
@@ -652,6 +685,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
return cpu_llc_shared_mask(cpu);
}
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+ return cpu_l2c_shared_mask(cpu);
+}
+
static void impress_friends(void)
{
int cpu;
@@ -1335,6 +1373,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
}
/*
@@ -1564,7 +1603,10 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(cpu_l2c_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
cpumask_clear(topology_die_cpumask(cpu));
@@ -2166,7 +2208,7 @@ DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_scale_freq_tick(void)
{
- u64 freq_scale = SCHED_CAPACITY_SCALE;
+ u64 freq_scale;
u64 aperf, mperf;
u64 acnt, mcnt;
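The new CLS level groups CPUs that share an L2 cache ("clusters"). The matching core-topology changes also export this grouping through sysfs; a small C sketch for reading it (the cluster_cpus_list attribute name is assumed from that series, older kernels simply won't have the file):

#include <stdio.h>

/* Print the CPUs sharing an L2 cluster with the given CPU, if exported. */
static void print_cluster_siblings(int cpu)
{
	char path[128], buf[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/topology/cluster_cpus_list", cpu);

	f = fopen(path, "r");
	if (!f)
		return;		/* no cluster information exported */

	if (fgets(buf, sizeof(buf), f))
		printf("cpu%d cluster siblings: %s", cpu, buf);

	fclose(f);
}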
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
index 6b73b6f92ad3..8322e8352777 100644
--- a/arch/x86/kernel/trace.c
+++ b/arch/x86/kernel/trace.c
@@ -231,4 +231,4 @@ void osnoise_arch_unregister(void)
unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
}
-#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */
+#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..c9d566dcf89a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -48,7 +48,7 @@
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
@@ -313,17 +313,19 @@ out:
}
#ifdef CONFIG_VMAP_STACK
-__visible void __noreturn handle_stack_overflow(const char *message,
- struct pt_regs *regs,
- unsigned long fault_address)
+__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info)
{
- printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
- (void *)fault_address, current->stack,
- (char *)current->stack + THREAD_SIZE - 1);
- die(message, regs, 0);
+ const char *name = stack_type_name(info->type);
+
+ printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
+ name, (void *)fault_address, info->begin, info->end);
+
+ die("stack guard page", regs, 0);
/* Be absolutely certain we don't return. */
- panic("%s", message);
+ panic("%s stack guard hit", name);
}
#endif
@@ -353,6 +355,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
#ifdef CONFIG_VMAP_STACK
unsigned long address = read_cr2();
+ struct stack_info info;
#endif
#ifdef CONFIG_X86_ESPFIX64
@@ -455,10 +458,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
* stack even if the actual trigger for the double fault was
* something else.
*/
- if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
- handle_stack_overflow("kernel stack overflow (double-fault)",
- regs, address);
- }
+ if (get_stack_guard_info((void *)address, &info))
+ handle_stack_overflow(regs, address, &info);
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
@@ -528,6 +529,36 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
#define GPFSTR "general protection fault"
+static bool fixup_iopl_exception(struct pt_regs *regs)
+{
+ struct thread_struct *t = &current->thread;
+ unsigned char byte;
+ unsigned long ip;
+
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
+ return false;
+
+ if (insn_get_effective_ip(regs, &ip))
+ return false;
+
+ if (get_user(byte, (const char __user *)ip))
+ return false;
+
+ if (byte != 0xfa && byte != 0xfb)
+ return false;
+
+ if (!t->iopl_warn && printk_ratelimit()) {
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
+ current->comm, task_pid_nr(current), ip);
+ print_vma_addr(KERN_CONT " in ", ip);
+ pr_cont("\n");
+ t->iopl_warn = 1;
+ }
+
+ regs->ip += 1;
+ return true;
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
@@ -553,6 +584,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
tsk = current;
if (user_mode(regs)) {
+ if (fixup_iopl_exception(regs))
+ goto exit;
+
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -709,7 +743,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
stack = (unsigned long *)sp;
if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
- info.type >= STACK_TYPE_EXCEPTION_LAST)
+ info.type > STACK_TYPE_EXCEPTION_LAST)
sp = __this_cpu_ist_top_va(VC2);
sync:
@@ -1108,10 +1142,48 @@ DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
*/
}
+static bool handle_xfd_event(struct pt_regs *regs)
+{
+ u64 xfd_err;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD))
+ return false;
+
+ rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
+ if (!xfd_err)
+ return false;
+
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+
+ /* Die if that happens in kernel space */
+ if (WARN_ON(!user_mode(regs)))
+ return false;
+
+ local_irq_enable();
+
+ err = xfd_enable_feature(xfd_err);
+
+ switch (err) {
+ case -EPERM:
+ force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs));
+ break;
+ case -EFAULT:
+ force_sig(SIGSEGV);
+ break;
+ }
+
+ local_irq_disable();
+ return true;
+}
+
DEFINE_IDTENTRY(exc_device_not_available)
{
unsigned long cr0 = read_cr0();
+ if (handle_xfd_event(regs))
+ return;
+
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
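Among the traps.c changes, fixup_iopl_exception() is the one visible to legacy userspace: with iopl emulation enabled, a CLI or STI from user mode is now skipped with a one-time rate-limited warning instead of delivering SIGSEGV. A tiny userspace sketch of the affected pattern (needs CAP_SYS_RAWIO):

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (iopl(3)) {
		perror("iopl");
		return 1;
	}

	/*
	 * Under X86_IOPL_IOPERM emulation these trap with a #GP; the kernel
	 * now warns once and skips the instruction rather than killing the
	 * task.
	 */
	asm volatile("cli");
	asm volatile("sti");

	puts("survived CLI/STI under iopl emulation");
	return 0;
}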
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 576b47e7523d..5a4b21389b1d 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -92,8 +92,8 @@ static const char * const umip_insns[5] = {
#define umip_pr_err(regs, fmt, ...) \
umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
-#define umip_pr_warn(regs, fmt, ...) \
- umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define umip_pr_debug(regs, fmt, ...) \
+ umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__)
/**
* umip_printk() - Print a rate-limited message
@@ -361,10 +361,10 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (umip_inst < 0)
return false;
- umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
+ umip_pr_debug(regs, "%s instruction cannot be used by applications.\n",
umip_insns[umip_inst]);
- umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
+ umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n");
if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
user_64bit_mode(regs)))
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d7c44b257f7f..8e1c50c86e5d 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -240,8 +240,7 @@ static bool update_stack_state(struct unwind_state *state,
else {
addr_p = unwind_get_return_address_ptr(state);
addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
- state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
- addr, addr_p);
+ state->ip = unwind_recover_ret_addr(state, addr, addr_p);
}
/* Save the original stack pointer for unwind_dump(): */
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index c49f10ffd8cd..884d68a6e714 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -15,8 +15,7 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
addr = READ_ONCE_NOCHECK(*state->sp);
- return ftrace_graph_ret_addr(state->task, &state->graph_idx,
- addr, state->sp);
+ return unwind_recover_ret_addr(state, addr, state->sp);
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a1202536fc57..e6f7592790af 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -534,9 +534,8 @@ bool unwind_next_frame(struct unwind_state *state)
if (!deref_stack_reg(state, ip_p, &state->ip))
goto err;
- state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
- state->ip, (void *)ip_p);
-
+ state->ip = unwind_recover_ret_addr(state, state->ip,
+ (unsigned long *)ip_p);
state->sp = sp;
state->regs = NULL;
state->prev_regs = NULL;
@@ -549,7 +548,18 @@ bool unwind_next_frame(struct unwind_state *state)
(void *)orig_ip);
goto err;
}
-
+ /*
+ * There is a small chance of an interrupt hitting right at the entry of
+ * __kretprobe_trampoline(), where no ORC info exists. That point is right
+ * after the RET to __kretprobe_trampoline(), which is the modified return
+ * address. At that point, the @addr_p of unwind_recover_kretprobe()
+ * (which has to point to the address of the stack entry storing the
+ * modified return address) must be "SP - (a stack entry)" because SP was
+ * incremented by the RET.
+ */
+ state->ip = unwind_recover_kretprobe(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
state->regs = (struct pt_regs *)sp;
state->prev_regs = NULL;
state->full_regs = true;
@@ -562,6 +572,9 @@ bool unwind_next_frame(struct unwind_state *state)
(void *)orig_ip);
goto err;
}
+ /* See UNWIND_HINT_TYPE_REGS case comment. */
+ state->ip = unwind_recover_kretprobe(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
if (state->full_regs)
state->prev_regs = state->regs;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index efd9e9ea17f2..3d6dc12d198f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -272,6 +272,20 @@ SECTIONS
__parainstructions_end = .;
}
+#ifdef CONFIG_RETPOLINE
+ /*
+ * List of instructions that call/jmp/jcc to retpoline thunks
+ * __x86_indirect_thunk_*(). These instructions can be patched along
+ * with alternatives, after which the section can be freed.
+ */
+ . = ALIGN(8);
+ .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) {
+ __retpoline_sites = .;
+ *(.retpoline_sites)
+ __retpoline_sites_end = .;
+ }
+#endif
+
/*
* struct alt_inst entries. From the header (alternative.h):
* "Alternative instructions for different CPU types or capabilities"