summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/alternative.c41
-rw-r--r--arch/x86/kernel/apic/io_apic.c5
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/dumpstack.c27
-rw-r--r--arch/x86/kernel/fpu/core.c39
-rw-r--r--arch/x86/kernel/fpu/xstate.c91
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/idt.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c15
-rw-r--r--arch/x86/kernel/kprobes/opt.c38
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/nmi.c9
-rw-r--r--arch/x86/kernel/smpboot.c50
-rw-r--r--arch/x86/kernel/stacktrace.c5
-rw-r--r--arch/x86/kernel/traps.c23
-rw-r--r--arch/x86/kernel/unwind_orc.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
17 files changed, 300 insertions, 66 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8fd39ff74a49..20e07feb4064 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
@@ -53,7 +54,7 @@ __setup("noreplace-smp", setup_noreplace_smp);
#define DPRINTK(fmt, args...) \
do { \
if (debug_alternative) \
- printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
} while (0)
#define DUMP_BYTES(buf, len, fmt, args...) \
@@ -64,7 +65,7 @@ do { \
if (!(len)) \
break; \
\
- printk(KERN_DEBUG fmt, ##args); \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
for (j = 0; j < (len) - 1; j++) \
printk(KERN_CONT "%02hhx ", buf[j]); \
printk(KERN_CONT "%02hhx\n", buf[j]); \
@@ -1001,6 +1002,7 @@ struct text_poke_loc {
s32 rel32;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ u8 old;
};
struct bp_patching_desc {
@@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* First step: add a int3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
+ tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ }
text_poke_sync();
@@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
int len = text_opcode_size(tp[i].opcode);
if (len - INT3_INSN_SIZE > 0) {
+ memcpy(old + INT3_INSN_SIZE,
+ text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
(const char *)tp[i].text + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
do_sync++;
}
+
+ /*
+ * Emit a perf event to record the text poke, primarily to
+ * support Intel PT decoding which must walk the executable code
+ * to reconstruct the trace. The flow up to here is:
+ * - write INT3 byte
+ * - IPI-SYNC
+ * - write instruction tail
+ * At this point the actual control flow will be through the
+ * INT3 and handler and not hit the old or new instruction.
+ * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+ * can still be decoded. Subsequently:
+ * - emit RECORD_TEXT_POKE with the new instruction
+ * - IPI-SYNC
+ * - write first byte
+ * - IPI-SYNC
+ * So before the text poke event timestamp, the decoder will see
+ * either the old instruction flow or FUP/TIP of INT3. After the
+ * text poke event timestamp, the decoder will see either the
+ * new instruction flow or FUP/TIP of INT3. Thus decoders can
+ * use the timestamp as the point at which to modify the
+ * executable code.
+ * The old instruction is recorded so that the event can be
+ * processed forwards or backwards.
+ */
+ perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
+ tp[i].text, len);
}
if (do_sync) {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 81ffcfbfaef2..21325a4a78b9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic)
static void ioapic_destroy_irqdomain(int idx)
{
+ struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+ struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
if (ioapics[idx].irqdomain) {
irq_domain_remove(ioapics[idx].irqdomain);
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
ioapics[idx].irqdomain = NULL;
}
}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 7649da2478d8..dae32d948bf2 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -560,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
* as that can corrupt the affinity move state.
*/
irqd_set_handle_enforce_irqctx(irqd);
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irqd);
+
/*
* Legacy vectors are already assigned when the IOAPIC
* takes them over. They stay on the same vector. This is
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index f9a3526af15d..48ce44576947 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -71,6 +71,22 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
+static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
+ unsigned int nbytes)
+{
+ if (!user_mode(regs))
+ return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);
+
+ /*
+ * Make sure userspace isn't trying to trick us into dumping kernel
+ * memory by pointing the userspace instruction pointer at it.
+ */
+ if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))
+ return -EINVAL;
+
+ return copy_from_user_nmi(buf, (void __user *)src, nbytes);
+}
+
/*
* There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
*
@@ -97,17 +113,8 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)
#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
u8 opcodes[OPCODE_BUFSIZE];
unsigned long prologue = regs->ip - PROLOGUE_SIZE;
- bool bad_ip;
-
- /*
- * Make sure userspace isn't trying to trick us into dumping kernel
- * memory by pointing the userspace instruction pointer at it.
- */
- bad_ip = user_mode(regs) &&
- __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX);
- if (bad_ip || copy_from_kernel_nofault(opcodes, (u8 *)prologue,
- OPCODE_BUFSIZE)) {
+ if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
printk("%sCode: Bad RIP value.\n", loglvl);
} else {
printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 15247b96c6ea..eb86a2b831b1 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -82,6 +82,45 @@ bool irq_fpu_usable(void)
}
EXPORT_SYMBOL(irq_fpu_usable);
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact and we can
+ * keep registers active.
+ *
+ * The legacy FNSAVE instruction cleared all FPU state
+ * unconditionally, so registers are essentially destroyed.
+ * Modern FPU state can be kept in registers, if there are
+ * no pending FP exceptions.
+ */
+int copy_fpregs_to_fpstate(struct fpu *fpu)
+{
+ if (likely(use_xsave())) {
+ copy_xregs_to_kernel(&fpu->state.xsave);
+
+ /*
+ * AVX512 state is tracked here because its use is
+ * known to slow the max clock speed of the core.
+ */
+ if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
+ fpu->avx512_timestamp = jiffies;
+ return 1;
+ }
+
+ if (likely(use_fxsr())) {
+ copy_fxregs_to_kernel(fpu);
+ return 1;
+ }
+
+ /*
+ * Legacy FPU register saving, FNSAVE always clears FPU registers,
+ * so we have to mark them inactive:
+ */
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+
+ return 0;
+}
+EXPORT_SYMBOL(copy_fpregs_to_fpstate);
+
void kernel_fpu_begin(void)
{
preempt_disable();
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index bda2e5eaca0e..be2a68a09d19 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void)
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_dynamic());
+ }
}
static bool xfeature_enabled(enum xfeature xfeature)
@@ -486,7 +488,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
return ebx;
}
-static int xfeature_size(int xfeature_nr)
+int xfeature_size(int xfeature_nr)
{
u32 eax, ebx, ecx, edx;
@@ -598,7 +600,8 @@ static void check_xstate_against_struct(int nr)
*/
if ((nr < XFEATURE_YMM) ||
(nr >= XFEATURE_MAX) ||
- (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
+ (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
+ ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
}
@@ -847,8 +850,10 @@ void fpu__resume_cpu(void)
* Restore IA32_XSS. The same CPUID bit enumerates support
* of XSAVES and MSR_IA32_XSS.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_dynamic());
+ }
}
/*
@@ -1074,7 +1079,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
copy_part(offsetof(struct fxregs_state, st_space), 128,
&xsave->i387.st_space, &kbuf, &offset_start, &count);
if (header.xfeatures & XFEATURE_MASK_SSE)
- copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256,
+ copy_part(xstate_offsets[XFEATURE_SSE], 256,
&xsave->i387.xmm_space, &kbuf, &offset_start, &count);
/*
* Fill xsave->i387.sw_reserved value for ptrace frame:
@@ -1356,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate)
}
}
+/**
+ * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to
+ * an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features saved into the xsave area
+ *
+ * Only the dynamic supervisor states sets in the mask are saved into the xsave
+ * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic
+ * supervisor feature). Besides the dynamic supervisor states, the legacy
+ * region and XSAVE header are also saved into the xsave area. The supervisor
+ * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask)
+{
+ u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+ u32 lmask, hmask;
+ int err;
+
+ if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+ return;
+
+ if (WARN_ON_FPU(!dynamic_mask))
+ return;
+
+ lmask = dynamic_mask;
+ hmask = dynamic_mask >> 32;
+
+ XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
+
+ /* Should never fault when copying to a kernel buffer */
+ WARN_ON_FPU(err);
+}
+
+/**
+ * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from
+ * an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features restored from the xsave area
+ *
+ * Only the dynamic supervisor states sets in the mask are restored from the
+ * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of
+ * dynamic supervisor feature). Besides the dynamic supervisor states, the
+ * legacy region and XSAVE header are also restored from the xsave area. The
+ * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask)
+{
+ u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+ u32 lmask, hmask;
+ int err;
+
+ if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+ return;
+
+ if (WARN_ON_FPU(!dynamic_mask))
+ return;
+
+ lmask = dynamic_mask;
+ hmask = dynamic_mask >> 32;
+
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+
+ /* Should never fault when copying from a kernel buffer */
+ WARN_ON_FPU(err);
+}
+
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* Report the amount of time elapsed in millisecond since last AVX512
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index f3c76252247d..282b4ee1339f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -207,7 +207,7 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
+ printk_deferred(KERN_DEBUG
"spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 0db21206f2f3..7ecf9babf0cb 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -160,7 +160,7 @@ static const __initconst struct idt_data apic_idts[] = {
/* Must be page-aligned because the real IDT is used in the cpu entry area */
static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
-struct desc_ptr idt_descr __ro_after_init = {
+static struct desc_ptr idt_descr __ro_after_init = {
.size = IDT_TABLE_SIZE - 1,
.address = (unsigned long) idt_table,
};
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index ada39ddbc922..fdadc37d72af 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched/debug.h>
+#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
@@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p)
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
+ p->ainsn.tp_len = len;
+ perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
+
/* OK, write back the instruction(s) into ROX insn buffer */
text_poke(p->ainsn.insn, buf, len);
@@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ u8 int3 = INT3_INSN_OPCODE;
+
+ text_poke(p->addr, &int3, 1);
text_poke_sync();
+ perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
void arch_disarm_kprobe(struct kprobe *p)
{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
text_poke_sync();
}
@@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p)
void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
+ /* Record the perf event before freeing the slot */
+ perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
+ p->ainsn.tp_len, NULL, 0);
free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
p->ainsn.insn = NULL;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7af4c61dde52..40f380461e6d 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -6,6 +6,7 @@
* Copyright (C) Hitachi Ltd., 2012
*/
#include <linux/kprobes.h>
+#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op,
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
- if (op->optinsn.insn) {
- free_optinsn_slot(op->optinsn.insn, dirty);
+ u8 *slot = op->optinsn.insn;
+ if (slot) {
+ int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
+
+ /* Record the perf event before freeing the slot */
+ if (dirty)
+ perf_event_text_poke(slot, slot, len, NULL, 0);
+
+ free_optinsn_slot(slot, dirty);
op->optinsn.insn = NULL;
op->optinsn.size = 0;
}
@@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
(u8 *)op->kp.addr + op->optinsn.size);
len += JMP32_INSN_SIZE;
+ /*
+ * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
+ * used in __arch_remove_optimized_kprobe().
+ */
+
/* We have to use text_poke() for instruction buffer because it is RO */
+ perf_event_text_poke(slot, NULL, 0, buf, len);
text_poke(slot, buf, len);
+
ret = 0;
out:
kfree(buf);
@@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist)
*/
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- arch_arm_kprobe(&op->kp);
- text_poke(op->kp.addr + INT3_INSN_SIZE,
- op->optinsn.copied_insn, DISP32_SIZE);
+ u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
+ u8 old[JMP32_INSN_SIZE];
+ u8 *addr = op->kp.addr;
+
+ memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
+ memcpy(new + INT3_INSN_SIZE,
+ op->optinsn.copied_insn,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+
+ text_poke(addr, new, INT3_INSN_SIZE);
text_poke_sync();
+ text_poke(addr + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+ text_poke_sync();
+
+ perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df63786e7bfa..3f78482d9496 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
u32 reason = kvm_read_and_reset_apf_flags();
- bool rcu_exit;
+ idtentry_state_t state;
switch (reason) {
case KVM_PV_REASON_PAGE_NOT_PRESENT:
@@ -243,7 +243,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
return false;
}
- rcu_exit = idtentry_enter_cond_rcu(regs);
+ state = idtentry_enter(regs);
instrumentation_begin();
/*
@@ -264,7 +264,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
}
instrumentation_end();
- idtentry_exit_cond_rcu(regs, rcu_exit);
+ idtentry_exit(regs, state);
return true;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d7c5e44b26f7..4fc9954a9560 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -330,7 +330,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
__this_cpu_write(last_nmi_rip, regs->ip);
instrumentation_begin();
- trace_hardirqs_off_finish();
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
@@ -417,8 +416,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
unknown_nmi_error(reason, regs);
out:
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
instrumentation_end();
}
@@ -478,6 +475,8 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
+ bool irq_state;
+
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return;
@@ -491,14 +490,14 @@ nmi_restart:
this_cpu_write(nmi_dr7, local_db_save());
- nmi_enter();
+ irq_state = idtentry_enter_nmi(regs);
inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
- nmi_exit();
+ idtentry_exit_nmi(regs, irq_state);
local_db_restore(this_cpu_read(nmi_dr7));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ffbd9a3d78d8..518ac6bf752e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
#include <linux/cpuidle.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
+#include <linux/overflow.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -1777,6 +1778,7 @@ void native_play_dead(void)
#endif
+#ifdef CONFIG_X86_64
/*
* APERF/MPERF frequency ratio computation.
*
@@ -1975,6 +1977,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
static bool intel_set_max_freq_ratio(void)
{
u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
goto out;
@@ -2000,15 +2003,23 @@ out:
/*
* Some hypervisors advertise X86_FEATURE_APERFMPERF
* but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
*/
- if (!base_freq) {
- pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
return false;
}
- arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
- base_freq);
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
arch_set_max_freq_ratio(turbo_disabled());
+
return true;
}
@@ -2048,11 +2059,19 @@ static void init_freq_invariance(bool secondary)
}
}
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_scale_freq_tick(void)
{
- u64 freq_scale;
+ u64 freq_scale = SCHED_CAPACITY_SCALE;
u64 aperf, mperf;
u64 acnt, mcnt;
@@ -2064,19 +2083,32 @@ void arch_scale_freq_tick(void)
acnt = aperf - this_cpu_read(arch_prev_aperf);
mcnt = mperf - this_cpu_read(arch_prev_mperf);
- if (!mcnt)
- return;
this_cpu_write(arch_prev_aperf, aperf);
this_cpu_write(arch_prev_mperf, mperf);
- acnt <<= 2*SCHED_CAPACITY_SHIFT;
- mcnt *= arch_max_freq_ratio;
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
+
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
if (freq_scale > SCHED_CAPACITY_SCALE)
freq_scale = SCHED_CAPACITY_SCALE;
this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
}
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6ad43fc44556..2fd698e28e4d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -58,7 +58,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
* or a page fault), which can make frame pointers
* unreliable.
*/
-
if (IS_ENABLED(CONFIG_FRAME_POINTER))
return -EINVAL;
}
@@ -81,10 +80,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
if (unwind_error(&state))
return -EINVAL;
- /* Success path for non-user tasks, i.e. kthreads and idle tasks */
- if (!(task->flags & (PF_KTHREAD | PF_IDLE)))
- return -EINVAL;
-
return 0;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7cb3e0716f7..8493f55e1167 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -245,7 +245,7 @@ static noinstr bool handle_bug(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW(exc_invalid_op)
{
- bool rcu_exit;
+ idtentry_state_t state;
/*
* We use UD2 as a short encoding for 'CALL __WARN', as such
@@ -255,11 +255,11 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op)
if (!user_mode(regs) && handle_bug(regs))
return;
- rcu_exit = idtentry_enter_cond_rcu(regs);
+ state = idtentry_enter(regs);
instrumentation_begin();
handle_invalid_op(regs);
instrumentation_end();
- idtentry_exit_cond_rcu(regs, rcu_exit);
+ idtentry_exit(regs, state);
}
DEFINE_IDTENTRY(exc_coproc_segment_overrun)
@@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
}
#endif
- nmi_enter();
+ idtentry_enter_nmi(regs);
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -651,15 +651,12 @@ DEFINE_IDTENTRY_RAW(exc_int3)
instrumentation_end();
idtentry_exit_user(regs);
} else {
- nmi_enter();
+ bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
- trace_hardirqs_off_finish();
if (!do_int3(regs))
die("int3", regs, 0);
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
instrumentation_end();
- nmi_exit();
+ idtentry_exit_nmi(regs, irq_state);
}
}
@@ -867,9 +864,8 @@ out:
static __always_inline void exc_debug_kernel(struct pt_regs *regs,
unsigned long dr6)
{
- nmi_enter();
+ bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
- trace_hardirqs_off_finish();
/*
* If something gets miswired and we end up here for a user mode
@@ -886,10 +882,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
handle_debug(regs, dr6, false);
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
instrumentation_end();
- nmi_exit();
+ idtentry_exit_nmi(regs, irq_state);
}
static __always_inline void exc_debug_user(struct pt_regs *regs,
@@ -905,6 +899,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
instrumentation_begin();
handle_debug(regs, dr6, true);
+
instrumentation_end();
idtentry_exit_user(regs);
}
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 7f969b2d240f..ec88bbe08a32 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -440,8 +440,11 @@ bool unwind_next_frame(struct unwind_state *state)
/*
* Find the orc_entry associated with the text address.
*
- * Decrement call return addresses by one so they work for sibling
- * calls and calls to noreturn functions.
+ * For a call frame (as opposed to a signal frame), state->ip points to
+ * the instruction after the call. That instruction's stack layout
+ * could be different from the call instruction's layout, for example
+ * if the call was to a noreturn function. So get the ORC data for the
+ * call instruction itself.
*/
orc = orc_find(state->signal ? state->ip : state->ip - 1);
if (!orc) {
@@ -662,6 +665,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
state->sp = task->thread.sp;
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ state->signal = (void *)state->ip == ret_from_fork;
}
if (get_stack_info((unsigned long *)state->sp, state->task,
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3bfc8dd8a43d..9a03e5b23135 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -358,6 +358,7 @@ SECTIONS
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__bss_start = .;
*(.bss..page_aligned)
+ . = ALIGN(PAGE_SIZE);
*(BSS_MAIN)
BSS_DECRYPTED
. = ALIGN(PAGE_SIZE);