Diffstat (limited to 'arch/x86/kernel')
50 files changed, 1281 insertions, 593 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 96d51bbc2bd4..dd61752f4c96 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -45,7 +45,6 @@ obj-y += head$(BITS).o obj-y += ebda.o obj-y += platform-quirks.o obj-y += process_$(BITS).o signal.o signal_$(BITS).o -obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 907cc98b1938..1c38174b5f01 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -188,6 +188,17 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return cpu; } +static bool __init acpi_is_processor_usable(u32 lapic_flags) +{ + if (lapic_flags & ACPI_MADT_ENABLED) + return true; + + if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + return true; + + return false; +} + static int __init acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) { @@ -212,6 +223,10 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) if (apic_id == 0xffffffff) return 0; + /* don't register processors that cannot be onlined */ + if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -250,9 +265,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) return 0; /* don't register processors that can not be onlined */ - if (acpi_support_online_capable && - !(processor->lapic_flags & ACPI_MADT_ENABLED) && - !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + if (!acpi_is_processor_usable(processor->lapic_flags)) return 0; /* @@ -1840,23 +1853,23 @@ early_param("acpi_sci", setup_acpi_sci); int __acpi_acquire_global_lock(unsigned int *lock) { - unsigned int old, new, val; + unsigned int old, new; + + old = READ_ONCE(*lock); do { - old = *lock; new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); - val = cmpxchg(lock, old, new); - } while (unlikely (val != old)); + } while (!try_cmpxchg(lock, &old, new)); return ((new & 0x3) < 3) ? -1 : 0; } int __acpi_release_global_lock(unsigned int *lock) { - unsigned int old, new, val; + unsigned int old, new; + + old = READ_ONCE(*lock); do { - old = *lock; new = old & ~0x3; - val = cmpxchg(lock, old, new); - } while (unlikely (val != old)); + } while (!try_cmpxchg(lock, &old, new)); return old & 0x1; } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7d8c3cbde368..f615e0cb6d93 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -282,27 +282,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; - /* Mask away "NOT" flag bit for feature to test. */ - u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); - BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); + BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); /* * Patch if either: * - feature is present - * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. 
*/ - if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) goto next; DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", - (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", - feature >> 5, - feature & 0x1f, + (a->flags & ALT_FLAG_NOT) ? "!" : "", + a->cpuid >> 5, + a->cpuid & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen); @@ -340,6 +338,12 @@ next: } } +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) /* @@ -378,12 +382,6 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } -static inline bool is_jcc32(struct insn *insn) -{ - /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ - return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; -} - static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) { u8 op = insn->opcode.bytes[0]; @@ -1772,6 +1770,11 @@ void text_poke_sync(void) on_each_cpu(do_sync_core, NULL, 1); } +/* + * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of + * this thing. When len == 6 everything is prefixed with 0x0f and we map + * opcode to Jcc.d8, using len to distinguish. + */ struct text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; @@ -1893,6 +1896,10 @@ noinstr int poke_int3_handler(struct pt_regs *regs) int3_emulate_jmp(regs, (long)ip + tp->disp); break; + case 0x70 ... 0x7f: /* Jcc */ + int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + break; + default: BUG(); } @@ -1966,16 +1973,26 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; + u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; + u8 _new[POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = tp[i].text; int len = tp[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, text_poke_addr(&tp[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); + + if (len == 6) { + _new[0] = 0x0f; + memcpy(_new + 1, new, 5); + new = _new; + } + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, - (const char *)tp[i].text + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); + do_sync++; } @@ -2003,8 +2020,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, - tp[i].text, len); + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); } if (do_sync) { @@ -2021,10 +2037,15 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * replacing opcode. 
*/ for (do_sync = 0, i = 0; i < nr_entries; i++) { - if (tp[i].text[0] == INT3_INSN_OPCODE) + u8 byte = tp[i].text[0]; + + if (tp[i].len == 6) + byte = 0x0f; + + if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); + text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); do_sync++; } @@ -2042,9 +2063,11 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; - int ret, i; + int ret, i = 0; - memcpy((void *)tp->text, opcode, len); + if (len == 6) + i = 1; + memcpy((void *)tp->text, opcode+i, len-i); if (!emulate) emulate = opcode; @@ -2055,6 +2078,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, tp->len = len; tp->opcode = insn.opcode.bytes[0]; + if (is_jcc32(&insn)) { + /* + * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. + */ + tp->opcode = insn.opcode.bytes[1] - 0x10; + } + switch (tp->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: @@ -2071,7 +2101,6 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, BUG_ON(len != insn.length); } - switch (tp->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: @@ -2080,6 +2109,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: + case 0x70 ... 0x7f: /* Jcc */ tp->disp = insn.immediate.value; break; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a868b76cd3d4..1f83b052bb74 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2364,9 +2364,8 @@ static int mp_irqdomain_create(int ioapic) return -ENODEV; } - ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, - (void *)(long)ioapic); - + ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops, + (void *)(long)ioapic); if (!ip->irqdomain) { /* Release fw handle if it was allocated above */ if (!cfg->dev) @@ -2374,8 +2373,6 @@ static int mp_irqdomain_create(int ioapic) return -ENOMEM; } - ip->irqdomain->parent = parent; - if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) ioapic_dynirq_base = max(ioapic_dynirq_base, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f769d6d08b43..380753b14cab 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -956,7 +956,7 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); - if (cpu_has(c, X86_FEATURE_XMM2)) { + if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. 
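A note on the Jcc.d32 handling added to alternative.c above: the text_poke machinery keeps only POKE_MAX_OPCODE_SIZE bytes per location, so a 6-byte Jcc.d32 (0x0f 0x8<cc> rel32) is stored without its 0x0f prefix and its opcode is remapped onto the equivalent 1-byte Jcc.d8 (0x7<cc>), with len == 6 telling the poking code to re-emit the prefix. A minimal stand-alone sketch of that mapping (plain C; the helper names are illustrative, not kernel API):

#include <stdbool.h>
#include <stdint.h>

/* Jcc.d32 is the two-byte escape form: 0x0f 0x80..0x8f */
static bool is_jcc32(const uint8_t *opcode)
{
        return opcode[0] == 0x0f && (opcode[1] & 0xf0) == 0x80;
}

/* Map Jcc.d32 onto Jcc.d8: 0x8<cc> - 0x10 = 0x7<cc>, same condition code */
static uint8_t jcc32_to_jcc8(const uint8_t *opcode)
{
        return opcode[1] - 0x10;
}

/* Example: jne rel32 is 0x0f 0x85; jcc32_to_jcc8() yields 0x75, jne rel8. */

This is why poke_int3_handler() grows a "case 0x70 ... 0x7f" arm and text_poke_loc_init() computes insn.opcode.bytes[1] - 0x10.
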
@@ -1158,24 +1158,43 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) return false; } -void set_dr_addr_mask(unsigned long mask, int dr) +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask); + +static unsigned int amd_msr_dr_addr_masks[] = { + MSR_F16H_DR0_ADDR_MASK, + MSR_F16H_DR1_ADDR_MASK, + MSR_F16H_DR1_ADDR_MASK + 1, + MSR_F16H_DR1_ADDR_MASK + 2 +}; + +void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { - if (!boot_cpu_has(X86_FEATURE_BPEXT)) + int cpu = smp_processor_id(); + + if (!cpu_feature_enabled(X86_FEATURE_BPEXT)) return; - switch (dr) { - case 0: - wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0); - break; - case 1: - case 2: - case 3: - wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0); - break; - default: - break; - } + if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks))) + return; + + if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) + return; + + wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; +} + +unsigned long amd_get_dr_addr_mask(unsigned int dr) +{ + if (!cpu_feature_enabled(X86_FEATURE_BPEXT)) + return 0; + + if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks))) + return 0; + + return per_cpu(amd_dr_addr_mask[dr], smp_processor_id()); } +EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); u32 amd_get_highest_perf(void) { diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 1f60a2b27936..fdbb5f07448f 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -330,7 +330,16 @@ static void __init bp_init_freq_invariance(void) static void disable_freq_invariance_workfn(struct work_struct *work) { + int cpu; + static_branch_disable(&arch_scale_freq_key); + + /* + * Set arch_freq_scale to a default value on all cpus + * This negates the effect of scaling + */ + for_each_possible_cpu(cpu) + per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bca0bd8f4846..cf81848b72f4 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -33,6 +33,7 @@ #include <asm/e820/api.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> +#include <asm/cpu.h> #include "cpu.h" @@ -86,7 +87,7 @@ void update_spec_ctrl_cond(u64 val) wrmsrl(MSR_IA32_SPEC_CTRL, val); } -u64 spec_ctrl_current(void) +noinstr u64 spec_ctrl_current(void) { return this_cpu_read(x86_spec_ctrl_current); } @@ -144,9 +145,17 @@ void __init check_bugs(void) * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD * init code as it is not enumerated and depends on the family. */ - if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. 
+ */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -1229,9 +1238,9 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", - [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", - [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; @@ -1300,7 +1309,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -1486,8 +1495,12 @@ static void __init spectre_v2_select_mitigation(void) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); if (spectre_v2_in_ibrs_mode(mode)) { - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - update_spec_ctrl(x86_spec_ctrl_base); + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { + msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } } switch (mode) { @@ -1571,8 +1584,8 @@ static void __init spectre_v2_select_mitigation(void) /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS * and Enhanced IBRS protect firmware too, so enable IBRS around - * firmware calls only when IBRS / Enhanced IBRS aren't otherwise - * enabled. + * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't + * otherwise enabled. 
* * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index f4e5aa27eec6..4063e8991211 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -734,7 +734,7 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c) void init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; @@ -835,9 +835,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) case LVL_3: l3 += cache_table[k].size; break; - case LVL_TRACE: - trace += cache_table[k].size; - break; } break; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9cfca3d7d0e2..a394bbba7a4b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -567,7 +567,7 @@ static __init int setup_disable_pku(char *arg) return 1; } __setup("nopku", setup_disable_pku); -#endif /* CONFIG_X86_64 */ +#endif #ifdef CONFIG_X86_KERNEL_IBT @@ -1093,6 +1093,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000001f) c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + if (c->extended_cpuid_level >= 0x80000021) + c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021); + init_scattered_cpuid_features(c); init_speculation_control(c); @@ -1226,8 +1229,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* Zhaoxin Family 7 */ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), @@ -1256,6 +1259,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define MMIO_SBDS BIT(2) /* CPU is affected by RETbleed, speculating where you would not expect it */ #define RETBLEED BIT(3) +/* CPU is affected by SMT (cross-thread) return predictions */ +#define SMT_RSB BIT(4) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1287,8 +1292,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED), - VULNBL_HYGON(0x18, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), {} }; @@ -1338,8 +1343,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); - if (ia32_cap & ARCH_CAP_IBRS_ALL) + /* + * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature + * flag and protect from vendor-specific bugs via 
the whitelist. + */ + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { @@ -1401,10 +1414,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } - if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && - !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) - setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) + setup_force_cpu_bug(X86_BUG_SMT_RSB); if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1682,9 +1693,7 @@ void check_null_seg_clears_base(struct cpuinfo_x86 *c) if (!IS_ENABLED(CONFIG_X86_64)) return; - /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ - if (c->extended_cpuid_level >= 0x80000021 && - cpuid_eax(0x80000021) & BIT(6)) + if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE)) return; /* @@ -1953,13 +1962,13 @@ void __init identify_boot_cpu(void) if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) pr_info("CET detected: Indirect Branch Tracking enabled\n"); #ifdef CONFIG_X86_32 - sysenter_setup(); enable_sep_cpu(); #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); tsx_init(); + lkgs_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -2125,7 +2134,6 @@ static void wait_for_master_cpu(int cpu) #endif } -#ifdef CONFIG_X86_64 static inline void setup_getcpu(int cpu) { unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); @@ -2147,6 +2155,7 @@ static inline void setup_getcpu(int cpu) write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); } +#ifdef CONFIG_X86_64 static inline void ucode_cpu_init(int cpu) { if (cpu) @@ -2166,8 +2175,6 @@ static inline void tss_setup_ist(struct tss_struct *tss) #else /* CONFIG_X86_64 */ -static inline void setup_getcpu(int cpu) { } - static inline void ucode_cpu_init(int cpu) { show_ucode_info_early(); @@ -2297,30 +2304,45 @@ void cpu_init_secondary(void) #endif #ifdef CONFIG_MICROCODE_LATE_LOADING -/* +/** + * store_cpu_caps() - Store a snapshot of CPU capabilities + * @curr_info: Pointer where to store it + * + * Returns: None + */ +void store_cpu_caps(struct cpuinfo_x86 *curr_info) +{ + /* Reload CPUID max function as it might've changed. */ + curr_info->cpuid_level = cpuid_eax(0); + + /* Copy all capability leafs and pick up the synthetic ones. */ + memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability, + sizeof(curr_info->x86_capability)); + + /* Get the hardware CPUID leafs */ + get_cpu_cap(curr_info); +} + +/** + * microcode_check() - Check if any CPU capabilities changed after an update. + * @prev_info: CPU capabilities stored before an update. + * * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU * hotplug lock. + * + * Return: None */ -void microcode_check(void) +void microcode_check(struct cpuinfo_x86 *prev_info) { - struct cpuinfo_x86 info; + struct cpuinfo_x86 curr_info; perf_check_microcode(); - /* Reload CPUID max function as it might've changed. */ - info.cpuid_level = cpuid_eax(0); - - /* - * Copy all capability leafs to pick up the synthetic ones so that - * memcmp() below doesn't fail on that. 
The ones coming from CPUID will - * get overwritten in get_cpu_cap(). - */ - memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)); - - get_cpu_cap(&info); + store_cpu_caps(&curr_info); - if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability))) + if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability, + sizeof(prev_info->x86_capability))) return; pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 7c9b5893c30a..57a5349e6954 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -83,6 +83,4 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); -extern u64 x86_read_arch_cap_msr(void); - #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d95221117129..f6748c8bd647 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -68,6 +68,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL }, + { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 10fb5b5c9efa..23c5072fbbb7 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -306,6 +306,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) if ((low & BIT(5)) && !((high >> 5) & 0x3)) high |= BIT(5); + this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8)); + wrmsr(smca_config, low, high); } @@ -736,15 +738,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) if (m.status & MCI_STATUS_ADDRV) { m.addr = addr; - /* - * Extract [55:<lsb>] where lsb is the least significant - * *valid* bit of the address bits. - */ - if (mce_flags.smca) { - u8 lsb = (m.addr >> 56) & 0x3f; - - m.addr &= GENMASK_ULL(55, lsb); - } + smca_extract_err_addr(&m); } if (mce_flags.smca) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2c8ec5c71712..7832a69d170e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -67,13 +67,7 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); -struct mce_bank { - u64 ctl; /* subevents to enable */ - - __u64 init : 1, /* initialise bank? 
*/ - __reserved_1 : 63; -}; -static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); +DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); #define ATTR_LEN 16 /* One object for each MCE bank, shared by all CPUs */ @@ -579,7 +573,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, mce->severity != MCE_DEFERRED_SEVERITY) return NOTIFY_DONE; - pfn = mce->addr >> PAGE_SHIFT; + pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { set_mce_nospec(pfn); mce->kflags |= MCE_HANDLED_UC; @@ -633,15 +627,7 @@ static noinstr void mce_read_aux(struct mce *m, int i) m->addr <<= shift; } - /* - * Extract [55:<lsb>] where lsb is the least significant - * *valid* bit of the address bits. - */ - if (mce_flags.smca) { - u8 lsb = (m->addr >> 56) & 0x3f; - - m->addr &= GENMASK_ULL(55, lsb); - } + smca_extract_err_addr(m); } if (mce_flags.smca) { @@ -1308,6 +1294,7 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + unsigned long pfn; int ret; p->mce_count = 0; @@ -1316,9 +1303,10 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + ret = memory_failure(pfn, flags); if (!ret) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + set_mce_nospec(pfn); sync_core(); return; } @@ -1340,11 +1328,13 @@ static void kill_me_maybe(struct callback_head *cb) static void kill_me_never(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + unsigned long pfn; p->mce_count = 0; pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) - set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); } static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 100fbeebdc72..a05ac0716ecf 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -105,8 +105,7 @@ static ssize_t set_trigger(struct device *s, struct device_attribute *attr, { char *p; - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; + strscpy(mce_helper, buf, sizeof(mce_helper)); p = strchr(mce_helper, '\n'); if (p) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 7e03f5b7f6bd..91a415553c27 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -177,6 +177,24 @@ struct mce_vendor_flags { extern struct mce_vendor_flags mce_flags; +struct mce_bank { + /* subevents to enable */ + u64 ctl; + + /* initialise bank? */ + __u64 init : 1, + + /* + * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates + * the LSB field is found in MCA_STATUS and not in MCA_ADDR. 
+ */ + lsb_in_status : 1, + + __reserved_1 : 62; +}; + +DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); + enum mca_msr { MCA_CTL, MCA_STATUS, @@ -189,8 +207,34 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); + +/* + * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits + * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR. + */ +static __always_inline void smca_extract_err_addr(struct mce *m) +{ + u8 lsb; + + if (!mce_flags.smca) + return; + + if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) { + lsb = (m->status >> 24) & 0x3f; + + m->addr &= GENMASK_ULL(56, lsb); + + return; + } + + lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); +} + #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline void smca_extract_err_addr(struct mce *m) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 56471f750762..9eb457b10341 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -55,7 +55,9 @@ struct cont_desc { }; static u32 ucode_new_rev; -static u8 amd_ucode_patch[PATCH_MAX_SIZE]; + +/* One blob per node. */ +static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio @@ -330,8 +332,9 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true); if (ret < 0) { /* - * Patch verification failed, skip to the next - * container, if there's one: + * Patch verification failed, skip to the next container, if + * there is one. Before exit, check whether that container has + * found a patch already. If so, use it. */ goto out; } else if (ret > 0) { @@ -350,6 +353,7 @@ skip: size -= patch_size + SECTION_HDR_SIZE; } +out: /* * If we have found a patch (desc->mc), it means we're looking at the * container which has a patch for this CPU so return 0 to mean, @ucode @@ -364,7 +368,6 @@ skip: return 0; } -out: return orig_size - size; } @@ -414,8 +417,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. 
*/ -static bool -apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) +static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) { struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; @@ -428,7 +430,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - patch = &amd_ucode_patch; + patch = &amd_ucode_patch[0]; #endif desc.cpuid_1_eax = cpuid_1_eax; @@ -481,7 +483,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) return false; } -static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) +static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) { struct ucode_cpu_info *uci; struct cpio_data cp; @@ -511,11 +513,11 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { struct cpio_data cp = { }; - __load_ucode_amd(cpuid_1_eax, &cp); + find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true); } void load_ucode_amd_ap(unsigned int cpuid_1_eax) @@ -546,15 +548,14 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) } } - __load_ucode_amd(cpuid_1_eax, &cp); + find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false); } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { @@ -572,19 +573,19 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); + ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); if (ret > UCODE_UPDATED) return -EINVAL; return 0; } -void reload_ucode_amd(void) +void reload_ucode_amd(unsigned int cpu) { - struct microcode_amd *mc; u32 rev, dummy __always_unused; + struct microcode_amd *mc; - mc = (struct microcode_amd *)amd_ucode_patch; + mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); @@ -816,6 +817,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, return 0; } +/* Scan the blob in @data and add microcode patches to the cache. 
*/ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size) { @@ -850,9 +852,10 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) { + struct cpuinfo_x86 *c; + unsigned int nid, cpu; struct ucode_patch *p; enum ucode_state ret; @@ -865,22 +868,22 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; } - p = find_patch(0); - if (!p) { - return ret; - } else { - if (boot_cpu_data.microcode >= p->patch_id) - return ret; + for_each_node(nid) { + cpu = cpumask_first(cpumask_of_node(nid)); + c = &cpu_data(cpu); - ret = UCODE_NEW; - } + p = find_patch(cpu); + if (!p) + continue; - /* save BSP's matching patch for early load */ - if (!save) - return ret; + if (c->microcode >= p->patch_id) + continue; - memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); + ret = UCODE_NEW; + + memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); + memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); + } return ret; } @@ -905,14 +908,9 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); - bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; - /* reload ucode container only on the boot cpu */ - if (!bsp) - return UCODE_OK; - if (c->x86 >= 0x15) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); @@ -925,7 +923,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) if (!verify_container(fw->data, fw->size, false)) goto fw_release; - ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 712aafff96e0..7a329e561354 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -298,7 +298,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) #endif } -void reload_early_microcode(void) +void reload_early_microcode(unsigned int cpu) { int vendor, family; @@ -312,7 +312,7 @@ void reload_early_microcode(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - reload_ucode_amd(); + reload_ucode_amd(cpu); break; default: break; @@ -409,10 +409,10 @@ static int __reload_late(void *info) goto wait_for_siblings; if (err >= UCODE_NFOUND) { - if (err == UCODE_ERROR) + if (err == UCODE_ERROR) { pr_warn("Error reloading microcode on CPU %d\n", cpu); - - ret = -1; + ret = -1; + } } wait_for_siblings: @@ -438,6 +438,7 @@ wait_for_siblings: static int microcode_reload_late(void) { int old = boot_cpu_data.microcode, ret; + struct cpuinfo_x86 prev_info; pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); pr_err("You should switch to early loading, if possible.\n"); @@ -445,12 +446,21 @@ static int microcode_reload_late(void) atomic_set(&late_cpus_in, 0); atomic_set(&late_cpus_out, 0); - ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (ret == 0) - microcode_check(); + /* + * Take a snapshot before the microcode update in order to compare 
and + * check whether any bits changed after an update. + */ + store_cpu_caps(&prev_info); - pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n", - old, boot_cpu_data.microcode); + ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); + if (!ret) { + pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); + microcode_check(&prev_info); + } else { + pr_info("Reload failed, current microcode revision: 0x%x\n", + boot_cpu_data.microcode); + } return ret; } @@ -465,11 +475,8 @@ static ssize_t reload_store(struct device *dev, ssize_t ret = 0; ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val != 1) - return size; + if (ret || val != 1) + return -EINVAL; cpus_read_lock(); @@ -507,7 +514,7 @@ static ssize_t version_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } -static ssize_t pf_show(struct device *dev, +static ssize_t processor_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; @@ -515,8 +522,8 @@ static ssize_t pf_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static DEVICE_ATTR(version, 0444, version_show, NULL); -static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL); +static DEVICE_ATTR_RO(version); +static DEVICE_ATTR_RO(processor_flags); static struct attribute *mc_default_attrs[] = { &dev_attr_version.attr, @@ -557,7 +564,7 @@ void microcode_bsp_resume(void) if (uci->mc) microcode_ops->apply_microcode(cpu); else - reload_early_microcode(); + reload_early_microcode(cpu); } static struct syscore_ops mc_syscore_ops = { diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cac2bdb57f0b..467cf37ea90a 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -305,14 +305,11 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp) return false; } -/* - * Print ucode update info. - */ -static void -print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +static void print_ucode_info(int old_rev, int new_rev, unsigned int date) { - pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - uci->cpu_sig.rev, + pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n", + old_rev, + new_rev, date & 0xffff, date >> 24, (date >> 16) & 0xff); @@ -322,6 +319,7 @@ print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) static int delay_ucode_info; static int current_mc_date; +static int early_old_rev; /* * Print early updated ucode info after printk works. This is delayed info dump. @@ -332,7 +330,7 @@ void show_ucode_info_early(void) if (delay_ucode_info) { intel_cpu_collect_info(&uci); - print_ucode_info(&uci, current_mc_date); + print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date); delay_ucode_info = 0; } } @@ -341,40 +339,32 @@ void show_ucode_info_early(void) * At this point, we can not call printk() yet. Delay printing microcode info in * show_ucode_info_early() until printk() works. 
*/ -static void print_ucode(struct ucode_cpu_info *uci) +static void print_ucode(int old_rev, int new_rev, int date) { - struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - - mc = uci->mc; - if (!mc) - return; + int *early_old_rev_p; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + early_old_rev_p = (int *)__pa_nodebug(&early_old_rev); *delay_ucode_info_p = 1; - *current_mc_date_p = mc->hdr.date; + *current_mc_date_p = date; + *early_old_rev_p = old_rev; } #else -static inline void print_ucode(struct ucode_cpu_info *uci) +static inline void print_ucode(int old_rev, int new_rev, int date) { - struct microcode_intel *mc; - - mc = uci->mc; - if (!mc) - return; - - print_ucode_info(uci, mc->hdr.date); + print_ucode_info(old_rev, new_rev, date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc; - u32 rev; + u32 rev, old_rev; mc = uci->mc; if (!mc) @@ -391,6 +381,8 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return UCODE_OK; } + old_rev = rev; + /* * Writeback and invalidate caches before updating microcode to avoid * internal issues depending on what the microcode is updating. @@ -407,9 +399,9 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) uci->cpu_sig.rev = rev; if (early) - print_ucode(uci); + print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date); else - print_ucode_info(uci, mc->hdr.date); + print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date); return 0; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 46668e255421..f924a76c6923 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -37,9 +37,76 @@ /* Is Linux running as the root partition? 
*/ bool hv_root_partition; +/* Is Linux running on nested Microsoft Hypervisor */ +bool hv_nested; struct ms_hyperv_info ms_hyperv; #if IS_ENABLED(CONFIG_HYPERV) +static inline unsigned int hv_get_nested_reg(unsigned int reg) +{ + if (hv_is_sint_reg(reg)) + return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0; + + switch (reg) { + case HV_REGISTER_SIMP: + return HV_REGISTER_NESTED_SIMP; + case HV_REGISTER_SIEFP: + return HV_REGISTER_NESTED_SIEFP; + case HV_REGISTER_SVERSION: + return HV_REGISTER_NESTED_SVERSION; + case HV_REGISTER_SCONTROL: + return HV_REGISTER_NESTED_SCONTROL; + case HV_REGISTER_EOM: + return HV_REGISTER_NESTED_EOM; + default: + return reg; + } +} + +u64 hv_get_non_nested_register(unsigned int reg) +{ + u64 value; + + if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) + hv_ghcb_msr_read(reg, &value); + else + rdmsrl(reg, value); + return value; +} +EXPORT_SYMBOL_GPL(hv_get_non_nested_register); + +void hv_set_non_nested_register(unsigned int reg, u64 value) +{ + if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { + hv_ghcb_msr_write(reg, value); + + /* Write proxy bit via wrmsl instruction */ + if (hv_is_sint_reg(reg)) + wrmsrl(reg, value | 1 << 20); + } else { + wrmsrl(reg, value); + } +} +EXPORT_SYMBOL_GPL(hv_set_non_nested_register); + +u64 hv_get_register(unsigned int reg) +{ + if (hv_nested) + reg = hv_get_nested_reg(reg); + + return hv_get_non_nested_register(reg); +} +EXPORT_SYMBOL_GPL(hv_get_register); + +void hv_set_register(unsigned int reg, u64 value) +{ + if (hv_nested) + reg = hv_get_nested_reg(reg); + + hv_set_non_nested_register(reg, value); +} +EXPORT_SYMBOL_GPL(hv_set_register); + static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); @@ -301,6 +368,11 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: running as root partition\n"); } + if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { + hv_nested = true; + pr_info("Hyper-V: running on a nested hypervisor\n"); + } + /* * Extract host information. */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index c98e52ff5f20..030d3b409768 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -100,6 +100,18 @@ struct rdt_hw_resource rdt_resources_all[] = { .fflags = RFTYPE_RES_MB, }, }, + [RDT_RESOURCE_SMBA] = + { + .r_resctrl = { + .rid = RDT_RESOURCE_SMBA, + .name = "SMBA", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_SMBA), + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, + }, }; /* @@ -150,6 +162,13 @@ bool is_mba_sc(struct rdt_resource *r) if (!r) return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; + /* + * The software controller support is only applicable to MBA resource. + * Make sure to check for resource type. + */ + if (r->rid != RDT_RESOURCE_MBA) + return false; + return r->membw.mba_sc; } @@ -213,9 +232,15 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, subleaf; + + /* + * Query CPUID_Fn80000020_EDX_x01 for MBA and + * CPUID_Fn80000020_EDX_x02 for SMBA + */ + subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 
2 : 1; - cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full); + cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->default_ctrl = MAX_MBA_BW_AMD; @@ -647,6 +672,8 @@ enum { RDT_FLAG_L2_CAT, RDT_FLAG_L2_CDP, RDT_FLAG_MBA, + RDT_FLAG_SMBA, + RDT_FLAG_BMEC, }; #define RDT_OPT(idx, n, f) \ @@ -670,6 +697,8 @@ static struct rdt_options rdt_options[] __initdata = { RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2), RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), + RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), + RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -699,7 +728,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -static bool __init rdt_cpu_has(int flag) +bool __init rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -734,6 +763,19 @@ static __init bool get_mem_config(void) return false; } +static __init bool get_slow_mem_config(void) +{ + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA]; + + if (!rdt_cpu_has(X86_FEATURE_SMBA)) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return __rdt_get_mem_config_amd(&hw_res->r_resctrl); + + return false; +} + static __init bool get_rdt_alloc_resources(void) { struct rdt_resource *r; @@ -764,6 +806,9 @@ static __init bool get_rdt_alloc_resources(void) if (get_mem_config()) ret = true; + if (get_slow_mem_config()) + ret = true; + return ret; } @@ -853,6 +898,9 @@ static __init void rdt_init_res_defs_amd(void) } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_BW_BASE; hw_res->msr_update = mba_wrmsr_amd; + } else if (r->rid == RDT_RESOURCE_SMBA) { + hw_res->msr_base = MSR_IA32_SMBA_BW_BASE; + hw_res->msr_update = mba_wrmsr_amd; } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1df0e3262bca..eb07d4435391 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -209,7 +209,7 @@ static int parse_line(char *line, struct resctrl_schema *s, unsigned long dom_id; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - r->rid == RDT_RESOURCE_MBA) { + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); return -EINVAL; } @@ -310,7 +310,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; - int cpu; u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) @@ -341,13 +340,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (cpumask_empty(cpu_mask)) goto done; - cpu = get_cpu(); - /* Update resource control msr on this CPU if it's in cpu_mask. */ - if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_ctrl_update(&msr_param); - /* Update resource control msr on other CPUs. */ - smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); - put_cpu(); + + /* Update resource control msr on all the CPUs. 
*/ + on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); done: free_cpumask_var(cpu_mask); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5ebd28e6aa0c..8edecc5763d8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -30,6 +30,29 @@ */ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) +/* Reads to Local DRAM Memory */ +#define READS_TO_LOCAL_MEM BIT(0) + +/* Reads to Remote DRAM Memory */ +#define READS_TO_REMOTE_MEM BIT(1) + +/* Non-Temporal Writes to Local Memory */ +#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) + +/* Non-Temporal Writes to Remote Memory */ +#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) + +/* Reads to Local Memory the system identifies as "Slow Memory" */ +#define READS_TO_LOCAL_S_MEM BIT(4) + +/* Reads to Remote Memory the system identifies as "Slow Memory" */ +#define READS_TO_REMOTE_S_MEM BIT(5) + +/* Dirty Victims to All Types of Memory */ +#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) + +/* Max event bits supported */ +#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) struct rdt_fs_context { struct kernfs_fs_context kfc; @@ -52,11 +75,13 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * struct mon_evt - Entry in the event list of a resource * @evtid: event id * @name: name of the event + * @configurable: true if the event is configurable * @list: entry in &rdt_resource->evt_list */ struct mon_evt { enum resctrl_event_id evtid; char *name; + bool configurable; struct list_head list; }; @@ -409,6 +434,7 @@ enum resctrl_res_level { RDT_RESOURCE_L3, RDT_RESOURCE_L2, RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, /* Must be the last */ RDT_NUM_RESOURCES, @@ -511,6 +537,7 @@ void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, @@ -527,5 +554,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); +void __init mbm_config_rftype_init(const char *config); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 77538abeb72a..7fe51488e136 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -204,6 +204,23 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, } } +/* + * Assumes that hardware counters are also reset and thus that there is + * no need to record initial non-zero counts. 
+ */ +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + + if (is_mbm_total_enabled()) + memset(hw_dom->arch_mbm_total, 0, + sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); + + if (is_mbm_local_enabled()) + memset(hw_dom->arch_mbm_local, 0, + sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); +} + static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { u64 shift = 64 - width, chunks; @@ -763,7 +780,7 @@ static void l3_mon_evt_init(struct rdt_resource *r) list_add_tail(&mbm_local_event.list, &r->evt_list); } -int rdt_get_mon_l3_config(struct rdt_resource *r) +int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -800,6 +817,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) if (ret) return ret; + if (rdt_cpu_has(X86_FEATURE_BMEC)) { + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + mbm_total_event.configurable = true; + mbm_config_rftype_init("mbm_total_bytes_config"); + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + mbm_local_event.configurable = true; + mbm_config_rftype_init("mbm_local_bytes_config"); + } + } + l3_mon_evt_init(r); r->mon_capable = true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5993da21d822..e2c1599d1b37 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -325,12 +325,7 @@ static void update_cpu_closid_rmid(void *info) static void update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { - int cpu = get_cpu(); - - if (cpumask_test_cpu(cpu, cpu_mask)) - update_cpu_closid_rmid(r); - smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); - put_cpu(); + on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1); } static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, @@ -1003,8 +998,11 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = of->kn->parent->priv; struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) + list_for_each_entry(mevt, &r->evt_list, list) { seq_printf(seq, "%s\n", mevt->name); + if (mevt->configurable) + seq_printf(seq, "%s_config\n", mevt->name); + } return 0; } @@ -1215,7 +1213,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA) + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) continue; has_cache = true; list_for_each_entry(d, &r->domains, list) { @@ -1404,7 +1402,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ctrl = resctrl_arch_get_config(r, d, closid, type); - if (r->rid == RDT_RESOURCE_MBA) + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) size = ctrl; else size = rdtgroup_cbm_to_size(r, d, ctrl); @@ -1421,6 +1420,248 @@ out: return ret; } +struct mon_config_info { + u32 evtid; + u32 mon_config; +}; + +#define INVALID_CONFIG_INDEX UINT_MAX + +/** + * mon_event_config_index_get - get the hardware index for the + * configurable event + * @evtid: event id. 
+ * + * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID + * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID + * INVALID_CONFIG_INDEX for invalid evtid + */ +static inline unsigned int mon_event_config_index_get(u32 evtid) +{ + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return 0; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return 1; + default: + /* Should never reach here */ + return INVALID_CONFIG_INDEX; + } +} + +static void mon_event_config_read(void *info) +{ + struct mon_config_info *mon_info = info; + unsigned int index; + u64 msrval; + + index = mon_event_config_index_get(mon_info->evtid); + if (index == INVALID_CONFIG_INDEX) { + pr_warn_once("Invalid event id %d\n", mon_info->evtid); + return; + } + rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + + /* Report only the valid event configuration bits */ + mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; +} + +static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info) +{ + smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1); +} + +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) +{ + struct mon_config_info mon_info = {0}; + struct rdt_domain *dom; + bool sep = false; + + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + + memset(&mon_info, 0, sizeof(struct mon_config_info)); + mon_info.evtid = evtid; + mondata_config_read(dom, &mon_info); + + seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config); + sep = true; + } + seq_puts(s, "\n"); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int mbm_total_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); + + return 0; +} + +static int mbm_local_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); + + return 0; +} + +static void mon_event_config_write(void *info) +{ + struct mon_config_info *mon_info = info; + unsigned int index; + + index = mon_event_config_index_get(mon_info->evtid); + if (index == INVALID_CONFIG_INDEX) { + pr_warn_once("Invalid event id %d\n", mon_info->evtid); + return; + } + wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); +} + +static int mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) +{ + struct mon_config_info mon_info = {0}; + int ret = 0; + + /* mon_config cannot be more than the supported set of events */ + if (val > MAX_EVT_CONFIG_BITS) { + rdt_last_cmd_puts("Invalid event configuration\n"); + return -EINVAL; + } + + /* + * Read the current config value first. If both are the same then + * no need to write it again. + */ + mon_info.evtid = evtid; + mondata_config_read(d, &mon_info); + if (mon_info.mon_config == val) + goto out; + + mon_info.mon_config = val; + + /* + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE + * are scoped at the domain level. Writing any of these MSRs + * on one CPU is observed by all the CPUs in the domain. + */ + smp_call_function_any(&d->cpu_mask, mon_event_config_write, + &mon_info, 1); + + /* + * When an Event Configuration is changed, the bandwidth counters + * for all RMIDs and Events will be cleared by the hardware. 
+	 * The hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62)
+	 * for every RMID on the next read to any event.
+	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+	 * cleared while it is tracked by the hardware. Clear the
+	 * mbm_local and mbm_total counts for all the RMIDs.
+	 */
+	resctrl_arch_reset_rmid_all(r, d);
+
+out:
+	return ret;
+}
+
+static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
+{
+	char *dom_str = NULL, *id_str;
+	unsigned long dom_id, val;
+	struct rdt_domain *d;
+	int ret = 0;
+
+next:
+	if (!tok || tok[0] == '\0')
+		return 0;
+
+	/* Start processing the strings for each domain */
+	dom_str = strim(strsep(&tok, ";"));
+	id_str = strsep(&dom_str, "=");
+
+	if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
+		return -EINVAL;
+	}
+
+	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
+		rdt_last_cmd_puts("Non-numeric event configuration value\n");
+		return -EINVAL;
+	}
+
+	list_for_each_entry(d, &r->domains, list) {
+		if (d->id == dom_id) {
+			ret = mbm_config_write_domain(r, d, evtid, val);
+			if (ret)
+				return -EINVAL;
+			goto next;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
+					    char *buf, size_t nbytes,
+					    loff_t off)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+	int ret;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	rdt_last_cmd_clear();
+
+	buf[nbytes - 1] = '\0';
+
+	ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	return ret ?: nbytes;
+}
+
+static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
+					    char *buf, size_t nbytes,
+					    loff_t off)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+	int ret;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	rdt_last_cmd_clear();
+
+	buf[nbytes - 1] = '\0';
+
+	ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	return ret ?: nbytes;
+}
+
 /* rdtgroup information files for one cache resource. */
 static struct rftype res_common_files[] = {
 	{
@@ -1520,6 +1761,20 @@ static struct rftype res_common_files[] = {
 		.fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
 	},
 	{
+		.name = "mbm_total_bytes_config",
+		.mode = 0644,
+		.kf_ops = &rdtgroup_kf_single_ops,
+		.seq_show = mbm_total_bytes_config_show,
+		.write = mbm_total_bytes_config_write,
+	},
+	{
+		.name = "mbm_local_bytes_config",
+		.mode = 0644,
+		.kf_ops = &rdtgroup_kf_single_ops,
+		.seq_show = mbm_local_bytes_config_show,
+		.write = mbm_local_bytes_config_write,
+	},
+	{
 		.name = "cpus",
 		.mode = 0644,
 		.kf_ops = &rdtgroup_kf_single_ops,
@@ -1625,6 +1880,15 @@ void __init thread_throttle_mode_init(void)
 	rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
 }
 
+void __init mbm_config_rftype_init(const char *config)
+{
+	struct rftype *rft;
+
+	rft = rdtgroup_get_rftype_by_name(config);
+	if (rft)
+		rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE;
+}
+
 /**
  * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
  * @r: The resource group with which the file is associated.
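
The mon_config_write() parser above takes one "domain_id=value" pair per L3 domain, semicolon-separated, with the domain id in decimal and the value in hex, and both write handlers insist on a trailing newline; mbm_config_show() prints the same "id=0x%02x" format back, joined by ';'. A minimal userspace sketch of that round trip (a hedged illustration: it assumes resctrl is mounted at /sys/fs/resctrl, that the CPU reports BMEC so these files exist, and 0x33 is merely an example bit mask):

#include <stdio.h>

int main(void)
{
	/* Path assumes the conventional /sys/fs/resctrl mount point. */
	const char *path = "/sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config";
	char cur[256];
	FILE *f;

	/* Read the per-domain settings, e.g. "0=0x7f;1=0x7f". */
	f = fopen(path, "r");
	if (!f || !fgets(cur, sizeof(cur), f)) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("current: %s", cur);

	/* Reconfigure domain 0 only; the kernel requires the trailing newline. */
	f = fopen(path, "w");
	if (!f || fputs("0=0x33\n", f) == EOF || fclose(f) == EOF) {
		perror(path);
		return 1;
	}
	return 0;
}

Writing only "0=0x33" leaves the other domains untouched, which is why mon_config_write() loops over semicolon-separated tokens rather than demanding a value for every domain.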
@@ -1866,13 +2130,9 @@ static int set_cache_qos_cfg(int level, bool enable) /* Pick one CPU from each domain instance to update MSR */ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } - cpu = get_cpu(); - /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ - if (cpumask_test_cpu(cpu, cpu_mask)) - update(&enable); - /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, update, &enable, 1); - put_cpu(); + + /* Update QOS_CFG MSR on all the CPUs in cpu_mask */ + on_each_cpu_mask(cpu_mask, update, &enable, 1); free_cpumask_var(cpu_mask); @@ -2349,7 +2609,7 @@ static int reset_all_ctrls(struct rdt_resource *r) struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; - int i, cpu; + int i; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -2370,13 +2630,9 @@ static int reset_all_ctrls(struct rdt_resource *r) for (i = 0; i < hw_res->num_closid; i++) hw_dom->ctrl_val[i] = r->default_ctrl; } - cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ - if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_ctrl_update(&msr_param); - /* Update CBM on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); - put_cpu(); + + /* Update CBM on all the CPUs in cpu_mask */ + on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); free_cpumask_var(cpu_mask); @@ -2855,7 +3111,8 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA) { + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) { rdtgroup_init_mba(r, rdtgrp->closid); if (is_mba_sc(r)) continue; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index f53944fb8f7f..0dad49a09b7a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -45,6 +45,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 8009c8346d8f..b31ee4f1657a 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -11,6 +11,7 @@ #include <linux/cpufeature.h> #include <asm/cmdline.h> +#include <asm/cpu.h> #include "cpu.h" diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 02039ec3597d..11f83d07925e 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char *arg) } early_param("no-steal-acc", parse_no_stealacc); -static unsigned long long notrace vmware_sched_clock(void) +static noinstr u64 vmware_sched_clock(void) { unsigned long long ns; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9dac24680ff8..fb8cf953380d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -53,7 +53,7 @@ * * Once the E820 map has been converted to the standard Linux memory layout * information its role stops - modifying it has no effect and does not get - * re-propagated. So itsmain role is a temporary bootstrap storage of firmware + * re-propagated. 
So its main role is a temporary bootstrap storage of firmware * specific memory layout data during early bootup. */ static struct e820_table e820_table_init __initdata; @@ -395,7 +395,7 @@ int __init e820__update_table(struct e820_table *table) /* Continue building up new map based on this information: */ if (current_type != last_type || e820_nomerge(current_type)) { - if (last_type != 0) { + if (last_type) { new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; /* Move forward only if the new size was non-zero: */ if (new_entries[new_nr_entries].size != 0) @@ -403,7 +403,7 @@ int __init e820__update_table(struct e820_table *table) if (++new_nr_entries >= max_nr_entries) break; } - if (current_type != 0) { + if (current_type) { new_entries[new_nr_entries].addr = change_point[chg_idx]->addr; new_entries[new_nr_entries].type = current_type; last_addr = change_point[chg_idx]->addr; diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index 958accf2ccf0..9fcfa5c4dad7 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void) struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); - if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9baa89a8877d..caf33486dc5e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) this_cpu_write(in_kernel_fpu, true); - if (!(current->flags & PF_KTHREAD) && + if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(¤t->thread.fpu); @@ -853,12 +853,12 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ -void fpu_idle_fpregs(void) +noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); - fpregs_deactivate(¤t->thread.fpu); + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 71f336425e58..c8eb1ac5125a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1091,6 +1091,8 @@ int __init hpet_enable(void) if (!hpet_counting()) goto out_nohpet; + if (tsc_clocksource_watchdog_disabled()) + clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY; clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index bbb0f737aab1..b01644c949b2 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -127,7 +127,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) set_debugreg(*dr7, 7); if (info->mask) - set_dr_addr_mask(info->mask, i); + amd_set_dr_addr_mask(info->mask, i); return 0; } @@ -166,7 +166,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) set_debugreg(dr7, 7); if (info->mask) - set_dr_addr_mask(0, i); + amd_set_dr_addr_mask(0, i); /* * Ensure the write to cpu_dr7 is after we've set the DR7 register. 
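
The scattered.c hunk above routes two new AMD bandwidth features through CPUID leaf 0x80000020: SMBA in EBX bit 2 and BMEC in EBX bit 3. A rough userspace illustration of the same enumeration, using the compiler-provided <cpuid.h> helper (the kernel itself picks these bits up through the cpuid_bits[] table, not like this):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid_count() returns 0 when the leaf is out of range. */
	if (!__get_cpuid_count(0x80000020, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x80000020 not supported");
		return 0;
	}
	printf("SMBA (EBX bit 2): %s\n", (ebx & (1u << 2)) ? "yes" : "no");
	printf("BMEC (EBX bit 3): %s\n", (ebx & (1u << 3)) ? "yes" : "no");
	return 0;
}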
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 3aa5304200c5..4d8aff05a509 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq) disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); + irq_set_status_flags(irq, IRQ_LEVEL); enable_irq(irq); lapic_assign_legacy_vector(irq, true); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index beb1bada1b0a..c683666876f1 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -65,8 +65,10 @@ void __init init_ISA_irqs(void) legacy_pic->init(0); - for (i = 0; i < nr_legacy_irqs(); i++) + for (i = 0; i < nr_legacy_irqs(); i++) { irq_set_chip_and_handler(i, chip, handle_level_irq); + irq_set_status_flags(i, IRQ_LEVEL); + } } void __init init_IRQ(void) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b36f3c367cb2..f7f6042eb7e6 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -464,50 +464,26 @@ static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_emulate_call); -static nokprobe_inline -void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond) +static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs) { unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; - if (cond) - ip += p->ainsn.rel32; + ip += p->ainsn.rel32; int3_emulate_jmp(regs, ip); } - -static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs) -{ - __kprobe_emulate_jmp(p, regs, true); -} NOKPROBE_SYMBOL(kprobe_emulate_jmp); -static const unsigned long jcc_mask[6] = { - [0] = X86_EFLAGS_OF, - [1] = X86_EFLAGS_CF, - [2] = X86_EFLAGS_ZF, - [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, - [4] = X86_EFLAGS_SF, - [5] = X86_EFLAGS_PF, -}; - static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) { - bool invert = p->ainsn.jcc.type & 1; - bool match; + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; - if (p->ainsn.jcc.type < 0xc) { - match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1]; - } else { - match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ - ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); - if (p->ainsn.jcc.type >= 0xe) - match = match || (regs->flags & X86_EFLAGS_ZF); - } - __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert)); + int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32); } NOKPROBE_SYMBOL(kprobe_emulate_jcc); static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs) { + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; bool match; if (p->ainsn.loop.type != 3) { /* LOOP* */ @@ -535,7 +511,9 @@ static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs) else if (p->ainsn.loop.type == 1) /* LOOPE */ match = match && (regs->flags & X86_EFLAGS_ZF); - __kprobe_emulate_jmp(p, regs, match); + if (match) + ip += p->ainsn.rel32; + int3_emulate_jmp(regs, ip); } NOKPROBE_SYMBOL(kprobe_emulate_loop); @@ -625,7 +603,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn) /* 1 byte conditional jump */ p->ainsn.emulate_op = kprobe_emulate_jcc; p->ainsn.jcc.type = opcode & 0xf; - p->ainsn.rel32 = *(char *)insn->immediate.bytes; + p->ainsn.rel32 = insn->immediate.value; break; case 0x0f: opcode = insn->opcode.bytes[1]; @@ -659,17 +637,19 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn) * is determined by the 
MOD/RM byte. */ opcode = insn->modrm.bytes[0]; - if ((opcode & 0x30) == 0x10) { - if ((opcode & 0x8) == 0x8) - return -EOPNOTSUPP; /* far call */ - /* call absolute, indirect */ + switch (X86_MODRM_REG(opcode)) { + case 0b010: /* FF /2, call near, absolute indirect */ p->ainsn.emulate_op = kprobe_emulate_call_indirect; - } else if ((opcode & 0x30) == 0x20) { - if ((opcode & 0x8) == 0x8) - return -EOPNOTSUPP; /* far jmp */ - /* jmp near absolute indirect */ + break; + case 0b100: /* FF /4, jmp near, absolute indirect */ p->ainsn.emulate_op = kprobe_emulate_jmp_indirect; - } else + break; + case 0b011: /* FF /3, call far, absolute indirect */ + case 0b101: /* FF /5, jmp far, absolute indirect */ + return -EOPNOTSUPP; + } + + if (!p->ainsn.emulate_op) break; if (insn->addr_bytes != sizeof(unsigned long)) @@ -990,20 +970,6 @@ int kprobe_int3_handler(struct pt_regs *regs) kprobe_post_process(p, regs, kcb); return 1; } - } - - if (*addr != INT3_INSN_OPCODE) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->ip = (unsigned long)addr; - return 1; } /* else: not a kprobe fault; let the kernel handle it */ return 0; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 16333ba1904b..0f35d44c56fe 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -71,12 +71,12 @@ static int kvm_set_wallclock(const struct timespec64 *now) return -ENODEV; } -static u64 kvm_clock_read(void) +static noinstr u64 kvm_clock_read(void) { u64 ret; preempt_disable_notrace(); - ret = pvclock_clocksource_read(this_cpu_pvti()); + ret = pvclock_clocksource_read_nowd(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -86,7 +86,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } -static u64 kvm_sched_clock_read(void) +static noinstr u64 kvm_sched_clock_read(void) { return kvm_clock_read() - kvm_sched_clock_offset; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 327757afb027..42e182868873 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -32,6 +32,7 @@ #include <asm/special_insns.h> #include <asm/tlb.h> #include <asm/io_bitmap.h> +#include <asm/gsseg.h> /* * nop stub, which must not clobber anything *including the stack* to @@ -216,6 +217,11 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val) native_set_debugreg(regno, val); } +noinstr void pv_native_wbinvd(void) +{ + native_wbinvd(); +} + static noinstr void pv_native_irq_enable(void) { native_irq_enable(); @@ -225,6 +231,11 @@ static noinstr void pv_native_irq_disable(void) { native_irq_disable(); } + +static noinstr void pv_native_safe_halt(void) +{ + native_safe_halt(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -256,7 +267,7 @@ struct paravirt_patch_template pv_ops = { .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, - .cpu.wbinvd = native_wbinvd, + .cpu.wbinvd = pv_native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, @@ -290,7 +301,7 @@ struct paravirt_patch_template pv_ops = { .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .irq.irq_disable = 
__PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), - .irq.safe_halt = native_safe_halt, + .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 40d156a31676..b650cde3f64d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -24,6 +24,7 @@ #include <linux/cpuidle.h> #include <linux/acpi.h> #include <linux/elf-randomize.h> +#include <linux/static_call.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <asm/cpu.h> @@ -694,7 +695,24 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -static void (*x86_idle)(void); +/* + * We use this if we don't have any better idle routine.. + */ +void __cpuidle default_idle(void) +{ + raw_safe_halt(); + raw_local_irq_disable(); +} +#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) +EXPORT_SYMBOL(default_idle); +#endif + +DEFINE_STATIC_CALL_NULL(x86_idle, default_idle); + +static bool x86_idle_set(void) +{ + return !!static_call_query(x86_idle); +} #ifndef CONFIG_SMP static inline void play_dead(void) @@ -717,28 +735,18 @@ void arch_cpu_idle_dead(void) /* * Called from the generic idle code. */ -void arch_cpu_idle(void) -{ - x86_idle(); -} - -/* - * We use this if we don't have any better idle routine.. - */ -void __cpuidle default_idle(void) +void __cpuidle arch_cpu_idle(void) { - raw_safe_halt(); + static_call(x86_idle)(); } -#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) -EXPORT_SYMBOL(default_idle); -#endif +EXPORT_SYMBOL_GPL(arch_cpu_idle); #ifdef CONFIG_XEN bool xen_set_default_idle(void) { - bool ret = !!x86_idle; + bool ret = x86_idle_set(); - x86_idle = default_idle; + static_call_update(x86_idle, default_idle); return ret; } @@ -800,13 +808,7 @@ static void amd_e400_idle(void) default_idle(); - /* - * The switch back from broadcast mode needs to be called with - * interrupts disabled. 
-	 */
-	raw_local_irq_disable();
 	tick_broadcast_exit();
-	raw_local_irq_enable();
 }
 
 /*
@@ -864,12 +866,10 @@ static __cpuidle void mwait_idle(void)
 		}
 
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		if (!need_resched())
+		if (!need_resched()) {
 			__sti_mwait(0, 0);
-		else
-			raw_local_irq_enable();
-	} else {
-		raw_local_irq_enable();
+			raw_local_irq_disable();
+		}
 	}
 	__current_clr_polling();
 }
@@ -880,20 +880,20 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
 	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
 		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
 #endif
-	if (x86_idle || boot_option_idle_override == IDLE_POLL)
+	if (x86_idle_set() || boot_option_idle_override == IDLE_POLL)
 		return;
 
 	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
 		pr_info("using AMD E400 aware idle routine\n");
-		x86_idle = amd_e400_idle;
+		static_call_update(x86_idle, amd_e400_idle);
 	} else if (prefer_mwait_c1_over_halt(c)) {
 		pr_info("using mwait in idle threads\n");
-		x86_idle = mwait_idle;
+		static_call_update(x86_idle, mwait_idle);
 	} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
 		pr_info("using TDX aware idle routine\n");
-		x86_idle = tdx_safe_halt;
+		static_call_update(x86_idle, tdx_safe_halt);
 	} else
-		x86_idle = default_idle;
+		static_call_update(x86_idle, default_idle);
 }
 
 void amd_e400_c1e_apic_setup(void)
@@ -946,7 +946,7 @@ static int __init idle_setup(char *str)
 		 * To continue to load the CPU idle driver, don't touch
 		 * the boot_option_idle_override.
 		 */
-		x86_idle = default_idle;
+		static_call_update(x86_idle, default_idle);
 		boot_option_idle_override = IDLE_HALT;
 	} else if (!strcmp(str, "nomwait")) {
 		/*
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index eda37df016f0..56acf53a782a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -64,7 +64,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
 	return flags & valid_flags;
 }
 
-u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+static __always_inline
+u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
 {
 	unsigned version;
 	u64 ret;
@@ -77,7 +78,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		flags = src->flags;
 	} while (pvclock_read_retry(src, version));
 
-	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+	if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
 		src->flags &= ~PVCLOCK_GUEST_STOPPED;
 		pvclock_touch_watchdogs();
 	}
@@ -100,16 +101,25 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 	 * updating at the same time, and one of them could be slightly behind,
 	 * making the assumption that last_value always goes forward fail to hold.
*/ - last = atomic64_read(&last_value); + last = arch_atomic64_read(&last_value); do { - if (ret < last) + if (ret <= last) return last; - last = atomic64_cmpxchg(&last_value, last, ret); - } while (unlikely(last != ret)); + } while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret)); return ret; } +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, true); +} + +noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, false); +} + void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, struct pvclock_vcpu_time_info *vcpu_time, struct timespec64 *ts) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 349046434513..1309b9b05338 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -138,15 +138,12 @@ static __init int add_rtc_cmos(void) static const char * const ids[] __initconst = { "PNP0b00", "PNP0b01", "PNP0b02", }; struct pnp_dev *dev; - struct pnp_id *id; int i; pnp_for_each_dev(dev) { - for (id = dev->id; id; id = id->next) { - for (i = 0; i < ARRAY_SIZE(ids); i++) { - if (compare_pnp_id(id, ids[i]) != 0) - return 0; - } + for (i = 0; i < ARRAY_SIZE(ids); i++) { + if (compare_pnp_id(dev->id, ids[i]) != 0) + return 0; } } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88188549647c..16babff771bd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,11 +114,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* CPU data as detected by the assembly code in head_32.S */ struct cpuinfo_x86 new_cpu_data; - -/* Common CPU data for all CPUs */ -struct cpuinfo_x86 boot_cpu_data __read_mostly; -EXPORT_SYMBOL(boot_cpu_data); - unsigned int def_to_bigsmp; struct apm_info apm_info; @@ -132,11 +127,10 @@ EXPORT_SYMBOL(ist_info); struct ist_info ist_info; #endif -#else -struct cpuinfo_x86 boot_cpu_data __read_mostly; -EXPORT_SYMBOL(boot_cpu_data); #endif +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) __visible unsigned long mmu_cr4_features __ro_after_init; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 1504eb8d25aa..004cb30b7419 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -360,7 +360,7 @@ static bool strict_sigaltstack_size __ro_after_init = false; static int __init strict_sas_size(char *arg) { - return kstrtobool(arg, &strict_sigaltstack_size); + return kstrtobool(arg, &strict_sigaltstack_size) == 0; } __setup("strict_sas_size", strict_sas_size); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 2553136cf39b..9027fc088f97 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -31,6 +31,7 @@ #include <asm/sigframe.h> #include <asm/sighandling.h> #include <asm/smap.h> +#include <asm/gsseg.h> #ifdef CONFIG_IA32_EMULATION #include <asm/ia32_unistd.h> @@ -54,12 +55,14 @@ static inline void reload_segments(struct sigcontext_32 *sc) } #define sigset32_t compat_sigset_t +#define siginfo32_t compat_siginfo_t #define restore_altstack32 compat_restore_altstack #define unsafe_save_altstack32 unsafe_compat_save_altstack #else #define sigset32_t sigset_t +#define siginfo32_t siginfo_t #define __NR_ia32_sigreturn __NR_sigreturn #define __NR_ia32_rt_sigreturn __NR_rt_sigreturn #define restore_altstack32 restore_altstack @@ -377,3 +380,128 @@ Efault: user_access_end(); return -EFAULT; } + +/* + * The 
siginfo_t structure and handling code is very easy
+ * to break in several ways. It must always be updated when new
+ * updates are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+
+/*
+* If adding a new si_code, there is probably new data in
+* the siginfo. Make sure folks bumping the si_code
+* limits also have to look at this code. Make sure any
+* new fields are handled in copy_siginfo_to_user32()!
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 9);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo32_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo32_t) == 4);
+
+/*
+* The offsets of all the (unioned) si_fields are fixed
+* in the ABI, of course. Make sure none of them ever
+* move and are always at the beginning:
+*/
+static_assert(offsetof(siginfo32_t, _sifields) == 3 * sizeof(int));
+
+static_assert(offsetof(siginfo32_t, si_signo) == 0);
+static_assert(offsetof(siginfo32_t, si_errno) == 4);
+static_assert(offsetof(siginfo32_t, si_code) == 8);
+
+/*
+* Ensure that the size of each si_field never changes.
+* If it does, it is a sign that the
+* copy_siginfo_to_user32() code below needs to be updated
+* along with the size in the CHECK_SI_SIZE().
+*
+* We repeat this check for both the generic and compat
+* siginfos.
+*
+* Note: it is OK for these to grow as long as the whole
+* structure stays within the padding size (checked
+* above).
+*/ + +#define CHECK_SI_OFFSET(name) \ + static_assert(offsetof(siginfo32_t, _sifields) == \ + offsetof(siginfo32_t, _sifields.name)) + +#define CHECK_SI_SIZE(name, size) \ + static_assert(sizeof_field(siginfo32_t, _sifields.name) == size) + +CHECK_SI_OFFSET(_kill); +CHECK_SI_SIZE (_kill, 2*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_pid) == 0xC); +static_assert(offsetof(siginfo32_t, si_uid) == 0x10); + +CHECK_SI_OFFSET(_timer); +#ifdef CONFIG_COMPAT +/* compat_siginfo_t doesn't have si_sys_private */ +CHECK_SI_SIZE (_timer, 3*sizeof(int)); +#else +CHECK_SI_SIZE (_timer, 4*sizeof(int)); +#endif +static_assert(offsetof(siginfo32_t, si_tid) == 0x0C); +static_assert(offsetof(siginfo32_t, si_overrun) == 0x10); +static_assert(offsetof(siginfo32_t, si_value) == 0x14); + +CHECK_SI_OFFSET(_rt); +CHECK_SI_SIZE (_rt, 3*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_pid) == 0x0C); +static_assert(offsetof(siginfo32_t, si_uid) == 0x10); +static_assert(offsetof(siginfo32_t, si_value) == 0x14); + +CHECK_SI_OFFSET(_sigchld); +CHECK_SI_SIZE (_sigchld, 5*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_pid) == 0x0C); +static_assert(offsetof(siginfo32_t, si_uid) == 0x10); +static_assert(offsetof(siginfo32_t, si_status) == 0x14); +static_assert(offsetof(siginfo32_t, si_utime) == 0x18); +static_assert(offsetof(siginfo32_t, si_stime) == 0x1C); + +CHECK_SI_OFFSET(_sigfault); +CHECK_SI_SIZE (_sigfault, 4*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_addr) == 0x0C); + +static_assert(offsetof(siginfo32_t, si_trapno) == 0x10); + +static_assert(offsetof(siginfo32_t, si_addr_lsb) == 0x10); + +static_assert(offsetof(siginfo32_t, si_lower) == 0x14); +static_assert(offsetof(siginfo32_t, si_upper) == 0x18); + +static_assert(offsetof(siginfo32_t, si_pkey) == 0x14); + +static_assert(offsetof(siginfo32_t, si_perf_data) == 0x10); +static_assert(offsetof(siginfo32_t, si_perf_type) == 0x14); +static_assert(offsetof(siginfo32_t, si_perf_flags) == 0x18); + +CHECK_SI_OFFSET(_sigpoll); +CHECK_SI_SIZE (_sigpoll, 2*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_band) == 0x0C); +static_assert(offsetof(siginfo32_t, si_fd) == 0x10); + +CHECK_SI_OFFSET(_sigsys); +CHECK_SI_SIZE (_sigsys, 3*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_call_addr) == 0x0C); +static_assert(offsetof(siginfo32_t, si_syscall) == 0x10); +static_assert(offsetof(siginfo32_t, si_arch) == 0x14); + +/* any new si_fields should be added here */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ff9c55064223..13a1e6083837 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -381,3 +381,130 @@ badframe: return 0; } #endif /* CONFIG_X86_X32_ABI */ + +#ifdef CONFIG_COMPAT +void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) +{ + if (!act) + return; + + if (in_ia32_syscall()) + act->sa.sa_flags |= SA_IA32_ABI; + if (in_x32_syscall()) + act->sa.sa_flags |= SA_X32_ABI; +} +#endif /* CONFIG_COMPAT */ + +/* +* If adding a new si_code, there is probably new data in +* the siginfo. Make sure folks bumping the si_code +* limits also have to look at this code. Make sure any +* new fields are handled in copy_siginfo_to_user32()! 
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 9);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo_t) == 8);
+
+/*
+* The offsets of all the (unioned) si_fields are fixed
+* in the ABI, of course. Make sure none of them ever
+* move and are always at the beginning:
+*/
+static_assert(offsetof(siginfo_t, si_signo) == 0);
+static_assert(offsetof(siginfo_t, si_errno) == 4);
+static_assert(offsetof(siginfo_t, si_code) == 8);
+
+/*
+* Ensure that the size of each si_field never changes.
+* If it does, it is a sign that the
+* copy_siginfo_to_user32() code below needs to be updated
+* along with the size in the CHECK_SI_SIZE().
+*
+* We repeat this check for both the generic and compat
+* siginfos.
+*
+* Note: it is OK for these to grow as long as the whole
+* structure stays within the padding size (checked
+* above).
+*/
+
+#define CHECK_SI_OFFSET(name) \
+	static_assert(offsetof(siginfo_t, _sifields) == \
+		      offsetof(siginfo_t, _sifields.name))
+#define CHECK_SI_SIZE(name, size) \
+	static_assert(sizeof_field(siginfo_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+
+CHECK_SI_OFFSET(_timer);
+CHECK_SI_SIZE (_timer, 6*sizeof(int));
+static_assert(offsetof(siginfo_t, si_tid) == 0x10);
+static_assert(offsetof(siginfo_t, si_overrun) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_status) == 0x18);
+static_assert(offsetof(siginfo_t, si_utime) == 0x20);
+static_assert(offsetof(siginfo_t, si_stime) == 0x28);
+
+#ifdef CONFIG_X86_X32_ABI
+/* no _sigchld_x32 in the generic siginfo_t */
+static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) ==
+	      7*sizeof(int));
+static_assert(offsetof(compat_siginfo_t, _sifields) ==
+	      offsetof(compat_siginfo_t, _sifields._sigchld_x32));
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18);
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20);
+#endif
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_addr) == 0x10);
+
+static_assert(offsetof(siginfo_t, si_trapno) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_lower) == 0x20);
+static_assert(offsetof(siginfo_t, si_upper) == 0x28);
+
+static_assert(offsetof(siginfo_t, si_pkey) == 0x20);
+
+static_assert(offsetof(siginfo_t, si_perf_data) == 0x18);
+static_assert(offsetof(siginfo_t, si_perf_type) == 0x20);
+static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_band) == 0x10);
+static_assert(offsetof(siginfo_t, si_fd) == 0x18); + +CHECK_SI_OFFSET(_sigsys); +CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); +static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_syscall) == 0x18); +static_assert(offsetof(siginfo_t, si_arch) == 0x1C); + +/* any new si_fields should be added here */ diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c deleted file mode 100644 index 879ef8c72f5c..000000000000 --- a/arch/x86/kernel/signal_compat.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/compat.h> -#include <linux/uaccess.h> -#include <linux/ptrace.h> - -/* - * The compat_siginfo_t structure and handing code is very easy - * to break in several ways. It must always be updated when new - * updates are made to the main siginfo_t, and - * copy_siginfo_to_user32() must be updated when the - * (arch-independent) copy_siginfo_to_user() is updated. - * - * It is also easy to put a new member in the compat_siginfo_t - * which has implicit alignment which can move internal structure - * alignment around breaking the ABI. This can happen if you, - * for instance, put a plain 64-bit value in there. - */ -static inline void signal_compat_build_tests(void) -{ - int _sifields_offset = offsetof(compat_siginfo_t, _sifields); - - /* - * If adding a new si_code, there is probably new data in - * the siginfo. Make sure folks bumping the si_code - * limits also have to look at this code. Make sure any - * new fields are handled in copy_siginfo_to_user32()! - */ - BUILD_BUG_ON(NSIGILL != 11); - BUILD_BUG_ON(NSIGFPE != 15); - BUILD_BUG_ON(NSIGSEGV != 9); - BUILD_BUG_ON(NSIGBUS != 5); - BUILD_BUG_ON(NSIGTRAP != 6); - BUILD_BUG_ON(NSIGCHLD != 6); - BUILD_BUG_ON(NSIGSYS != 2); - - /* This is part of the ABI and can never change in size: */ - BUILD_BUG_ON(sizeof(siginfo_t) != 128); - BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); - - /* This is a part of the ABI and can never change in alignment */ - BUILD_BUG_ON(__alignof__(siginfo_t) != 8); - BUILD_BUG_ON(__alignof__(compat_siginfo_t) != 4); - - /* - * The offsets of all the (unioned) si_fields are fixed - * in the ABI, of course. Make sure none of them ever - * move and are always at the beginning: - */ - BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); -#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) - - BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0); - BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4); - BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8); - - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8); - /* - * Ensure that the size of each si_field never changes. - * If it does, it is a sign that the - * copy_siginfo_to_user32() code below needs to updated - * along with the size in the CHECK_SI_SIZE(). - * - * We repeat this check for both the generic and compat - * siginfos. - * - * Note: it is OK for these to grow as long as the whole - * structure stays within the padding size (checked - * above). 
- */ -#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name)) -#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name)) - - CHECK_CSI_OFFSET(_kill); - CHECK_CSI_SIZE (_kill, 2*sizeof(int)); - CHECK_SI_SIZE (_kill, 2*sizeof(int)); - - BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); - BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); - - CHECK_CSI_OFFSET(_timer); - CHECK_CSI_SIZE (_timer, 3*sizeof(int)); - CHECK_SI_SIZE (_timer, 6*sizeof(int)); - - BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10); - BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14); - BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); - - CHECK_CSI_OFFSET(_rt); - CHECK_CSI_SIZE (_rt, 3*sizeof(int)); - CHECK_SI_SIZE (_rt, 4*sizeof(int)); - - BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); - BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); - BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); - - CHECK_CSI_OFFSET(_sigchld); - CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); - CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); - - BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); - BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); - BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18); - BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20); - BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C); - -#ifdef CONFIG_X86_X32_ABI - CHECK_CSI_OFFSET(_sigchld_x32); - CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); - /* no _sigchld_x32 in the generic siginfo_t */ - BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20); -#endif - - CHECK_CSI_OFFSET(_sigfault); - CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); - CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); - - BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C); - - BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10); - - BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10); - - BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20); - BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18); - - BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); - - BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); - BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); - BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24); - BUILD_BUG_ON(offsetof(compat_siginfo_t, 
si_perf_data) != 0x10); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18); - - CHECK_CSI_OFFSET(_sigpoll); - CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); - CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); - - BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10); - BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10); - - CHECK_CSI_OFFSET(_sigsys); - CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); - CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); - - BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10); - BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18); - BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14); - - /* any new si_fields should be added here */ -} - -void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) -{ - signal_compat_build_tests(); - - if (!act) - return; - - if (in_ia32_syscall()) - act->sa.sa_flags |= SA_IA32_ABI; - if (in_x32_syscall()) - act->sa.sa_flags |= SA_X32_ABI; -} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 55cad72715d9..9013bb28255a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1833,7 +1833,7 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead(); /* Only returns on failure */ + mwait_play_dead(); if (cpuidle_play_dead()) hlt_play_dead(); } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 2ebc338980bc..b70670a98597 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -9,6 +9,7 @@ enum insn_type { NOP = 1, /* site cond-call */ JMP = 2, /* tramp / site tail-call */ RET = 3, /* tramp / site cond-tail-call */ + JCC = 4, }; /* @@ -25,12 +26,40 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; +static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */ +{ + u8 ret = 0; + + if (insn[0] == 0x0f) { + u8 tmp = insn[1]; + if ((tmp & 0xf0) == 0x80) + ret = tmp; + } + + return ret; +} + +extern void __static_call_return(void); + +asm (".global __static_call_return\n\t" + ".type __static_call_return, @function\n\t" + ASM_FUNC_ALIGN "\n\t" + "__static_call_return:\n\t" + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + "ret; int3\n\t" + ".size __static_call_return, . 
- __static_call_return \n\t"); + static void __ref __static_call_transform(void *insn, enum insn_type type, void *func, bool modinit) { const void *emulate = NULL; int size = CALL_INSN_SIZE; const void *code; + u8 op, buf[6]; + + if ((type == JMP || type == RET) && (op = __is_Jcc(insn))) + type = JCC; switch (type) { case CALL: @@ -57,6 +86,20 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, else code = &retinsn; break; + + case JCC: + if (!func) { + func = __static_call_return; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + func = x86_return_thunk; + } + + buf[0] = 0x0f; + __text_gen_insn(buf+1, op, insn+1, func, 5); + code = buf; + size = 6; + + break; } if (memcmp(insn, code, size) == 0) @@ -68,9 +111,9 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, text_poke_bp(insn, code, size, emulate); } -static void __static_call_validate(void *insn, bool tail, bool tramp) +static void __static_call_validate(u8 *insn, bool tail, bool tramp) { - u8 opcode = *(u8 *)insn; + u8 opcode = insn[0]; if (tramp && memcmp(insn+5, tramp_ud, 3)) { pr_err("trampoline signature fail"); @@ -79,7 +122,8 @@ static void __static_call_validate(void *insn, bool tail, bool tramp) if (tail) { if (opcode == JMP32_INSN_OPCODE || - opcode == RET_INSN_OPCODE) + opcode == RET_INSN_OPCODE || + __is_Jcc(insn)) return; } else { if (opcode == CALL_INSN_OPCODE || diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 3c883e064242..3ffbab0081f4 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -12,6 +12,7 @@ #include <asm/ldt.h> #include <asm/processor.h> #include <asm/proto.h> +#include <asm/gsseg.h> #include "tls.h" diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a78e73da4a74..344698852146 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -48,6 +48,8 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static int __read_mostly tsc_force_recalibrate; + static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; @@ -215,7 +217,7 @@ static void __init cyc2ns_init_secondary_cpus(void) /* * Scheduler clock - returns current time in nanosec units. 
*/ -u64 native_sched_clock(void) +noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); @@ -248,7 +250,7 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) +noinstr u64 sched_clock(void) { return paravirt_sched_clock(); } @@ -258,8 +260,7 @@ bool using_native_sched_clock(void) return static_call_query(pv_sched_clock) == native_sched_clock; } #else -unsigned long long -sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif @@ -292,6 +293,7 @@ __setup("notsc", notsc_setup); static int no_sched_irq_time; static int no_tsc_watchdog; +static int tsc_as_watchdog; static int __init tsc_setup(char *str) { @@ -301,8 +303,22 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); - if (!strcmp(str, "nowatchdog")) + if (!strcmp(str, "nowatchdog")) { no_tsc_watchdog = 1; + if (tsc_as_watchdog) + pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", + __func__); + tsc_as_watchdog = 0; + } + if (!strcmp(str, "recalibrate")) + tsc_force_recalibrate = 1; + if (!strcmp(str, "watchdog")) { + if (no_tsc_watchdog) + pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", + __func__); + else + tsc_as_watchdog = 1; + } return 1; } @@ -912,8 +928,7 @@ void recalibrate_cpu_khz(void) cpu_khz_old, cpu_khz); #endif } - -EXPORT_SYMBOL(recalibrate_cpu_khz); +EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; @@ -1186,6 +1201,12 @@ static void __init tsc_disable_clocksource_watchdog(void) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } +bool tsc_clocksource_watchdog_disabled(void) +{ + return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && + tsc_as_watchdog && !no_tsc_watchdog; +} + static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) @@ -1374,6 +1395,25 @@ restart: else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + /* Will hit this only if tsc_force_recalibrate has been set */ + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + + /* Warn if the deviation exceeds 500 ppm */ + if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { + pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); + pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + + pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", + hpet ? 
"HPET" : "PM_TIMER", + (unsigned long)freq / 1000, + (unsigned long)freq % 1000); + + return; + } + /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; @@ -1407,8 +1447,10 @@ static int __init init_tsc_clocksource(void) if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; - if (tsc_unstable) - goto unreg; + if (tsc_unstable) { + clocksource_unregister(&clocksource_tsc_early); + return 0; + } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; @@ -1421,9 +1463,10 @@ static int __init init_tsc_clocksource(void) if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); -unreg: clocksource_unregister(&clocksource_tsc_early); - return 0; + + if (!tsc_force_recalibrate) + return 0; } schedule_delayed_work(&tsc_irqwork, 0); @@ -1510,6 +1553,11 @@ void __init tsc_early_init(void) void __init tsc_init(void) { + if (!cpu_feature_enabled(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + /* * native_calibrate_cpu_early can only calibrate using methods that are * available early in boot. @@ -1517,11 +1565,6 @@ void __init tsc_init(void) if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) x86_platform.calibrate_cpu = native_calibrate_cpu; - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } - if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ if (!determine_cpu_tsc_frequencies(false)) { diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 2e0ee14229bf..25f155205770 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -129,7 +129,6 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT SOFTIRQENTRY_TEXT |