Diffstat (limited to 'arch/x86/kernel')
55 files changed, 2342 insertions, 1720 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index dd61752f4c96..4070a01c11b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,7 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_sev.o = -pg +CFLAGS_REMOVE_rethook.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 1328c221af30..6dfecb27b846 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> #include <asm/hypervisor.h> +#include <asm/smp.h> #include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" @@ -127,7 +128,13 @@ int x86_acpi_suspend_lowlevel(void) * value is in the actual %rsp register. */ current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack); - smpboot_control = smp_processor_id(); + /* + * Ensure the CPU knows which one it is when it comes back, if + * it isn't in parallel mode and expected to work that out for + * itself. + */ + if (!(smpboot_control & STARTUP_PARALLEL_MASK)) + smpboot_control = smp_processor_id(); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0L; diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 171a40c74db6..054c15a2f860 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -12,7 +12,6 @@ extern int wakeup_pmode_return; extern u8 wake_sleep_flags; -extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); extern void do_suspend_lowlevel(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 18f16e93838f..72646d75b6ff 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -37,11 +37,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched); #define MAX_PATCH_LEN (255-1) -static int __initdata_or_module debug_alternative; +#define DA_ALL (~0) +#define DA_ALT 0x01 +#define DA_RET 0x02 +#define DA_RETPOLINE 0x04 +#define DA_ENDBR 0x08 +#define DA_SMP 0x10 + +static unsigned int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { - debug_alternative = 1; + if (str && *str == '=') + str++; + + if (!str || kstrtouint(str, 0, &debug_alternative)) + debug_alternative = DA_ALL; + return 1; } __setup("debug-alternative", debug_alt); @@ -55,15 +67,15 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); -#define DPRINTK(fmt, args...) \ +#define DPRINTK(type, fmt, args...) \ do { \ - if (debug_alternative) \ + if (debug_alternative & DA_##type) \ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) -#define DUMP_BYTES(buf, len, fmt, args...) \ +#define DUMP_BYTES(type, buf, len, fmt, args...) 
\ do { \ - if (unlikely(debug_alternative)) { \ + if (unlikely(debug_alternative & DA_##type)) { \ int j; \ \ if (!(len)) \ @@ -86,6 +98,11 @@ static const unsigned char x86nops[] = BYTES_NOP6, BYTES_NOP7, BYTES_NOP8, +#ifdef CONFIG_64BIT + BYTES_NOP9, + BYTES_NOP10, + BYTES_NOP11, +#endif }; const unsigned char * const x86_nops[ASM_NOP_MAX+1] = @@ -99,19 +116,44 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = x86nops + 1 + 2 + 3 + 4 + 5, x86nops + 1 + 2 + 3 + 4 + 5 + 6, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +#ifdef CONFIG_64BIT + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, +#endif }; -/* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void __init_or_module add_nops(void *insns, unsigned int len) +/* + * Fill the buffer with a single effective instruction of size @len. + * + * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) + * for every single-byte NOP, try to generate the maximally available NOP of + * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for + * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and + * *jump* over instead of executing long and daft NOPs. + */ +static void __init_or_module add_nop(u8 *instr, unsigned int len) { - while (len > 0) { - unsigned int noplen = len; - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); - insns += noplen; - len -= noplen; + u8 *target = instr + len; + + if (!len) + return; + + if (len <= ASM_NOP_MAX) { + memcpy(instr, x86_nops[len], len); + return; } + + if (len < 128) { + __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); + instr += JMP8_INSN_SIZE; + } else { + __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); + instr += JMP32_INSN_SIZE; + } + + for (;instr < target; instr++) + *instr = INT3_INSN_OPCODE; } extern s32 __retpoline_sites[], __retpoline_sites_end[]; @@ -123,133 +165,223 @@ extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); /* - * Are we looking at a near JMP with a 1 or 4-byte displacement. + * Matches NOP and NOPL, not any of the other possible NOPs. */ -static inline bool is_jmp(const u8 opcode) +static bool insn_is_nop(struct insn *insn) { - return opcode == 0xeb || opcode == 0xe9; + /* Anything NOP, but no REP NOP */ + if (insn->opcode.bytes[0] == 0x90 && + (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) + return true; + + /* NOPL */ + if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) + return true; + + /* TODO: more nops */ + + return false; } -static void __init_or_module -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) +/* + * Find the offset of the first non-NOP instruction starting at @offset + * but no further than @len. 
+ */ +static int skip_nops(u8 *instr, int offset, int len) { - u8 *next_rip, *tgt_rip; - s32 n_dspl, o_dspl; - int repl_len; + struct insn insn; - if (a->replacementlen != 5) - return; + for (; offset < len; offset += insn.length) { + if (insn_decode_kernel(&insn, &instr[offset])) + break; - o_dspl = *(s32 *)(insn_buff + 1); + if (!insn_is_nop(&insn)) + break; + } - /* next_rip of the replacement JMP */ - next_rip = repl_insn + a->replacementlen; - /* target rip of the replacement JMP */ - tgt_rip = next_rip + o_dspl; - n_dspl = tgt_rip - orig_insn; + return offset; +} - DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); +/* + * Optimize a sequence of NOPs, possibly preceded by an unconditional jump + * to the end of the NOP sequence into a single NOP. + */ +static bool __init_or_module +__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) +{ + int i = *next - insn->length; - if (tgt_rip - orig_insn >= 0) { - if (n_dspl - 2 <= 127) - goto two_byte_jmp; - else - goto five_byte_jmp; - /* negative offset */ - } else { - if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) - goto two_byte_jmp; - else - goto five_byte_jmp; + switch (insn->opcode.bytes[0]) { + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + *prev = i; + *target = *next + insn->immediate.value; + return false; } -two_byte_jmp: - n_dspl -= 2; + if (insn_is_nop(insn)) { + int nop = i; - insn_buff[0] = 0xeb; - insn_buff[1] = (s8)n_dspl; - add_nops(insn_buff + 2, 3); + *next = skip_nops(instr, *next, len); + if (*target && *next == *target) + nop = *prev; - repl_len = 2; - goto done; + add_nop(instr + nop, *next - nop); + DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); + return true; + } + + *target = 0; + return false; +} -five_byte_jmp: - n_dspl -= 5; +/* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +{ + int prev, target = 0; - insn_buff[0] = 0xe9; - *(s32 *)&insn_buff[1] = n_dspl; + for (int next, i = 0; i < len; i = next) { + struct insn insn; - repl_len = 5; + if (insn_decode_kernel(&insn, &instr[i])) + return; -done: + next = i + insn.length; - DPRINTK("final displ: 0x%08x, JMP 0x%lx", - n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); + __optimize_nops(instr, len, &insn, &next, &prev, &target); + } } /* - * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * In this context, "source" is where the instructions are placed in the + * section .altinstr_replacement, for example during kernel build by the + * toolchain. + * "Destination" is where the instructions are being patched in by this + * machinery. * - * @instr: instruction byte stream - * @instrlen: length of the above - * @off: offset within @instr where the first NOP has been detected + * The source offset is: * - * Return: number of NOPs found (and replaced). 
+ * src_imm = target - src_next_ip (1) + * + * and the target offset is: + * + * dst_imm = target - dst_next_ip (2) + * + * so rework (1) as an expression for target like: + * + * target = src_imm + src_next_ip (1a) + * + * and substitute in (2) to get: + * + * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) + * + * Now, since the instruction stream is 'identical' at src and dst (it + * is being copied after all) it can be stated that: + * + * src_next_ip = src + ip_offset + * dst_next_ip = dst + ip_offset (4) + * + * Substitute (4) in (3) and observe ip_offset being cancelled out to + * obtain: + * + * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) + * = src_imm + src - dst + ip_offset - ip_offset + * = src_imm + src - dst (5) + * + * IOW, only the relative displacement of the code block matters. */ -static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) -{ - unsigned long flags; - int i = off, nnops; - while (i < instrlen) { - if (instr[i] != 0x90) - break; +#define apply_reloc_n(n_, p_, d_) \ + do { \ + s32 v = *(s##n_ *)(p_); \ + v += (d_); \ + BUG_ON((v >> 31) != (v >> (n_-1))); \ + *(s##n_ *)(p_) = (s##n_)v; \ + } while (0) + - i++; +static __always_inline +void apply_reloc(int n, void *ptr, uintptr_t diff) +{ + switch (n) { + case 1: apply_reloc_n(8, ptr, diff); break; + case 2: apply_reloc_n(16, ptr, diff); break; + case 4: apply_reloc_n(32, ptr, diff); break; + default: BUG(); } +} - nnops = i - off; +static __always_inline +bool need_reloc(unsigned long offset, u8 *src, size_t src_len) +{ + u8 *target = src + offset; + /* + * If the target is inside the patched block, it's relative to the + * block itself and does not need relocation. + */ + return (target < src || target > src + src_len); +} - if (nnops <= 1) - return nnops; +static void __init_or_module noinline +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +{ + int prev, target = 0; - local_irq_save(flags); - add_nops(instr + off, nnops); - local_irq_restore(flags); + for (int next, i = 0; i < len; i = next) { + struct insn insn; - DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) + return; - return nnops; -} + next = i + insn.length; -/* - * "noinline" to cause control flow change and thus invalidate I$ and - * cause refetch after modification. - */ -static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) -{ - struct insn insn; - int i = 0; + if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) + continue; - /* - * Jump over the non-NOP insns and optimize single-byte NOPs into bigger - * ones. - */ - for (;;) { - if (insn_decode_kernel(&insn, &instr[i])) - return; + switch (insn.opcode.bytes[0]) { + case 0x0f: + if (insn.opcode.bytes[1] < 0x80 || + insn.opcode.bytes[1] > 0x8f) + break; - /* - * See if this and any potentially following NOPs can be - * optimized. - */ - if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, len, i); - else - i += insn.length; + fallthrough; /* Jcc.d32 */ + case 0x70 ... 0x7f: /* Jcc.d8 */ + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case CALL_INSN_OPCODE: + if (need_reloc(next + insn.immediate.value, src, src_len)) { + apply_reloc(insn.immediate.nbytes, + buf + i + insn_offset_immediate(&insn), + src - dest); + } - if (i >= len) - return; + /* + * Where possible, convert JMP.d32 into JMP.d8. 
+ */ + if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { + s32 imm = insn.immediate.value; + imm += src - dest; + imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; + if ((imm >> 31) == (imm >> 7)) { + buf[i+0] = JMP8_INSN_OPCODE; + buf[i+1] = (s8)imm; + + memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); + } + } + break; + } + + if (insn_rip_relative(&insn)) { + if (need_reloc(next + insn.displacement.value, src, src_len)) { + apply_reloc(insn.displacement.nbytes, + buf + i + insn_offset_displacement(&insn), + src - dest); + } + } } } @@ -270,7 +402,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; - DPRINTK("alt table %px, -> %px", start, end); + DPRINTK(ALT, "alt table %px, -> %px", start, end); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -294,47 +426,31 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. */ - if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) - goto next; + if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { + optimize_nops(instr, a->instrlen); + continue; + } - DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", (a->flags & ALT_FLAG_NOT) ? "!" : "", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); - memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; - /* - * 0xe8 is a relative jump; fix the offset. - * - * Instruction length is checked before the opcode to avoid - * accessing uninitialized bytes for zero-length replacements. 
- */ - if (a->replacementlen == 5 && *insn_buff == 0xe8) { - *(s32 *)(insn_buff + 1) += replacement - instr; - DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", - *(s32 *)(insn_buff + 1), - (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); - } - - if (a->replacementlen && is_jmp(replacement[0])) - recompute_jump(a, instr, replacement, insn_buff); - for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); + apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); - text_poke_early(instr, insn_buff, insn_buff_sz); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); -next: - optimize_nops(instr, a->instrlen); + text_poke_early(instr, insn_buff, insn_buff_sz); } } @@ -555,15 +671,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) continue; } - DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(bytes, len); - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } @@ -590,13 +706,12 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; + /* Patch the custom return thunks... */ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { - if (x86_return_thunk == __x86_return_thunk) - return -1; - i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { + /* ... or patch them out if not needed. */ bytes[i++] = RET_INSN_OPCODE; } @@ -609,6 +724,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; + /* + * Do not patch out the default return thunks if those needed are the + * ones generated by the compiler. 
+ */ + if (cpu_feature_enabled(X86_FEATURE_RETHUNK) && + (x86_return_thunk == __x86_return_thunk)) + return; + for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; @@ -630,14 +753,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) addr, dest, 5, addr)) continue; - DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } @@ -655,7 +778,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_endbr(void *addr, bool warn) +static void __init_or_module poison_endbr(void *addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); @@ -667,13 +790,13 @@ static void poison_endbr(void *addr, bool warn) return; } - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); /* * When we have IBT, the lack of ENDBR will trigger #CP */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); text_poke_early(addr, &poison, 4); } @@ -1148,7 +1271,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); @@ -1225,6 +1348,20 @@ int alternatives_text_reserved(void *start, void *end) #endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT + +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ +static void __init_or_module add_nops(void *insns, unsigned int len) +{ + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, x86_nops[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) { @@ -1332,6 +1469,35 @@ static noinline void __init int3_selftest(void) unregister_die_notifier(&int3_exception_nb); } +static __initdata int __alt_reloc_selftest_addr; + +__visible noinline void __init __alt_reloc_selftest(void *arg) +{ + WARN_ON(arg != &__alt_reloc_selftest_addr); +} + +static noinline void __init alt_reloc_selftest(void) +{ + /* + * Tests apply_relocation(). + * + * This has a relative immediate (CALL) in a place other than the first + * instruction and additionally on x86_64 we get a RIP-relative LEA: + * + * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c + * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 + * + * Getting this wrong will either crash and burn or tickle the WARN + * above. 
+ */ + asm_inline volatile ( + ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) + : /* output */ + : [mem] "m" (__alt_reloc_selftest_addr) + : _ASM_ARG1 + ); +} + void __init alternative_instructions(void) { int3_selftest(); @@ -1419,6 +1585,8 @@ void __init alternative_instructions(void) restart_nmi(); alternatives_patched = 1; + + alt_reloc_selftest(); } /** @@ -1954,6 +2122,16 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries atomic_set_release(&bp_desc.refs, 1); /* + * Function tracing can enable thousands of places that need to be + * updated. This can take quite some time, and with full kernel debugging + * enabled, this could cause the softlockup watchdog to trigger. + * This function gets called every 256 entries added to be patched. + * Call cond_resched() here to make sure that other tasks can get scheduled + * while processing all the functions being patched. + */ + cond_resched(); + + /* * Corresponding read barrier in int3 notifier for making sure the * nr_entries and handler are correctly ordered wrt. patching. */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 7e331e8f3692..035a3db5330b 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,28 +15,31 @@ #include <linux/pci_ids.h> #include <asm/amd_nb.h> -#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 -#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 -#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 -#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 -#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 -#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 -#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 -#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 -#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 -#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 -#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e -#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 -#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 -#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 +#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 +#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 +#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb + +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec +#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 +#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 +#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 +#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc +#define 
PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -53,6 +56,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, {} }; @@ -81,6 +85,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, {} }; @@ -101,6 +106,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, {} }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 770557110051..af49e24b46a4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -101,6 +101,9 @@ static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; */ static bool virt_ext_dest_id __ro_after_init; +/* For parallel bootup. */ +unsigned long apic_mmio_base __ro_after_init; + /* * Map cpu index to physical APIC ID */ @@ -2163,6 +2166,7 @@ void __init register_lapic_address(unsigned long address) if (!x2apic_mode) { set_fixmap_nocache(FIX_APIC_BASE, address); + apic_mmio_base = APIC_BASE; apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", APIC_BASE, address); } @@ -2376,7 +2380,7 @@ static int nr_logical_cpuids = 1; /* * Used to store mapping between logical CPU IDs and APIC IDs. */ -static int cpuid_to_apicid[] = { +int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; @@ -2386,20 +2390,31 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id) } #ifdef CONFIG_SMP -/** - * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread - * @apicid: APIC ID to check +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) +{ + /* Isolate the SMT bit(s) in the APICID and check for 0 */ + u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1; + + if (smp_num_siblings == 1 || !(apicid & mask)) + cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); +} + +/* + * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid + * during early boot. Initialize the primary thread mask before SMP + * bringup. 
*/ -bool apic_id_is_primary_thread(unsigned int apicid) +static int __init smp_init_primary_thread_mask(void) { - u32 mask; + unsigned int cpu; - if (smp_num_siblings == 1) - return true; - /* Isolate the SMT bit(s) in the APICID and check for 0 */ - mask = (1U << (fls(smp_num_siblings) - 1)) - 1; - return !(apicid & mask); + for (cpu = 0; cpu < nr_logical_cpuids; cpu++) + cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); + return 0; } +early_initcall(smp_init_primary_thread_mask); +#else +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } #endif /* @@ -2544,6 +2559,9 @@ int generic_processor_info(int apicid, int version) set_cpu_present(cpu, true); num_processors++; + if (system_state != SYSTEM_BOOTING) + cpu_mark_primary_thread(cpu, apicid); + return cpu; } diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6bde05a86b4e..896bc41cb2ba 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -97,7 +97,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 482855227964..d9384d5b4b8e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -546,7 +546,6 @@ unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); /* The following values are used for the per node hub info struct */ -static __initdata unsigned short *_node_to_pnode; static __initdata unsigned short _min_socket, _max_socket; static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; static __initdata struct uv_gam_range_entry *uv_gre_table; @@ -554,6 +553,7 @@ static __initdata struct uv_gam_parameters *uv_gp_table; static __initdata unsigned short *_socket_to_node; static __initdata unsigned short *_socket_to_pnode; static __initdata unsigned short *_pnode_to_socket; +static __initdata unsigned short *_node_to_socket; static __initdata struct uv_gam_range_s *_gr_table; @@ -617,7 +617,8 @@ static __init void build_uv_gr_table(void) bytes = _gr_table_len * sizeof(struct uv_gam_range_s); grt = kzalloc(bytes, GFP_KERNEL); - BUG_ON(!grt); + if (WARN_ON_ONCE(!grt)) + return; _gr_table = grt; for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { @@ -1022,7 +1023,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, switch (index) { case UVY_MMIOH0: mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0; - nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK; n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; min_nasid = min_pnode; max_nasid = max_pnode; @@ -1030,7 +1031,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, break; case UVY_MMIOH1: mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1; - nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK; n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; min_nasid = min_pnode; max_nasid = max_pnode; @@ -1038,7 +1039,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, break; case UVX_MMIOH0: mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0; - nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK; n = 
UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; min_nasid = min_pnode * 2; max_nasid = max_pnode * 2; @@ -1046,7 +1047,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, break; case UVX_MMIOH1: mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1; - nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK; n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; min_nasid = min_pnode * 2; max_nasid = max_pnode * 2; @@ -1072,8 +1073,9 @@ static void __init calc_mmioh_map(enum mmioh_arch index, /* Invalid NASID check */ if (nasid < min_nasid || max_nasid < nasid) { - pr_err("UV:%s:Invalid NASID:%x (range:%x..%x)\n", - __func__, index, min_nasid, max_nasid); + /* Not an error: unused table entries get "poison" values */ + pr_debug("UV:%s:Invalid NASID(%x):%x (range:%x..%x)\n", + __func__, index, nasid, min_nasid, max_nasid); nasid = -1; } @@ -1292,6 +1294,7 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->nasid_shift = uv_cpuid.nasid_shift; hi->min_pnode = _min_pnode; hi->min_socket = _min_socket; + hi->node_to_socket = _node_to_socket; hi->pnode_to_socket = _pnode_to_socket; hi->socket_to_node = _socket_to_node; hi->socket_to_pnode = _socket_to_pnode; @@ -1348,7 +1351,7 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; unsigned long lgre = 0, gend = 0; int index = 0; - int sock_min = 999999, pnode_min = 99999; + int sock_min = INT_MAX, pnode_min = INT_MAX; int sock_max = -1, pnode_max = -1; uv_gre_table = gre; @@ -1459,11 +1462,37 @@ static int __init decode_uv_systab(void) return 0; } +/* + * Given a bitmask 'bits' representing presnt blades, numbered + * starting at 'base', masking off unused high bits of blade number + * with 'mask', update the minimum and maximum blade numbers that we + * have found. (Masking with 'mask' necessary because of BIOS + * treatment of system partitioning when creating this table we are + * interpreting.) 
+ */ +static inline void blade_update_min_max(unsigned long bits, int base, int mask, int *min, int *max) +{ + int first, last; + + if (!bits) + return; + first = (base + __ffs(bits)) & mask; + last = (base + __fls(bits)) & mask; + + if (*min > first) + *min = first; + if (*max < last) + *max = last; +} + /* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) { unsigned long np; int i, uv_pb = 0; + int sock_min = INT_MAX, sock_max = -1, s_mask; + + s_mask = (1 << uv_cpuid.n_skt) - 1; if (UVH_NODE_PRESENT_TABLE) { pr_info("UV: NODE_PRESENT_DEPTH = %d\n", @@ -1471,35 +1500,82 @@ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); - uv_pb += hweight64(np); + blade_update_min_max(np, i * 64, s_mask, &sock_min, &sock_max); } } if (UVH_NODE_PRESENT_0) { np = uv_read_local_mmr(UVH_NODE_PRESENT_0); pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np); - uv_pb += hweight64(np); + blade_update_min_max(np, 0, s_mask, &sock_min, &sock_max); } if (UVH_NODE_PRESENT_1) { np = uv_read_local_mmr(UVH_NODE_PRESENT_1); pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np); - uv_pb += hweight64(np); + blade_update_min_max(np, 64, s_mask, &sock_min, &sock_max); + } + + /* Only update if we actually found some bits indicating blades present */ + if (sock_max >= sock_min) { + _min_socket = sock_min; + _max_socket = sock_max; + uv_pb = sock_max - sock_min + 1; } if (uv_possible_blades != uv_pb) uv_possible_blades = uv_pb; - pr_info("UV: number nodes/possible blades %d\n", uv_pb); + pr_info("UV: number nodes/possible blades %d (%d - %d)\n", + uv_pb, sock_min, sock_max); +} + +static int __init alloc_conv_table(int num_elem, unsigned short **table) +{ + int i; + size_t bytes; + + bytes = num_elem * sizeof(*table[0]); + *table = kmalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!*table)) + return -ENOMEM; + for (i = 0; i < num_elem; i++) + ((unsigned short *)*table)[i] = SOCK_EMPTY; + return 0; } +/* Remove conversion table if it's 1:1 */ +#define FREE_1_TO_1_TABLE(tbl, min, max, max2) free_1_to_1_table(&tbl, #tbl, min, max, max2) + +static void __init free_1_to_1_table(unsigned short **tp, char *tname, int min, int max, int max2) +{ + int i; + unsigned short *table = *tp; + + if (table == NULL) + return; + if (max != max2) + return; + for (i = 0; i < max; i++) { + if (i != table[i]) + return; + } + kfree(table); + *tp = NULL; + pr_info("UV: %s is 1:1, conversion table removed\n", tname); +} + +/* + * Build Socket Tables + * If the number of nodes is >1 per socket, socket to node table will + * contain lowest node number on that socket. 
+ */ static void __init build_socket_tables(void) { struct uv_gam_range_entry *gre = uv_gre_table; - int num, nump; + int nums, numn, nump; int cpu, i, lnid; int minsock = _min_socket; int maxsock = _max_socket; int minpnode = _min_pnode; int maxpnode = _max_pnode; - size_t bytes; if (!gre) { if (is_uv2_hub() || is_uv3_hub()) { @@ -1507,39 +1583,36 @@ static void __init build_socket_tables(void) return; } pr_err("UV: Error: UVsystab address translations not available!\n"); - BUG(); + WARN_ON_ONCE(!gre); + return; } - /* Build socket id -> node id, pnode */ - num = maxsock - minsock + 1; - bytes = num * sizeof(_socket_to_node[0]); - _socket_to_node = kmalloc(bytes, GFP_KERNEL); - _socket_to_pnode = kmalloc(bytes, GFP_KERNEL); - + numn = num_possible_nodes(); nump = maxpnode - minpnode + 1; - bytes = nump * sizeof(_pnode_to_socket[0]); - _pnode_to_socket = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket); - - for (i = 0; i < num; i++) - _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY; - - for (i = 0; i < nump; i++) - _pnode_to_socket[i] = SOCK_EMPTY; + nums = maxsock - minsock + 1; + + /* Allocate and clear tables */ + if ((alloc_conv_table(nump, &_pnode_to_socket) < 0) + || (alloc_conv_table(nums, &_socket_to_pnode) < 0) + || (alloc_conv_table(numn, &_node_to_socket) < 0) + || (alloc_conv_table(nums, &_socket_to_node) < 0)) { + kfree(_pnode_to_socket); + kfree(_socket_to_pnode); + kfree(_node_to_socket); + return; + } /* Fill in pnode/node/addr conversion list values: */ - pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; i = gre->sockid - minsock; - /* Duplicate: */ - if (_socket_to_pnode[i] != SOCK_EMPTY) - continue; - _socket_to_pnode[i] = gre->pnode; + if (_socket_to_pnode[i] == SOCK_EMPTY) + _socket_to_pnode[i] = gre->pnode; i = gre->pnode - minpnode; - _pnode_to_socket[i] = gre->sockid; + if (_pnode_to_socket[i] == SOCK_EMPTY) + _pnode_to_socket[i] = gre->sockid; pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, @@ -1549,66 +1622,39 @@ static void __init build_socket_tables(void) /* Set socket -> node values: */ lnid = NUMA_NO_NODE; - for_each_present_cpu(cpu) { + for_each_possible_cpu(cpu) { int nid = cpu_to_node(cpu); int apicid, sockid; if (lnid == nid) continue; lnid = nid; + apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - _socket_to_node[sockid - minsock] = nid; - pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", - sockid, apicid, nid); - } - /* Set up physical blade to pnode translation from GAM Range Table: */ - bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); - _node_to_pnode = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!_node_to_pnode); + if (_socket_to_node[sockid - minsock] == SOCK_EMPTY) + _socket_to_node[sockid - minsock] = nid; - for (lnid = 0; lnid < num_possible_nodes(); lnid++) { - unsigned short sockid; + if (_node_to_socket[nid] == SOCK_EMPTY) + _node_to_socket[nid] = sockid; - for (sockid = minsock; sockid <= maxsock; sockid++) { - if (lnid == _socket_to_node[sockid - minsock]) { - _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock]; - break; - } - } - if (sockid > maxsock) { - pr_err("UV: socket for node %d not found!\n", lnid); - BUG(); - } + pr_info("UV: sid:%02x: apicid:%04x socket:%02d node:%03x s2n:%03x\n", + sockid, + apicid, + _node_to_socket[nid], + nid, + _socket_to_node[sockid - minsock]); } 
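The reworked boot_init_possible_blades() above no longer just counts set bits with hweight64(); it derives a [min, max] socket range from each NODE_PRESENT word via blade_update_min_max(). Below is a minimal userspace sketch of that derivation, with made-up example values, where __builtin_ctzll()/__builtin_clzll() stand in for the kernel's __ffs()/__fls():

/*
 * Sketch only: example presence word and socket-mask width are invented.
 */
#include <stdio.h>

static void update_min_max(unsigned long long bits, int base, int mask,
			   int *min, int *max)
{
	int first, last;

	if (!bits)
		return;

	first = (base + __builtin_ctzll(bits)) & mask;		/* lowest set bit  */
	last  = (base + (63 - __builtin_clzll(bits))) & mask;	/* highest set bit */

	if (*min > first)
		*min = first;
	if (*max < last)
		*max = last;
}

int main(void)
{
	/* pretend NODE_PRESENT_TABLE[0] reads back as this word */
	unsigned long long np = 0x0000000000000c30ULL;	/* sockets 4, 5, 10, 11 present */
	int s_mask = (1 << 6) - 1;			/* hypothetical uv_cpuid.n_skt = 6 */
	int sock_min = 0x7fffffff, sock_max = -1;

	update_min_max(np, 0, s_mask, &sock_min, &sock_max);
	printf("min %d max %d -> %d possible blades\n",
	       sock_min, sock_max, sock_max - sock_min + 1);
	return 0;
}

With the example word above this prints "min 4 max 11 -> 8 possible blades", i.e. the blade count now comes from the socket range rather than from the population count.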
/* - * If socket id == pnode or socket id == node for all nodes, + * If e.g. socket id == pnode for all pnodes, * system runs faster by removing corresponding conversion table. */ - pr_info("UV: Checking socket->node/pnode for identity maps\n"); - if (minsock == 0) { - for (i = 0; i < num; i++) - if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i]) - break; - if (i >= num) { - kfree(_socket_to_node); - _socket_to_node = NULL; - pr_info("UV: 1:1 socket_to_node table removed\n"); - } - } - if (minsock == minpnode) { - for (i = 0; i < num; i++) - if (_socket_to_pnode[i] != SOCK_EMPTY && - _socket_to_pnode[i] != i + minpnode) - break; - if (i >= num) { - kfree(_socket_to_pnode); - _socket_to_pnode = NULL; - pr_info("UV: 1:1 socket_to_pnode table removed\n"); - } - } + FREE_1_TO_1_TABLE(_socket_to_node, _min_socket, nums, numn); + FREE_1_TO_1_TABLE(_node_to_socket, _min_socket, nums, numn); + FREE_1_TO_1_TABLE(_socket_to_pnode, _min_pnode, nums, nump); + FREE_1_TO_1_TABLE(_pnode_to_socket, _min_pnode, nums, nump); } /* Check which reboot to use */ @@ -1692,12 +1738,13 @@ static __init int uv_system_init_hubless(void) static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; - int bytes, cpu, nodeid; - unsigned short min_pnode = 9999, max_pnode = 0; + int bytes, cpu, nodeid, bid; + unsigned short min_pnode = USHRT_MAX, max_pnode = 0; char *hub = is_uv5_hub() ? "UV500" : is_uv4_hub() ? "UV400" : is_uv3_hub() ? "UV300" : is_uv2_hub() ? "UV2000/3000" : NULL; + struct uv_hub_info_s **uv_hub_info_list_blade; if (!hub) { pr_err("UV: Unknown/unsupported UV hub\n"); @@ -1720,9 +1767,12 @@ static void __init uv_system_init_hub(void) build_uv_gr_table(); set_block_size(); uv_init_hub_info(&hub_info); - uv_possible_blades = num_possible_nodes(); - if (!_node_to_pnode) + /* If UV2 or UV3 may need to get # blades from HW */ + if (is_uv(UV2|UV3) && !uv_gre_table) boot_init_possible_blades(&hub_info); + else + /* min/max sockets set in decode_gam_rng_tbl */ + uv_possible_blades = (_max_socket - _min_socket) + 1; /* uv_num_possible_blades() is really the hub count: */ pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus()); @@ -1731,79 +1781,98 @@ static void __init uv_system_init_hub(void) hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); + /* + * __uv_hub_info_list[] is indexed by node, but there is only + * one hub_info structure per blade. First, allocate one + * structure per blade. Further down we create a per-node + * table (__uv_hub_info_list[]) pointing to hub_info + * structures for the correct blade. + */ + bytes = sizeof(void *) * uv_num_possible_blades(); - __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); - BUG_ON(!__uv_hub_info_list); + uv_hub_info_list_blade = kzalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!uv_hub_info_list_blade)) + return; bytes = sizeof(struct uv_hub_info_s); - for_each_node(nodeid) { + for_each_possible_blade(bid) { struct uv_hub_info_s *new_hub; - if (__uv_hub_info_list[nodeid]) { - pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid); - BUG(); + /* Allocate & fill new per hub info list */ + new_hub = (bid == 0) ? &uv_hub_info_node0 + : kzalloc_node(bytes, GFP_KERNEL, uv_blade_to_node(bid)); + if (WARN_ON_ONCE(!new_hub)) { + /* do not kfree() bid 0, which is statically allocated */ + while (--bid > 0) + kfree(uv_hub_info_list_blade[bid]); + kfree(uv_hub_info_list_blade); + return; } - /* Allocate new per hub info list */ - new_hub = (nodeid == 0) ? 
&uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid); - BUG_ON(!new_hub); - __uv_hub_info_list[nodeid] = new_hub; - new_hub = uv_hub_info_list(nodeid); - BUG_ON(!new_hub); + uv_hub_info_list_blade[bid] = new_hub; *new_hub = hub_info; /* Use information from GAM table if available: */ - if (_node_to_pnode) - new_hub->pnode = _node_to_pnode[nodeid]; + if (uv_gre_table) + new_hub->pnode = uv_blade_to_pnode(bid); else /* Or fill in during CPU loop: */ new_hub->pnode = 0xffff; - new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); + new_hub->numa_blade_id = bid; new_hub->memory_nid = NUMA_NO_NODE; new_hub->nr_possible_cpus = 0; new_hub->nr_online_cpus = 0; } + /* + * Now populate __uv_hub_info_list[] for each node with the + * pointer to the struct for the blade it resides on. + */ + + bytes = sizeof(void *) * num_possible_nodes(); + __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!__uv_hub_info_list)) { + for_each_possible_blade(bid) + /* bid 0 is statically allocated */ + if (bid != 0) + kfree(uv_hub_info_list_blade[bid]); + kfree(uv_hub_info_list_blade); + return; + } + + for_each_node(nodeid) + __uv_hub_info_list[nodeid] = uv_hub_info_list_blade[uv_node_to_blade_id(nodeid)]; + /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { - int apicid = per_cpu(x86_cpu_to_apicid, cpu); - int numa_node_id; + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + unsigned short bid; unsigned short pnode; - nodeid = cpu_to_node(cpu); - numa_node_id = numa_cpu_node(cpu); pnode = uv_apicid_to_pnode(apicid); + bid = uv_pnode_to_socket(pnode) - _min_socket; - uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); + uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list_blade[bid]; uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); - /* Init memoryless node: */ - if (nodeid != numa_node_id && - uv_hub_info_list(numa_node_id)->pnode == 0xffff) - uv_hub_info_list(numa_node_id)->pnode = pnode; - else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) + if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; } - for_each_node(nodeid) { - unsigned short pnode = uv_hub_info_list(nodeid)->pnode; + for_each_possible_blade(bid) { + unsigned short pnode = uv_hub_info_list_blade[bid]->pnode; - /* Add pnode info for pre-GAM list nodes without CPUs: */ - if (pnode == 0xffff) { - unsigned long paddr; + if (pnode == 0xffff) + continue; - paddr = node_start_pfn(nodeid) << PAGE_SHIFT; - pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); - uv_hub_info_list(nodeid)->pnode = pnode; - } min_pnode = min(pnode, min_pnode); max_pnode = max(pnode, max_pnode); - pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n", - nodeid, - uv_hub_info_list(nodeid)->pnode, - uv_hub_info_list(nodeid)->nr_possible_cpus); + pr_info("UV: HUB:%2d pn:%02x nrcpus:%d\n", + bid, + uv_hub_info_list_blade[bid]->pnode, + uv_hub_info_list_blade[bid]->nr_possible_cpus); } pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode); @@ -1811,6 +1880,9 @@ static void __init uv_system_init_hub(void) map_mmr_high(max_pnode); map_mmioh_high(min_pnode, max_pnode); + kfree(uv_hub_info_list_blade); + uv_hub_info_list_blade = NULL; + uv_nmi_setup(); uv_cpu_init(); uv_setup_proc_files(0); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 22ab13966427..8bb937331acb 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c 
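The uv_system_init_hub() rework just above allocates one hub_info structure per blade and makes the node-indexed table (__uv_hub_info_list[]) merely point into that per-blade array, so all nodes on a blade share one structure. A simplified model of that indirection, with invented types and an invented node_to_blade() mapping, purely for illustration:

#include <stdio.h>
#include <stdlib.h>

struct hub_info { int pnode; int nr_possible_cpus; };

#define NUM_BLADES 2
#define NUM_NODES  4

/* hypothetical node -> blade mapping: two nodes per blade */
static int node_to_blade(int node) { return node / 2; }

int main(void)
{
	struct hub_info *per_blade[NUM_BLADES];
	struct hub_info *per_node[NUM_NODES];
	int bid, node;

	for (bid = 0; bid < NUM_BLADES; bid++) {
		per_blade[bid] = calloc(1, sizeof(*per_blade[bid]));
		per_blade[bid]->pnode = bid;
	}

	/* every node on a blade shares the same hub_info instance */
	for (node = 0; node < NUM_NODES; node++)
		per_node[node] = per_blade[node_to_blade(node)];

	per_node[1]->nr_possible_cpus++;	/* update via node 1 ... */
	printf("node 0 sees nr_possible_cpus=%d\n",
	       per_node[0]->nr_possible_cpus);	/* ... visible via node 0 */

	for (bid = 0; bid < NUM_BLADES; bid++)
		free(per_blade[bid]);
	return 0;
}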
@@ -133,8 +133,8 @@ static bool skip_addr(void *dest) /* Accounts directly */ if (dest == ret_from_fork) return true; -#ifdef CONFIG_HOTPLUG_CPU - if (dest == start_cpu0) +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) + if (dest == soft_restart_cpu) return true; #endif #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7e3ceaf75c1..4350f6bfc064 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -27,7 +27,7 @@ obj-y += cpuid-deps.o obj-y += umwait.o obj-$(CONFIG_PROC_FS) += proc.o -obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o +obj-y += capflags.o powerflags.o obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL @@ -54,7 +54,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o obj-$(CONFIG_ACRN_GUEST) += acrn.o -ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ @@ -63,5 +62,4 @@ vmxfeature = $(src)/../../include/asm/vmxfeatures.h $(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) -endif targets += capflags.c diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 182af64387d0..9e2a91830f72 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -9,7 +9,6 @@ * - Andrew D. Balsa (code cleanup). */ #include <linux/init.h> -#include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/nospec.h> @@ -27,8 +26,6 @@ #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> -#include <asm/alternative.h> -#include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> @@ -125,21 +122,8 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); EXPORT_SYMBOL_GPL(mmio_stale_data_clear); -void __init check_bugs(void) +void __init cpu_select_mitigations(void) { - identify_boot_cpu(); - - /* - * identify_boot_cpu() initialized SMT support information, let the - * core code know. - */ - cpu_smt_check_topology(); - - if (!IS_ENABLED(CONFIG_SMP)) { - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); - } - /* * Read the SPEC_CTRL MSR to account for reserved bits which may * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD @@ -176,39 +160,6 @@ void __init check_bugs(void) md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); - - arch_smt_update(); - -#ifdef CONFIG_X86_32 - /* - * Check whether we are able to run this kernel safely on SMP. - * - * - i386 is no longer supported. - * - In order to run on anything without a TSC, we need to be - * compiled for a i486. - */ - if (boot_cpu_data.x86 < 4) - panic("Kernel requires i486+ for 'invlpg' and other features"); - - init_utsname()->machine[1] = - '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); - alternative_instructions(); - - fpu__init_check_bugs(); -#else /* CONFIG_X86_64 */ - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. 
- */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -#endif } /* diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 4063e8991211..8f86eacf69f7 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -39,6 +39,8 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Shared L2 cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); +static cpumask_var_t cpu_cacheinfo_mask; + /* Kernel controls MTRR and/or PAT MSRs. */ unsigned int memory_caching_control __ro_after_init; @@ -1172,8 +1174,10 @@ void cache_bp_restore(void) cache_cpu_init(); } -static int cache_ap_init(unsigned int cpu) +static int cache_ap_online(unsigned int cpu) { + cpumask_set_cpu(cpu, cpu_cacheinfo_mask); + if (!memory_caching_control || get_cache_aps_delayed_init()) return 0; @@ -1191,11 +1195,17 @@ static int cache_ap_init(unsigned int cpu) * lock to prevent MTRR entry changes */ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL, - cpu_callout_mask); + cpu_cacheinfo_mask); return 0; } +static int cache_ap_offline(unsigned int cpu) +{ + cpumask_clear_cpu(cpu, cpu_cacheinfo_mask); + return 0; +} + /* * Delayed cache initialization for all AP's */ @@ -1210,9 +1220,12 @@ void cache_aps_init(void) static int __init cache_ap_register(void) { + zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL); + cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask); + cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING, "x86/cachectrl:starting", - cache_ap_init, NULL); + cache_ap_online, cache_ap_offline); return 0; } -core_initcall(cache_ap_register); +early_initcall(cache_ap_register); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80710a68ef7d..52683fddafaf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,12 +18,16 @@ #include <linux/init.h> #include <linux/kprobes.h> #include <linux/kgdb.h> +#include <linux/mem_encrypt.h> #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/io.h> #include <linux/syscore_ops.h> #include <linux/pgtable.h> #include <linux/stackprotector.h> +#include <linux/utsname.h> +#include <asm/alternative.h> #include <asm/cmdline.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> @@ -59,7 +63,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> -#include <asm/sigframe.h> +#include <asm/set_memory.h> #include <asm/traps.h> #include <asm/sev.h> @@ -67,14 +71,6 @@ u32 elf_hwcap2 __read_mostly; -/* all of these masks are initialized in setup_cpu_local_masks() */ -cpumask_var_t cpu_initialized_mask; -cpumask_var_t cpu_callout_mask; -cpumask_var_t cpu_callin_mask; - -/* representing cpus for which sibling maps can be computed */ -cpumask_var_t cpu_sibling_setup_mask; - /* Number of siblings per CPU package */ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); @@ -169,15 +165,6 @@ clear_ppin: clear_cpu_cap(c, info->feature); } -/* correctly size the local cpu masks */ -void __init setup_cpu_local_masks(void) -{ - alloc_bootmem_cpumask_var(&cpu_initialized_mask); - alloc_bootmem_cpumask_var(&cpu_callin_mask); - alloc_bootmem_cpumask_var(&cpu_callout_mask); - alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); -} - static void default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 @@ -1502,12 +1489,10 @@ static void __init cpu_parse_early_param(void) if (!kstrtouint(opt, 10, &bit)) { if (bit < NCAPINTS * 32) { -#ifdef CONFIG_X86_FEATURE_NAMES /* empty-string, i.e., 
""-defined feature flags */ if (!x86_cap_flags[bit]) pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); else -#endif pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); setup_clear_cpu_cap(bit); @@ -1520,7 +1505,6 @@ static void __init cpu_parse_early_param(void) continue; } -#ifdef CONFIG_X86_FEATURE_NAMES for (bit = 0; bit < 32 * NCAPINTS; bit++) { if (!x86_cap_flag(bit)) continue; @@ -1537,7 +1521,6 @@ static void __init cpu_parse_early_param(void) if (!found) pr_cont(" (unknown: %s)", opt); -#endif } pr_cont("\n"); @@ -1600,10 +1583,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) sld_setup(c); - fpu__init_system(c); - - init_sigframe_size(); - #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -2123,19 +2102,6 @@ static void dbg_restore_debug_regs(void) #define dbg_restore_debug_regs() #endif /* ! CONFIG_KGDB */ -static void wait_for_master_cpu(int cpu) -{ -#ifdef CONFIG_SMP - /* - * wait for ACK from master CPU before continuing - * with AP initialization - */ - WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)); - while (!cpumask_test_cpu(cpu, cpu_callout_mask)) - cpu_relax(); -#endif -} - static inline void setup_getcpu(int cpu) { unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); @@ -2158,11 +2124,7 @@ static inline void setup_getcpu(int cpu) } #ifdef CONFIG_X86_64 -static inline void ucode_cpu_init(int cpu) -{ - if (cpu) - load_ucode_ap(); -} +static inline void ucode_cpu_init(int cpu) { } static inline void tss_setup_ist(struct tss_struct *tss) { @@ -2239,8 +2201,6 @@ void cpu_init(void) struct task_struct *cur = current; int cpu = raw_smp_processor_id(); - wait_for_master_cpu(cpu); - ucode_cpu_init(cpu); #ifdef CONFIG_NUMA @@ -2285,26 +2245,12 @@ void cpu_init(void) doublefault_init_cpu_tss(); - fpu__init_cpu(); - if (is_uv_system()) uv_cpu_init(); load_fixmap_gdt(cpu); } -#ifdef CONFIG_SMP -void cpu_init_secondary(void) -{ - /* - * Relies on the BP having set-up the IDT tables, which are loaded - * on this CPU in cpu_init_exception_handling(). - */ - cpu_init_exception_handling(); - cpu_init(); -} -#endif - #ifdef CONFIG_MICROCODE_LATE_LOADING /** * store_cpu_caps() - Store a snapshot of CPU capabilities @@ -2362,3 +2308,69 @@ void arch_smt_update(void) /* Check whether IPI broadcasting can be enabled */ apic_smt_update(); } + +void __init arch_cpu_finalize_init(void) +{ + identify_boot_cpu(); + + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology(); + + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + + cpu_select_mitigations(); + + arch_smt_update(); + + if (IS_ENABLED(CONFIG_X86_32)) { + /* + * Check whether this is a real i386 which is not longer + * supported and fixup the utsname. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + } + + /* + * Must be before alternatives because it might set or clear + * feature bits. + */ + fpu__init_system(); + fpu__init_cpu(); + + alternative_instructions(); + + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. 
+ */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); + } else { + fpu__init_check_bugs(); + } + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. It + * must be called after late_time_init() so that Hyper-V x86/x64 + * hypercalls work when the SWIOTLB bounce buffers are decrypted. + */ + mem_encrypt_init(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f97b0fe13da8..1c44630d4789 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -79,6 +79,7 @@ extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); +void cpu_select_mitigations(void); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 0b971f974096..5e74610b39e7 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -715,11 +715,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) bool amd_mce_is_memory_error(struct mce *m) { + enum smca_bank_types bank_type; /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; + bank_type = smca_get_bank_type(m->extcpu, m->bank); if (mce_flags.smca) - return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0; + return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -1050,7 +1052,7 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol if (bank_type >= N_SMCA_BANK_TYPES) return NULL; - if (b && bank_type == SMCA_UMC) { + if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) return smca_umc_block_names[b->block]; return NULL; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ab156e6e7120..89e2aab5d34d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1533,7 +1533,7 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. 
*/ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (kill_current_task) + if (!mce_usable_address(&m)) queue_task_work(&m, msg, kill_me_now); else queue_task_work(&m, msg, kill_me_maybe); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index f5fdeb1e3606..87208e46f7ed 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -78,8 +78,6 @@ static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) if (sig == e->installed_cpu) return e->equiv_cpu; - - e++; } return 0; } @@ -596,11 +594,6 @@ void reload_ucode_amd(unsigned int cpu) } } } -static u16 __find_equiv_id(unsigned int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - return find_equiv_id(&equiv_table, uci->cpu_sig.sig); -} /* * a small, trivial cache of per-family ucode patches @@ -651,9 +644,11 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u16 equiv_id; - equiv_id = __find_equiv_id(cpu); + + equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); if (!equiv_id) return NULL; @@ -705,7 +700,7 @@ static enum ucode_state apply_microcode_amd(int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ - if (rev >= mc_amd->hdr.patch_id) { + if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; goto out; } diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index cc4f9f1cb94c..aee4bc5ad496 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := mtrr.o if.o generic.o cleanup.o -obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o +obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o legacy.o diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index eff6ac62c0ff..ef3e8e42b782 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -110,7 +110,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) } const struct mtrr_ops amd_mtrr_ops = { - .vendor = X86_VENDOR_AMD, + .var_regs = 2, .set = amd_set_mtrr, .get = amd_get_mtrr, .get_free_region = generic_get_free_region, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index b8a74eddde83..6f6c3ae92943 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -45,15 +45,6 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) return -ENOSPC; } -/* - * Report boot time MCR setups - */ -void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) -{ - centaur_mcr[mcr].low = lo; - centaur_mcr[mcr].high = hi; -} - static void centaur_get_mcr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) @@ -112,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t } const struct mtrr_ops centaur_mtrr_ops = { - .vendor = X86_VENDOR_CENTAUR, + .var_regs = 8, .set = centaur_set_mcr, .get = centaur_get_mcr, .get_free_region = centaur_get_free_region, diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index b5f43049fa5f..18cf79d6e2c5 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -55,9 +55,6 @@ static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; -static int __initdata debug_print; -#define Dprintk(x...) 
do { if (debug_print) pr_debug(x); } while (0) - #define BIOS_BUG_MSG \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" @@ -79,12 +76,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, base, base + size); } - if (debug_print) { - pr_debug("After WB checking\n"); - for (i = 0; i < nr_range; i++) - pr_debug("MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end); - } + + Dprintk("After WB checking\n"); + for (i = 0; i < nr_range; i++) + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); /* Take out UC ranges: */ for (i = 0; i < num_var_ranges; i++) { @@ -112,24 +108,22 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, subtract_range(range, RANGE_NUM, extra_remove_base, extra_remove_base + extra_remove_size); - if (debug_print) { - pr_debug("After UC checking\n"); - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - pr_debug("MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end); - } + Dprintk("After UC checking\n"); + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } /* sort the ranges */ nr_range = clean_sort_range(range, RANGE_NUM); - if (debug_print) { - pr_debug("After sorting\n"); - for (i = 0; i < nr_range; i++) - pr_debug("MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end); - } + + Dprintk("After sorting\n"); + for (i = 0; i < nr_range; i++) + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); return nr_range; } @@ -164,16 +158,9 @@ static int __init enable_mtrr_cleanup_setup(char *str) } early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); -static int __init mtrr_cleanup_debug_setup(char *str) -{ - debug_print = 1; - return 0; -} -early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); - static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) + unsigned char type) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -183,7 +170,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, return; } - mask = (1ULL << address_bits) - 1; + mask = (1ULL << boot_cpu_data.x86_phys_bits) - 1; mask &= ~((((u64)sizek) << 10) - 1); base = ((u64)basek) << 10; @@ -209,7 +196,7 @@ save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, range_state[reg].type = type; } -static void __init set_var_mtrr_all(unsigned int address_bits) +static void __init set_var_mtrr_all(void) { unsigned long basek, sizek; unsigned char type; @@ -220,7 +207,7 @@ static void __init set_var_mtrr_all(unsigned int address_bits) sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); type = range_state[reg].type; - set_var_mtrr(reg, basek, sizek, type, address_bits); + set_var_mtrr(reg, basek, sizek, type); } } @@ -267,7 +254,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1UL << align; - if (debug_print) { + if (mtrr_debug) { char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; @@ -542,7 +529,7 @@ static void __init print_out_mtrr_range_state(void) start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; - pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + Dprintk("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", i, start_base, 
start_factor, size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE) ? "UC" : @@ -680,7 +667,7 @@ static int __init mtrr_search_optimal_index(void) return index_good; } -int __init mtrr_cleanup(unsigned address_bits) +int __init mtrr_cleanup(void) { unsigned long x_remove_base, x_remove_size; unsigned long base, size, def, dummy; @@ -689,7 +676,10 @@ int __init mtrr_cleanup(unsigned address_bits) int index_good; int i; - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + if (!mtrr_enabled()) + return 0; + + if (!cpu_feature_enabled(X86_FEATURE_MTRR) || enable_mtrr_cleanup < 1) return 0; rdmsr(MSR_MTRRdefType, def, dummy); @@ -711,7 +701,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; /* Print original var MTRRs at first, for debugging: */ - pr_debug("original variable MTRRs\n"); + Dprintk("original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); @@ -742,8 +732,8 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_print_out_one_result(i); if (!result[i].bad) { - set_var_mtrr_all(address_bits); - pr_debug("New variable MTRRs\n"); + set_var_mtrr_all(); + Dprintk("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } @@ -763,7 +753,7 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_calc_range_state(chunk_size, gran_size, x_remove_base, x_remove_size, i); - if (debug_print) { + if (mtrr_debug) { mtrr_print_out_one_result(i); pr_info("\n"); } @@ -786,8 +776,8 @@ int __init mtrr_cleanup(unsigned address_bits) gran_size = result[i].gran_sizek; gran_size <<= 10; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - set_var_mtrr_all(address_bits); - pr_debug("New variable MTRRs\n"); + set_var_mtrr_all(); + Dprintk("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } else { @@ -802,7 +792,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; } #else -int __init mtrr_cleanup(unsigned address_bits) +int __init mtrr_cleanup(void) { return 0; } @@ -882,15 +872,18 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; + if (!mtrr_enabled()) + return 0; + /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: */ - if (!is_cpu(INTEL) || disable_mtrr_trim) + if (!cpu_feature_enabled(X86_FEATURE_MTRR) || disable_mtrr_trim) return 0; rdmsr(MSR_MTRRdefType, def, dummy); - def &= 0xff; + def &= MTRR_DEF_TYPE_TYPE; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 173b9e01e623..238dad57d4d6 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -235,7 +235,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, } const struct mtrr_ops cyrix_mtrr_ops = { - .vendor = X86_VENDOR_CYRIX, + .var_regs = 8, .set = cyrix_set_arr, .get = cyrix_get_arr, .get_free_region = cyrix_get_free_region, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ee09d359e08f..2d6aa5d2e3d7 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -8,10 +8,12 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> - +#include <linux/cc_platform.h> #include <asm/processor-flags.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/mshyperv.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -31,6 +33,55 @@ static struct fixed_range_block fixed_range_blocks[] 
= { {} }; +struct cache_map { + u64 start; + u64 end; + u64 flags; + u64 type:8; + u64 fixed:1; +}; + +bool mtrr_debug; + +static int __init mtrr_param_setup(char *str) +{ + int rc = 0; + + if (!str) + return -EINVAL; + if (!strcmp(str, "debug")) + mtrr_debug = true; + else + rc = -EINVAL; + + return rc; +} +early_param("mtrr", mtrr_param_setup); + +/* + * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where + * no 2 adjacent ranges have the same cache mode (those would be merged). + * The number is based on the worst case: + * - no two adjacent fixed MTRRs share the same cache mode + * - one variable MTRR is spanning a huge area with mode WB + * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2 + * additional ranges each (result like "ababababa...aba" with a = WB, b = UC), + * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries + * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries + * to the possible maximum, as it always starts at 4GB, thus it can't be in + * the middle of that MTRR, unless that MTRR starts at 0, which would remove + * the initial "a" from the "abababa" pattern above) + * The map won't contain ranges with no matching MTRR (those fall back to the + * default cache mode). + */ +#define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2) + +static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata; +static struct cache_map *cache_map __refdata = init_cache_map; +static unsigned int cache_map_size = CACHE_MAP_MAX; +static unsigned int cache_map_n; +static unsigned int cache_map_fixed; + static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; @@ -38,6 +89,9 @@ u64 mtrr_tom2; struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); +/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */ +u32 phys_hi_rsvd; + /* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD @@ -69,175 +123,370 @@ static u64 get_mtrr_size(u64 mask) { u64 size; - mask >>= PAGE_SHIFT; - mask |= size_or_mask; + mask |= (u64)phys_hi_rsvd << 32; size = -mask; - size <<= PAGE_SHIFT; + return size; } +static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size) +{ + struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg; + + if (!(mtrr->mask_lo & MTRR_PHYSMASK_V)) + return MTRR_TYPE_INVALID; + + *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK); + *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) + + (mtrr->mask_lo & PAGE_MASK)); + + return mtrr->base_lo & MTRR_PHYSBASE_TYPE; +} + +static u8 get_effective_type(u8 type1, u8 type2) +{ + if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK)) + return MTRR_TYPE_WRTHROUGH; + + if (type1 != type2) + return MTRR_TYPE_UNCACHABLE; + + return type1; +} + +static void rm_map_entry_at(int idx) +{ + cache_map_n--; + if (cache_map_n > idx) { + memmove(cache_map + idx, cache_map + idx + 1, + sizeof(*cache_map) * (cache_map_n - idx)); + } +} + /* - * Check and return the effective type for MTRR-MTRR type overlap. - * Returns 1 if the effective type is UNCACHEABLE, else returns 0 + * Add an entry into cache_map at a specific index. Merges adjacent entries if + * appropriate. 
Return the number of merges for correcting the scan index + * (this is needed as merging will reduce the number of entries, which will + * result in skipping entries in future iterations if the scan index isn't + * corrected). + * Note that the corrected index can never go below -1 (resulting in being 0 in + * the next scan iteration), as "2" is returned only if the current index is + * larger than zero. */ -static int check_type_overlap(u8 *prev, u8 *curr) +static int add_map_entry_at(u64 start, u64 end, u8 type, int idx) { - if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { - *prev = MTRR_TYPE_UNCACHABLE; - *curr = MTRR_TYPE_UNCACHABLE; - return 1; + bool merge_prev = false, merge_next = false; + + if (start >= end) + return 0; + + if (idx > 0) { + struct cache_map *prev = cache_map + idx - 1; + + if (!prev->fixed && start == prev->end && type == prev->type) + merge_prev = true; } - if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || - (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { - *prev = MTRR_TYPE_WRTHROUGH; - *curr = MTRR_TYPE_WRTHROUGH; + if (idx < cache_map_n) { + struct cache_map *next = cache_map + idx; + + if (!next->fixed && end == next->start && type == next->type) + merge_next = true; } - if (*prev != *curr) { - *prev = MTRR_TYPE_UNCACHABLE; - *curr = MTRR_TYPE_UNCACHABLE; + if (merge_prev && merge_next) { + cache_map[idx - 1].end = cache_map[idx].end; + rm_map_entry_at(idx); + return 2; + } + if (merge_prev) { + cache_map[idx - 1].end = end; return 1; } + if (merge_next) { + cache_map[idx].start = start; + return 1; + } + + /* Sanity check: the array should NEVER be too small! */ + if (cache_map_n == cache_map_size) { + WARN(1, "MTRR cache mode memory map exhausted!\n"); + cache_map_n = cache_map_fixed; + return 0; + } + + if (cache_map_n > idx) { + memmove(cache_map + idx + 1, cache_map + idx, + sizeof(*cache_map) * (cache_map_n - idx)); + } + + cache_map[idx].start = start; + cache_map[idx].end = end; + cache_map[idx].type = type; + cache_map[idx].fixed = 0; + cache_map_n++; return 0; } -/** - * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries - * - * Return the MTRR fixed memory type of 'start'. - * - * MTRR fixed entries are divided into the following ways: - * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges - * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges - * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges - * - * Return Values: - * MTRR_TYPE_(type) - Matched memory type - * MTRR_TYPE_INVALID - Unmatched +/* Clear a part of an entry. Return 1 if start of entry is still valid. */ +static int clr_map_range_at(u64 start, u64 end, int idx) +{ + int ret = start != cache_map[idx].start; + u64 tmp; + + if (start == cache_map[idx].start && end == cache_map[idx].end) { + rm_map_entry_at(idx); + } else if (start == cache_map[idx].start) { + cache_map[idx].start = end; + } else if (end == cache_map[idx].end) { + cache_map[idx].end = start; + } else { + tmp = cache_map[idx].end; + cache_map[idx].end = start; + add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1); + } + + return ret; +} + +/* + * Add MTRR to the map. The current map is scanned and each part of the MTRR + * either overlapping with an existing entry or with a hole in the map is + * handled separately. 
*/ -static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +static void add_map_entry(u64 start, u64 end, u8 type) { - int idx; + u8 new_type, old_type; + u64 tmp; + int i; - if (start >= 0x100000) - return MTRR_TYPE_INVALID; + for (i = 0; i < cache_map_n && start < end; i++) { + if (start >= cache_map[i].end) + continue; + + if (start < cache_map[i].start) { + /* Region start has no overlap. */ + tmp = min(end, cache_map[i].start); + i -= add_map_entry_at(start, tmp, type, i); + start = tmp; + continue; + } - /* 0x0 - 0x7FFFF */ - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - /* 0x80000 - 0xBFFFF */ - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; + new_type = get_effective_type(type, cache_map[i].type); + old_type = cache_map[i].type; + + if (cache_map[i].fixed || new_type == old_type) { + /* Cut off start of new entry. */ + start = cache_map[i].end; + continue; + } + + /* Handle only overlapping part of region. */ + tmp = min(end, cache_map[i].end); + i += clr_map_range_at(start, tmp, i); + i -= add_map_entry_at(start, tmp, new_type, i); + start = tmp; } - /* 0xC0000 - 0xFFFFF */ - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; + /* Add rest of region after last map entry (rest might be empty). */ + add_map_entry_at(start, end, type, i); } -/** - * mtrr_type_lookup_variable - look up memory type in MTRR variable entries - * - * Return Value: - * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) - * - * Output Arguments: - * repeat - Set to 1 when [start:end] spanned across MTRR range and type - * returned corresponds only to [start:*partial_end]. Caller has - * to lookup again for [*partial_end:end]. - * - * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the - * region is fully covered by a single MTRR entry or the default - * type. +/* Add variable MTRRs to cache map. */ +static void map_add_var(void) +{ + u64 start, size; + unsigned int i; + u8 type; + + /* + * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it + * needs to be added again when rebuilding the map due to potentially + * having moved as a result of variable MTRRs for memory below 4GB. + */ + if (mtrr_tom2) { + add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK); + cache_map[cache_map_n - 1].fixed = 1; + } + + for (i = 0; i < num_var_ranges; i++) { + type = get_var_mtrr_state(i, &start, &size); + if (type != MTRR_TYPE_INVALID) + add_map_entry(start, start + size, type); + } +} + +/* + * Rebuild map by replacing variable entries. Needs to be called when MTRR + * registers are being changed after boot, as such changes could include + * removals of registers, which are complicated to handle without rebuild of + * the map. 
*/ -static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, - int *repeat, u8 *uniform) +void generic_rebuild_map(void) { - int i; - u64 base, mask; - u8 prev_match, curr_match; + if (mtrr_if != &generic_mtrr_ops) + return; - *repeat = 0; - *uniform = 1; + cache_map_n = cache_map_fixed; - prev_match = MTRR_TYPE_INVALID; - for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state, inclusive; + map_add_var(); +} - if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) - continue; +static unsigned int __init get_cache_map_size(void) +{ + return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0); +} - base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) + - (mtrr_state.var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) + - (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK); - - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - inclusive = ((start < base) && (end > base)); - - if ((start_state != end_state) || inclusive) { - /* - * We have start:end spanning across an MTRR. - * We split the region into either - * - * - start_state:1 - * (start:mtrr_end)(mtrr_end:end) - * - end_state:1 - * (start:mtrr_start)(mtrr_start:end) - * - inclusive:1 - * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) - * - * depending on kind of overlap. - * - * Return the type of the first region and a pointer - * to the start of next region so that caller will be - * advised to lookup again after having adjusted start - * and end. - * - * Note: This way we handle overlaps with multiple - * entries and the default type properly. - */ - if (start_state) - *partial_end = base + get_mtrr_size(mask); - else - *partial_end = base; - - if (unlikely(*partial_end <= start)) { - WARN_ON(1); - *partial_end = start + PAGE_SIZE; - } +/* Build the cache_map containing the cache modes per memory range. */ +void __init mtrr_build_map(void) +{ + u64 start, end, size; + unsigned int i; + u8 type; - end = *partial_end - 1; /* end is inclusive */ - *repeat = 1; - *uniform = 0; + /* Add fixed MTRRs, optimize for adjacent entries with same type. */ + if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) { + /* + * Start with 64k size fixed entries, preset 1st one (hence the + * loop below is starting with index 1). + */ + start = 0; + end = size = 0x10000; + type = mtrr_state.fixed_ranges[0]; + + for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) { + /* 8 64k entries, then 16 16k ones, rest 4k. */ + if (i == 8 || i == 24) + size >>= 2; + + if (mtrr_state.fixed_ranges[i] != type) { + add_map_entry(start, end, type); + start = end; + type = mtrr_state.fixed_ranges[i]; + } + end += size; } + add_map_entry(start, end, type); + } - if ((start & mask) != (base & mask)) - continue; + /* Mark fixed, they take precedence. 
*/ + for (i = 0; i < cache_map_n; i++) + cache_map[i].fixed = 1; + cache_map_fixed = cache_map_n; - curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == MTRR_TYPE_INVALID) { - prev_match = curr_match; - continue; + map_add_var(); + + pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n", + cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed, + get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0)); + + if (mtrr_debug) { + for (i = 0; i < cache_map_n; i++) { + pr_info("%3u: %016llx-%016llx %s\n", i, + cache_map[i].start, cache_map[i].end - 1, + mtrr_attrib_to_str(cache_map[i].type)); } + } +} - *uniform = 0; - if (check_type_overlap(&prev_match, &curr_match)) - return curr_match; +/* Copy the cache_map from __initdata memory to dynamically allocated one. */ +void __init mtrr_copy_map(void) +{ + unsigned int new_size = get_cache_map_size(); + + if (!mtrr_state.enabled || !new_size) { + cache_map = NULL; + return; + } + + mutex_lock(&mtrr_mutex); + + cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL); + if (cache_map) { + memmove(cache_map, init_cache_map, + cache_map_n * sizeof(*cache_map)); + cache_map_size = new_size; + } else { + mtrr_state.enabled = 0; + pr_err("MTRRs disabled due to allocation failure for lookup map.\n"); + } + + mutex_unlock(&mtrr_mutex); +} + +/** + * mtrr_overwrite_state - set static MTRR state + * + * Used to set MTRR state via different means (e.g. with data obtained from + * a hypervisor). + * Is allowed only for special cases when running virtualized. Must be called + * from the x86_init.hyper.init_platform() hook. It can be called only once. + * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR + * is cleared. + */ +void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) +{ + unsigned int i; + + /* Only allowed to be called once before mtrr_bp_init(). */ + if (WARN_ON_ONCE(mtrr_state_set)) + return; + + /* Only allowed when running virtualized. */ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return; + + /* + * Only allowed for special virtualization cases: + * - when running as Hyper-V, SEV-SNP guest using vTOM + * - when running as Xen PV guest + * - when running as SEV-SNP or TDX guest to avoid unnecessary + * VMM communication/Virtualization exceptions (#VC, #VE) + */ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && + !hv_is_isolation_supported() && + !cpu_feature_enabled(X86_FEATURE_XENPV) && + !cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return; + + /* Disable MTRR in order to disable MTRR modifications. 
*/ + setup_clear_cpu_cap(X86_FEATURE_MTRR); + + if (var) { + if (num_var > MTRR_MAX_VAR_RANGES) { + pr_warn("Trying to overwrite MTRR state with %u variable entries\n", + num_var); + num_var = MTRR_MAX_VAR_RANGES; + } + for (i = 0; i < num_var; i++) + mtrr_state.var_ranges[i] = var[i]; + num_var_ranges = num_var; } - if (prev_match != MTRR_TYPE_INVALID) - return prev_match; + mtrr_state.def_type = def_type; + mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED; - return mtrr_state.def_type; + mtrr_state_set = 1; +} + +static u8 type_merge(u8 type, u8 new_type, u8 *uniform) +{ + u8 effective_type; + + if (type == MTRR_TYPE_INVALID) + return new_type; + + effective_type = get_effective_type(type, new_type); + if (type != effective_type) + *uniform = 0; + + return effective_type; } /** @@ -248,66 +497,49 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, * MTRR_TYPE_INVALID - MTRR is disabled * * Output Argument: - * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the - * region is fully covered by a single MTRR entry or the default - * type. + * uniform - Set to 1 when the returned MTRR type is valid for the whole + * region, set to 0 else. */ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type, is_uniform = 1, dummy; - int repeat; - u64 partial_end; + u8 type = MTRR_TYPE_INVALID; + unsigned int i; - /* Make end inclusive instead of exclusive */ - end--; + if (!mtrr_state_set) { + /* Uniformity is unknown. */ + *uniform = 0; + return MTRR_TYPE_UNCACHABLE; + } - if (!mtrr_state_set) - return MTRR_TYPE_INVALID; + *uniform = 1; if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return MTRR_TYPE_INVALID; + return MTRR_TYPE_UNCACHABLE; - /* - * Look up the fixed ranges first, which take priority over - * the variable ranges. - */ - if ((start < 0x100000) && - (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { - is_uniform = 0; - type = mtrr_type_lookup_fixed(start, end); - goto out; - } + for (i = 0; i < cache_map_n && start < end; i++) { + /* Region after current map entry? -> continue with next one. */ + if (start >= cache_map[i].end) + continue; - /* - * Look up the variable ranges. Look of multiple ranges matching - * this address and pick type as per MTRR precedence. - */ - type = mtrr_type_lookup_variable(start, end, &partial_end, - &repeat, &is_uniform); + /* Start of region not covered by current map entry? */ + if (start < cache_map[i].start) { + /* At least some part of region has default type. */ + type = type_merge(type, mtrr_state.def_type, uniform); + /* End of region not covered, too? -> lookup done. */ + if (end <= cache_map[i].start) + return type; + } - /* - * Common path is with repeat = 0. - * However, we can have cases where [start:end] spans across some - * MTRR ranges and/or the default type. Do repeated lookups for - * that case here. - */ - while (repeat) { - prev_type = type; - start = partial_end; - is_uniform = 0; - type = mtrr_type_lookup_variable(start, end, &partial_end, - &repeat, &dummy); + /* At least part of region covered by map entry. */ + type = type_merge(type, cache_map[i].type, uniform); - if (check_type_overlap(&prev_type, &type)) - goto out; + start = cache_map[i].end; } - if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) - type = MTRR_TYPE_WRBACK; + /* End of region past last entry in map? -> use default type. 
*/ + if (start < end) + type = type_merge(type, mtrr_state.def_type, uniform); -out: - *uniform = is_uniform; return type; } @@ -363,8 +595,8 @@ static void __init print_fixed_last(void) if (!last_fixed_end) return; - pr_debug(" %05X-%05X %s\n", last_fixed_start, - last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + pr_info(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } @@ -402,10 +634,10 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - pr_debug("MTRR default type: %s\n", - mtrr_attrib_to_str(mtrr_state.def_type)); + pr_info("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - pr_debug("MTRR fixed ranges %sabled:\n", + pr_info("MTRR fixed ranges %sabled:\n", ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? "en" : "dis"); @@ -420,26 +652,27 @@ static void __init print_mtrr_state(void) /* tail */ print_fixed_last(); } - pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); - high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; + pr_info("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); + high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { - if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V) + pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & + MTRR_PHYSBASE_TYPE)); else - pr_debug(" %u disabled\n", i); + pr_info(" %u disabled\n", i); } if (mtrr_tom2) - pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); + pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } /* Grab all of the MTRR state for this CPU into *state */ @@ -452,7 +685,7 @@ bool __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; rdmsr(MSR_MTRRcap, lo, dummy); - mtrr_state.have_fixed = (lo >> 8) & 1; + mtrr_state.have_fixed = lo & MTRR_CAP_FIX; for (i = 0; i < num_var_ranges; i++) get_mtrr_var_range(i, &vrs[i]); @@ -460,8 +693,8 @@ bool __init get_mtrr_state(void) get_fixed_ranges(mtrr_state.fixed_ranges); rdmsr(MSR_MTRRdefType, lo, dummy); - mtrr_state.def_type = (lo & 0xff); - mtrr_state.enabled = (lo & 0xc00) >> 10; + mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE; + mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT; if (amd_special_default_mtrr()) { unsigned low, high; @@ -474,7 +707,8 @@ bool __init get_mtrr_state(void) mtrr_tom2 &= 0xffffff800000ULL; } - print_mtrr_state(); + if (mtrr_debug) + print_mtrr_state(); mtrr_state_set = 1; @@ -574,7 +808,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); - if ((mask_lo & 0x800) == 0) { + if (!(mask_lo & MTRR_PHYSMASK_V)) { /* Invalid (i.e. 
free) range */ *base = 0; *size = 0; @@ -585,8 +819,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask: */ - tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; - mask = size_or_mask | tmp; + tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK); + mask = (u64)phys_hi_rsvd << 32 | tmp; /* Expand tmp with high bits to all 1s: */ hi = fls64(tmp); @@ -604,9 +838,9 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, * This works correctly if size is a power of two, i.e. a * contiguous range: */ - *size = -mask; + *size = -mask >> PAGE_SHIFT; *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; - *type = base_lo & 0xff; + *type = base_lo & MTRR_PHYSBASE_TYPE; out_put_cpu: put_cpu(); @@ -644,9 +878,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) bool changed = false; rdmsr(MTRRphysBase_MSR(index), lo, hi); - if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) - || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != - (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD) + || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; @@ -654,9 +887,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), lo, hi); - if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) - || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != - (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD) + || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); changed = true; } @@ -691,11 +923,12 @@ static unsigned long set_mtrr_state(void) * Set_mtrr_restore restores the old value of MTRRdefType, * so to set it we fiddle with the saved value: */ - if ((deftype_lo & 0xff) != mtrr_state.def_type - || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { + if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type || + ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) { - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | - (mtrr_state.enabled << 10); + deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) | + mtrr_state.def_type | + (mtrr_state.enabled << MTRR_STATE_SHIFT); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -708,7 +941,7 @@ void mtrr_disable(void) rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi); } void mtrr_enable(void) @@ -762,9 +995,9 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, memset(vr, 0, sizeof(struct mtrr_var_range)); } else { vr->base_lo = base << PAGE_SHIFT | type; - vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT); - vr->mask_lo = -size << PAGE_SHIFT | 0x800; - vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT); + vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; + vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V; + vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); 
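
[Editor's note — illustration only, not part of the patch above. The generic.c hunks ending here replace the old size_or_mask/size_and_mask arithmetic with the new phys_hi_rsvd mask when generic_set_mtrr() assembles the MTRRphysBase/MTRRphysMask MSR halves. The following is a minimal user-space sketch of that construction, assuming a hypothetical CPU with 36 physical address bits and a made-up 256MB write-back range at 4GB; the harness, the example values, and the stand-in constants are assumptions for illustration, not kernel code.]

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define MTRR_PHYSMASK_V	(1u << 11)	/* valid bit in MTRRphysMaskN */

int main(void)
{
	unsigned int phys_bits = 36;				/* example address width */
	uint32_t phys_hi_rsvd = ~0u << (phys_bits - 32);	/* GENMASK(31, phys_bits - 32) */
	uint64_t base = 0x100000000ULL >> PAGE_SHIFT;		/* 4GB, in pages */
	uint64_t size = 0x10000000ULL >> PAGE_SHIFT;		/* 256MB, in pages */
	uint8_t type = 6;					/* MTRR_TYPE_WRBACK */

	/* Same construction as the patched generic_set_mtrr(): */
	uint32_t base_lo = (uint32_t)(base << PAGE_SHIFT) | type;
	uint32_t base_hi = (uint32_t)(base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
	uint32_t mask_lo = (uint32_t)(-size << PAGE_SHIFT) | MTRR_PHYSMASK_V;
	uint32_t mask_hi = (uint32_t)(-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;

	printf("PhysBase = %08x:%08x\n", base_hi, base_lo);	/* 00000001:00000006 */
	printf("PhysMask = %08x:%08x\n", mask_hi, mask_lo);	/* 0000000f:f0000800 */
	return 0;
}

[For this example the printed pair is identical to what the old size_and_mask path computed; the new form simply derives the reserved high bits directly from boot_cpu_data.x86_phys_bits instead of carrying two precomputed masks.]
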
@@ -783,7 +1016,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, * For Intel PPro stepping <= 7 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF */ - if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && + if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { @@ -817,7 +1050,7 @@ static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); - return config & (1 << 10); + return config & MTRR_CAP_WC; } int positive_have_wrcomb(void) diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c new file mode 100644 index 000000000000..d25882fcf181 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/legacy.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/syscore_ops.h> +#include <asm/cpufeature.h> +#include <asm/mtrr.h> +#include <asm/processor.h> +#include "mtrr.h" + +void mtrr_set_if(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* Pre-Athlon (K6) AMD CPU MTRRs */ + if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) + mtrr_if = &amd_mtrr_ops; + break; + case X86_VENDOR_CENTAUR: + if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) + mtrr_if = ¢aur_mtrr_ops; + break; + case X86_VENDOR_CYRIX: + if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) + mtrr_if = &cyrix_mtrr_ops; + break; + default: + break; + } +} + +/* + * The suspend/resume methods are only for CPUs without MTRR. CPUs using generic + * MTRR driver don't require this. + */ +struct mtrr_value { + mtrr_type ltype; + unsigned long lbase; + unsigned long lsize; +}; + +static struct mtrr_value *mtrr_value; + +static int mtrr_save(void) +{ + int i; + + if (!mtrr_value) + return -ENOMEM; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); + } + return 0; +} + +static void mtrr_restore(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + if (mtrr_value[i].lsize) { + mtrr_if->set(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } + } +} + +static struct syscore_ops mtrr_syscore_ops = { + .suspend = mtrr_save, + .resume = mtrr_restore, +}; + +void mtrr_register_syscore(void) +{ + mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL); + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + register_syscore_ops(&mtrr_syscore_ops); +} diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 783f3210d582..767bf1c71aad 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -59,15 +59,9 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; -static bool mtrr_enabled(void) -{ - return !!mtrr_if; -} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; -static DEFINE_MUTEX(mtrr_mutex); - -u64 size_or_mask, size_and_mask; +DEFINE_MUTEX(mtrr_mutex); const struct mtrr_ops *mtrr_if; @@ -105,21 +99,6 @@ static int have_wrcomb(void) return mtrr_if->have_wrcomb ? 
mtrr_if->have_wrcomb() : 0; } -/* This function returns the number of variable MTRRs */ -static void __init set_num_var_ranges(bool use_generic) -{ - unsigned long config = 0, dummy; - - if (use_generic) - rdmsr(MSR_MTRRcap, config, dummy); - else if (is_cpu(AMD) || is_cpu(HYGON)) - config = 2; - else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) - config = 8; - - num_var_ranges = config & 0xff; -} - static void __init init_table(void) { int i, max; @@ -194,20 +173,8 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ -static void -set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data = { .smp_reg = reg, - .smp_base = base, - .smp_size = size, - .smp_type = type - }; - - stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); -} - -static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, + mtrr_type type) { struct set_mtrr_data data = { .smp_reg = reg, .smp_base = base, @@ -216,6 +183,8 @@ static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, }; stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); + + generic_rebuild_map(); } /** @@ -337,7 +306,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { - set_mtrr_cpuslocked(i, base, size, type); + set_mtrr(i, base, size, type); if (likely(replace < 0)) { mtrr_usage_table[i] = 1; } else { @@ -345,7 +314,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (increment) mtrr_usage_table[i]++; if (unlikely(replace != i)) { - set_mtrr_cpuslocked(replace, 0, 0, 0); + set_mtrr(replace, 0, 0, 0); mtrr_usage_table[replace] = 0; } } @@ -363,7 +332,7 @@ static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { pr_warn("size and base must be multiples of 4 kiB\n"); - pr_debug("size: 0x%lx base: 0x%lx\n", size, base); + Dprintk("size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -454,8 +423,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - pr_debug("no MTRR for %lx000,%lx000 found\n", - base, size); + Dprintk("no MTRR for %lx000,%lx000 found\n", base, size); goto out; } } @@ -473,7 +441,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) goto out; } if (--mtrr_usage_table[reg] < 1) - set_mtrr_cpuslocked(reg, 0, 0, 0); + set_mtrr(reg, 0, 0, 0); error = reg; out: mutex_unlock(&mtrr_mutex); @@ -574,136 +542,54 @@ int arch_phys_wc_index(int handle) } EXPORT_SYMBOL_GPL(arch_phys_wc_index); -/* The suspend/resume methods are only for CPU without MTRR. 
CPU using generic - * MTRR driver doesn't require this - */ -struct mtrr_value { - mtrr_type ltype; - unsigned long lbase; - unsigned long lsize; -}; - -static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; - -static int mtrr_save(void) -{ - int i; - - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); - } - return 0; -} - -static void mtrr_restore(void) -{ - int i; - - for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) { - set_mtrr(i, mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); - } - } -} - - - -static struct syscore_ops mtrr_syscore_ops = { - .suspend = mtrr_save, - .resume = mtrr_restore, -}; - int __initdata changed_by_mtrr_cleanup; -#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) /** - * mtrr_bp_init - initialize mtrrs on the boot CPU + * mtrr_bp_init - initialize MTRRs on the boot CPU * * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). - * */ void __init mtrr_bp_init(void) { + bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR); const char *why = "(not available)"; - u32 phys_addr; - - phys_addr = 32; + unsigned long config, dummy; - if (boot_cpu_has(X86_FEATURE_MTRR)) { - mtrr_if = &generic_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(36); - size_and_mask = 0x00f00000; - phys_addr = 36; + phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32); + if (!generic_mtrrs && mtrr_state.enabled) { /* - * This is an AMD specific MSR, but we assume(hope?) that - * Intel will implement it too when they extend the address - * bus of the Xeon. + * Software overwrite of MTRR state, only for generic case. + * Note that X86_FEATURE_MTRR has been reset in this case. */ - if (cpuid_eax(0x80000000) >= 0x80000008) { - phys_addr = cpuid_eax(0x80000008) & 0xff; - /* CPUID workaround for Intel 0F33/0F34 CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 0xF && - boot_cpu_data.x86_model == 0x3 && - (boot_cpu_data.x86_stepping == 0x3 || - boot_cpu_data.x86_stepping == 0x4)) - phys_addr = 36; - - size_or_mask = SIZE_OR_MASK_BITS(phys_addr); - size_and_mask = ~size_or_mask & 0xfffff00000ULL; - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && - boot_cpu_data.x86 == 6) { - /* - * VIA C* family have Intel style MTRRs, - * but don't support PAE - */ - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - phys_addr = 32; - } - } else { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { - /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = &amd_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - case X86_VENDOR_CENTAUR: - if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { - mtrr_if = ¢aur_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - case X86_VENDOR_CYRIX: - if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { - mtrr_if = &cyrix_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - default: - break; - } + init_table(); + mtrr_build_map(); + pr_info("MTRRs set to read-only\n"); + + return; } + if (generic_mtrrs) + mtrr_if = &generic_mtrr_ops; + else + mtrr_set_if(); + if (mtrr_enabled()) { - set_num_var_ranges(mtrr_if == &generic_mtrr_ops); + /* Get the number of variable MTRR ranges. 
*/ + if (mtrr_if == &generic_mtrr_ops) + rdmsr(MSR_MTRRcap, config, dummy); + else + config = mtrr_if->var_regs; + num_var_ranges = config & MTRR_CAP_VCNT; + init_table(); if (mtrr_if == &generic_mtrr_ops) { /* BIOS may override */ if (get_mtrr_state()) { memory_caching_control |= CACHE_MTRR; - changed_by_mtrr_cleanup = mtrr_cleanup(phys_addr); + changed_by_mtrr_cleanup = mtrr_cleanup(); + mtrr_build_map(); } else { mtrr_if = NULL; why = "by BIOS"; @@ -730,8 +616,14 @@ void mtrr_save_state(void) smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); } -static int __init mtrr_init_finialize(void) +static int __init mtrr_init_finalize(void) { + /* + * Map might exist if mtrr_overwrite_state() has been called or if + * mtrr_enabled() returns true. + */ + mtrr_copy_map(); + if (!mtrr_enabled()) return 0; @@ -741,16 +633,8 @@ static int __init mtrr_init_finialize(void) return 0; } - /* - * The CPU has no MTRR and seems to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * - * TBD: is there any system with such CPU which supports - * suspend/resume? If no, we should remove the code. - */ - register_syscore_ops(&mtrr_syscore_ops); + mtrr_register_syscore(); return 0; } -subsys_initcall(mtrr_init_finialize); +subsys_initcall(mtrr_init_finalize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 02eb5871492d..5655f253d929 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -10,10 +10,13 @@ #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 +extern bool mtrr_debug; +#define Dprintk(x...) do { if (mtrr_debug) pr_info(x); } while (0) + extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { - u32 vendor; + u32 var_regs; void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*get)(unsigned int reg, unsigned long *base, @@ -51,18 +54,26 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); -extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; - -#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) +extern struct mutex mtrr_mutex; extern unsigned int num_var_ranges; extern u64 mtrr_tom2; extern struct mtrr_state_type mtrr_state; +extern u32 phys_hi_rsvd; void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); +#ifdef CONFIG_X86_32 +void mtrr_set_if(void); +void mtrr_register_syscore(void); +#else +static inline void mtrr_set_if(void) { } +static inline void mtrr_register_syscore(void) { } +#endif +void mtrr_build_map(void); +void mtrr_copy_map(void); /* CPU specific mtrr_ops vectors. */ extern const struct mtrr_ops amd_mtrr_ops; @@ -70,4 +81,14 @@ extern const struct mtrr_ops cyrix_mtrr_ops; extern const struct mtrr_ops centaur_mtrr_ops; extern int changed_by_mtrr_cleanup; -extern int mtrr_cleanup(unsigned address_bits); +extern int mtrr_cleanup(void); + +/* + * Must be used by code which uses mtrr_if to call platform-specific + * MTRR manipulation functions. 
+ */ +static inline bool mtrr_enabled(void) +{ + return !!mtrr_if; +} +void generic_rebuild_map(void); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 6ad33f355861..725344048f85 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -726,11 +726,15 @@ unlock: static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) { struct task_struct *p, *t; + pid_t pid; rcu_read_lock(); for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) - seq_printf(s, "%d\n", t->pid); + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } } rcu_read_unlock(); } @@ -2301,6 +2305,26 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) } } +static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); +} + +static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + kernfs_unbreak_active_protection(kn); + rdtgroup_remove(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) { struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); @@ -2308,8 +2332,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) if (!rdtgrp) return NULL; - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); + rdtgroup_kn_get(rdtgrp, kn); mutex_lock(&rdtgroup_mutex); @@ -2328,17 +2351,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); - - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } + rdtgroup_kn_put(rdtgrp, kn); } static int mkdir_mondata_all(struct kernfs_node *parent_kn, @@ -3505,6 +3518,133 @@ out: return ret; } +/** + * mongrp_reparent() - replace parent CTRL_MON group of a MON group + * @rdtgrp: the MON group whose parent should be replaced + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp + * @cpus: cpumask provided by the caller for use during this call + * + * Replaces the parent CTRL_MON group for a MON group, resulting in all member + * tasks' CLOSID immediately changing to that of the new parent group. + * Monitoring data for the group is unaffected by this operation. + */ +static void mongrp_reparent(struct rdtgroup *rdtgrp, + struct rdtgroup *new_prdtgrp, + cpumask_var_t cpus) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + + WARN_ON(rdtgrp->type != RDTMON_GROUP); + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); + + /* Nothing to do when simply renaming a MON group. */ + if (prdtgrp == new_prdtgrp) + return; + + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_move_tail(&rdtgrp->mon.crdtgrp_list, + &new_prdtgrp->mon.crdtgrp_list); + + rdtgrp->mon.parent = new_prdtgrp; + rdtgrp->closid = new_prdtgrp->closid; + + /* Propagate updated closid to all tasks in this group. 
*/ + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); + + update_closid_rmid(cpus, NULL); +} + +static int rdtgroup_rename(struct kernfs_node *kn, + struct kernfs_node *new_parent, const char *new_name) +{ + struct rdtgroup *new_prdtgrp; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret; + + rdtgrp = kernfs_to_rdtgroup(kn); + new_prdtgrp = kernfs_to_rdtgroup(new_parent); + if (!rdtgrp || !new_prdtgrp) + return -ENOENT; + + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ + rdtgroup_kn_get(rdtgrp, kn); + rdtgroup_kn_get(new_prdtgrp, new_parent); + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + /* + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if + * either kernfs_node is a file. + */ + if (kernfs_type(kn) != KERNFS_DIR || + kernfs_type(new_parent) != KERNFS_DIR) { + rdt_last_cmd_puts("Source and destination must be directories"); + ret = -EPERM; + goto out; + } + + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { + ret = -ENOENT; + goto out; + } + + if (rdtgrp->type != RDTMON_GROUP || !kn->parent || + !is_mon_groups(kn->parent, kn->name)) { + rdt_last_cmd_puts("Source must be a MON group\n"); + ret = -EPERM; + goto out; + } + + if (!is_mon_groups(new_parent, new_name)) { + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); + ret = -EPERM; + goto out; + } + + /* + * If the MON group is monitoring CPUs, the CPUs must be assigned to the + * current parent CTRL_MON group and therefore cannot be assigned to + * the new parent, making the move illegal. + */ + if (!cpumask_empty(&rdtgrp->cpu_mask) && + rdtgrp->mon.parent != new_prdtgrp) { + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); + ret = -EPERM; + goto out; + } + + /* + * Allocate the cpumask for use in mongrp_reparent() to avoid the + * possibility of failing to allocate it after kernfs_rename() has + * succeeded. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out; + } + + /* + * Perform all input validation and allocations needed to ensure + * mongrp_reparent() will succeed before calling kernfs_rename(), + * otherwise it would be necessary to revert this call if + * mongrp_reparent() failed. + */ + ret = kernfs_rename(kn, new_parent, new_name); + if (!ret) + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); + + free_cpumask_var(tmpmask); + +out: + mutex_unlock(&rdtgroup_mutex); + rdtgroup_kn_put(rdtgrp, kn); + rdtgroup_kn_put(new_prdtgrp, new_parent); + return ret; +} + static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) @@ -3522,6 +3662,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { .mkdir = rdtgroup_mkdir, .rmdir = rdtgroup_rmdir, + .rename = rdtgroup_rename, .show_options = rdtgroup_show_options, }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..91fa70e51004 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -755,6 +755,7 @@ static void sgx_mmu_notifier_release(struct mmu_notifier *mn, { struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); struct sgx_encl_mm *tmp = NULL; + bool found = false; /* * The enclave itself can remove encl_mm. 
Note, objects can't be moved @@ -764,12 +765,13 @@ static void sgx_mmu_notifier_release(struct mmu_notifier *mn, list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { if (tmp == encl_mm) { list_del_rcu(&encl_mm->list); + found = true; break; } } spin_unlock(&encl_mm->encl->mm_lock); - if (tmp == encl_mm) { + if (found) { synchronize_srcu(&encl_mm->encl->srcu); mmu_notifier_put(mn); } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 5e868b62a7c4..0270925fe013 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -79,7 +79,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; - smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; } @@ -109,7 +109,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c) */ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); c->initial_apicid = edx; - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 3b58d8703094..6eaf9a6bc02f 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -9,6 +9,7 @@ #include <asm/processor.h> #include <asm/desc.h> #include <asm/traps.h> +#include <asm/doublefault.h> #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0bf6779187dd..f18ca44c904b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -195,7 +195,6 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); - stack = stack ? : get_stack_pointer(task, regs); regs = unwind_get_entry_regs(&state, &partial); /* @@ -214,9 +213,13 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (stack = stack ?: get_stack_pointer(task, regs); + stack; + stack = stack_info.next_sp) { const char *stack_name; + stack = PTR_ALIGN(stack, sizeof(long)); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { /* * We weren't on a valid stack. 
It's possible that diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index 9fcfa5c4dad7..af5cbdd9bd29 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void) struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); - if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER))) + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index caf33486dc5e..1015af1ae562 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) this_cpu_write(in_kernel_fpu, true); - if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) && + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(¤t->thread.fpu); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 851eb13edc01..998a08f17e33 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -53,7 +53,7 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } -static bool fpu__probe_without_cpuid(void) +static bool __init fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -71,7 +71,7 @@ static bool fpu__probe_without_cpuid(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static void __init fpu__init_system_early_generic(void) { if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { @@ -211,10 +211,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ -void __init fpu__init_system(struct cpuinfo_x86 *c) +void __init fpu__init_system(void) { fpstate_reset(¤t->thread.fpu); - fpu__init_system_early_generic(c); + fpu__init_system_early_generic(); /* * The FPU has to be operational for some of the diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5e7ead52cfdb..01e8f34daf22 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -525,9 +525,6 @@ static void *addr_from_call(void *ptr) return ptr + CALL_INSN_SIZE + call.disp; } -void prepare_ftrace_return(unsigned long ip, unsigned long *parent, - unsigned long frame_pointer); - /* * If the ops->trampoline was not allocated, then it probably * has a static trampoline func, or is the ftrace caller itself. diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 10c27b4261eb..246a609f889b 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -69,6 +69,7 @@ asmlinkage __visible void __init __noreturn i386_start_kernel(void) * to the first kernel PMD. Note the upper half of each PMD or PTE are * always zero at this stage. */ +void __init mk_early_pgtbl_32(void); void __init mk_early_pgtbl_32(void) { #ifdef __pa diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 67c8ed99144b..c9318993f959 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -138,20 +138,6 @@ SYM_CODE_START(startup_32) jmp .Ldefault_entry SYM_CODE_END(startup_32) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Boot CPU0 entry point. It's called from play_dead(). Everything has been set - * up already except stack. 
We just set up stack here. Then call - * start_secondary(). - */ -SYM_FUNC_START(start_cpu0) - movl initial_stack, %ecx - movl %ecx, %esp - call *(initial_code) -1: jmp 1b -SYM_FUNC_END(start_cpu0) -#endif - /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a5df3e994f04..c5b9289837dc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,7 +24,9 @@ #include "../entry/calling.h" #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/apicdef.h> #include <asm/fixmap.h> +#include <asm/smp.h> /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -77,6 +79,15 @@ SYM_CODE_START_NOALIGN(startup_64) call startup_64_setup_env popq %rsi + /* Now switch to __KERNEL_CS so IRET works reliably */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + UNWIND_HINT_END_OF_STACK + #ifdef CONFIG_AMD_MEM_ENCRYPT /* * Activate SEV/SME memory encryption if supported/enabled. This needs to @@ -90,15 +101,6 @@ SYM_CODE_START_NOALIGN(startup_64) popq %rsi #endif - /* Now switch to __KERNEL_CS so IRET works reliably */ - pushq $__KERNEL_CS - leaq .Lon_kernel_cs(%rip), %rax - pushq %rax - lretq - -.Lon_kernel_cs: - UNWIND_HINT_END_OF_STACK - /* Sanitize CPU configuration */ call verify_cpu @@ -234,8 +236,67 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ANNOTATE_NOENDBR // above #ifdef CONFIG_SMP + /* + * For parallel boot, the APIC ID is read from the APIC, and then + * used to look up the CPU number. For booting a single CPU, the + * CPU number is encoded in smpboot_control. + * + * Bit 31 STARTUP_READ_APICID (Read APICID from APIC) + * Bit 0-23 CPU# if STARTUP_xx flags are not set + */ movl smpboot_control(%rip), %ecx + testl $STARTUP_READ_APICID, %ecx + jnz .Lread_apicid + /* + * No control bit set, single CPU bringup. CPU number is provided + * in bit 0-23. This is also the boot CPU case (CPU number 0). + */ + andl $(~STARTUP_PARALLEL_MASK), %ecx + jmp .Lsetup_cpu +.Lread_apicid: + /* Check whether X2APIC mode is already enabled */ + mov $MSR_IA32_APICBASE, %ecx + rdmsr + testl $X2APIC_ENABLE, %eax + jnz .Lread_apicid_msr + + /* Read the APIC ID from the fix-mapped MMIO space. */ + movq apic_mmio_base(%rip), %rcx + addq $APIC_ID, %rcx + movl (%rcx), %eax + shr $24, %eax + jmp .Llookup_AP + +.Lread_apicid_msr: + mov $APIC_X2APIC_ID_MSR, %ecx + rdmsr + +.Llookup_AP: + /* EAX contains the APIC ID of the current CPU */ + xorq %rcx, %rcx + leaq cpuid_to_apicid(%rip), %rbx + +.Lfind_cpunr: + cmpl (%rbx,%rcx,4), %eax + jz .Lsetup_cpu + inc %ecx +#ifdef CONFIG_FORCE_NR_CPUS + cmpl $NR_CPUS, %ecx +#else + cmpl nr_cpu_ids(%rip), %ecx +#endif + jb .Lfind_cpunr + + /* APIC ID not found in the table. Drop the trampoline lock and bail. */ + movq trampoline_lock(%rip), %rax + movl $0, (%rax) + +1: cli + hlt + jmp 1b + +.Lsetup_cpu: /* Get the per cpu offset for the given CPU# which is in ECX */ movq __per_cpu_offset(,%rcx,8), %rdx #else @@ -252,6 +313,16 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movq TASK_threadsp(%rax), %rsp /* + * Now that this CPU is running on its own stack, drop the realmode + * protection. For the boot CPU the pointer is NULL! 
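The new .Lread_apicid/.Lfind_cpunr path above fetches the local APIC ID (from the x2APIC MSR if enabled, otherwise from the fix-mapped MMIO register) and scans cpuid_to_apicid[] to recover the CPU number. A rough C rendering of that scan, assuming a flat array and the same park-on-miss behaviour (illustrative only; the real lookup has to stay in assembly because no C environment exists at that point):

/* C model of the .Lfind_cpunr loop: return the CPU number for @apicid, or -1
 * if it is unknown, in which case the assembly drops the trampoline lock and
 * parks the CPU in a cli/hlt loop. */
#include <stdint.h>

int find_cpu_number(uint32_t apicid, const uint32_t *cpuid_to_apicid,
                    unsigned int nr_cpu_ids)
{
        for (unsigned int cpu = 0; cpu < nr_cpu_ids; cpu++) {
                if (cpuid_to_apicid[cpu] == apicid)
                        return (int)cpu;
        }
        return -1;
}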
+ */ + movq trampoline_lock(%rip), %rax + testq %rax, %rax + jz .Lsetup_gdt + movl $0, (%rax) + +.Lsetup_gdt: + /* * We must switch to a new descriptor in kernel space for the GDT * because soon the kernel won't have access anymore to the userspace * addresses where we're currently running on. We have to do that here @@ -375,13 +446,13 @@ SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" #include "sev_verify_cbit.S" -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) /* - * Boot CPU0 entry point. It's called from play_dead(). Everything has been set - * up already except stack. We just set up stack here. Then call - * start_secondary() via .Ljump_to_C_code. + * Entry point for soft restart of a CPU. Invoked from xxx_play_dead() for + * restarting the boot CPU or for restarting SEV guest CPUs after CPU hot + * unplug. Everything is set up already except the stack. */ -SYM_CODE_START(start_cpu0) +SYM_CODE_START(soft_restart_cpu) ANNOTATE_NOENDBR UNWIND_HINT_END_OF_STACK @@ -390,7 +461,7 @@ SYM_CODE_START(start_cpu0) movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code -SYM_CODE_END(start_cpu0) +SYM_CODE_END(soft_restart_cpu) #endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -433,6 +504,8 @@ SYM_DATA(initial_code, .quad x86_64_start_kernel) #ifdef CONFIG_AMD_MEM_ENCRYPT SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) #endif + +SYM_DATA(trampoline_lock, .quad 0); __FINITDATA __INIT diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 766ffe3ba313..9f668d2f3d11 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -211,6 +211,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR + sum += irq_stats(cpu)->irq_hv_callback_count; +#endif +#if IS_ENABLED(CONFIG_HYPERV) + sum += irq_stats(cpu)->irq_hv_reenlightenment_count; + sum += irq_stats(cpu)->hyperv_stimer0_count; +#endif #ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 670eb08b972a..ee4fe8cdb857 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -165,32 +165,19 @@ int arch_asym_cpu_priority(int cpu) /** * sched_set_itmt_core_prio() - Set CPU priority based on ITMT - * @prio: Priority of cpu core - * @core_cpu: The cpu number associated with the core + * @prio: Priority of @cpu + * @cpu: The CPU number * * The pstate driver will find out the max boost frequency * and call this function to set a priority proportional - * to the max boost frequency. CPU with higher boost + * to the max boost frequency. CPUs with higher boost * frequency will receive higher priority. * * No need to rebuild sched domain after updating * the CPU priorities. The sched domains have no * dependency on CPU priorities. */ -void sched_set_itmt_core_prio(int prio, int core_cpu) +void sched_set_itmt_core_prio(int prio, int cpu) { - int cpu, i = 1; - - for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { - int smt_prio; - - /* - * Ensure that the siblings are moved to the end - * of the priority chain and only used when - * all other high priority cpus are out of capacity. 
- */ - smt_prio = prio * smp_num_siblings / (i * i); - per_cpu(sched_core_priority, cpu) = smt_prio; - i++; - } + per_cpu(sched_core_priority, cpu) = prio; } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 0f35d44c56fe..fb8f52149be9 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struct timespec64 *now) return -ENODEV; } -static noinstr u64 kvm_clock_read(void) +static u64 kvm_clock_read(void) { u64 ret; @@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs) static noinstr u64 kvm_sched_clock_read(void) { - return kvm_clock_read() - kvm_sched_clock_offset; + return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset; } static inline void kvm_sched_clock_init(bool stable) diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index b348a672f71d..b525fe6d6657 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> +#include <linux/pnp.h> #include <asm/setup.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index dac41a0072ea..ff9b80a0e3e3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -759,15 +759,26 @@ bool xen_set_default_idle(void) } #endif +struct cpumask cpus_stop_mask; + void __noreturn stop_this_cpu(void *dummy) { + struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); + unsigned int cpu = smp_processor_id(); + local_irq_disable(); + /* - * Remove this CPU: + * Remove this CPU from the online mask and disable it + * unconditionally. This might be redundant in case that the reboot + * vector was handled late and stop_other_cpus() sent an NMI. + * + * According to SDM and APM NMIs can be accepted even after soft + * disabling the local APIC. */ - set_cpu_online(smp_processor_id(), false); + set_cpu_online(cpu, false); disable_local_APIC(); - mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + mcheck_cpu_clear(c); /* * Use wbinvd on processors that support SME. This provides support @@ -781,8 +792,17 @@ void __noreturn stop_this_cpu(void *dummy) * Test the CPUID bit directly because the machine might've cleared * X86_FEATURE_SME due to cmdline options. */ - if (cpuid_eax(0x8000001f) & BIT(0)) + if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) native_wbinvd(); + + /* + * This brings a cache line back and dirties it, but + * native_stop_other_cpus() will overwrite cpus_stop_mask after it + * observed that all CPUs reported stop. This write will invalidate + * the related cache line on this CPU. + */ + cpumask_clear_cpu(cpu, &cpus_stop_mask); + for (;;) { /* * Use native_halt() so that memory contents don't change diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 16babff771bd..0cccfeb67c3a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1037,6 +1037,8 @@ void __init setup_arch(char **cmdline_p) /* * VMware detection requires dmi to be available, so this * needs to be done after dmi_setup(), for the boot CPU. + * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be + * called before cache_bp_init() for setting up MTRR state. 
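The stop_this_cpu() rework above makes each stopping CPU clear its bit in cpus_stop_mask as its last visible store before parking, which is what native_stop_other_cpus() (in the smp.c hunk further down) polls for. A toy threaded model of that report-and-park handshake, using C11 atomics in place of cpumask_clear_cpu()/cpumask_empty() (illustrative; the kernel side additionally has timeouts and NMI/INIT fallbacks):

/* Workers clear their bit as the final step of "stopping"; the control side
 * spins until the mask drains. Toy model only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_WORKERS 4

static atomic_uint stop_mask;   /* bit n set: worker n has not parked yet */

static void *worker(void *arg)
{
        unsigned int bit = 1u << (unsigned long)arg;

        /* ... shutdown work: disable APIC, clear MCE state, ... */

        /* Final store: report that this CPU reached its parking loop. */
        atomic_fetch_and_explicit(&stop_mask, ~bit, memory_order_release);
        return NULL;    /* the real CPU sits in a hlt loop instead */
}

int main(void)
{
        pthread_t t[NR_WORKERS];

        atomic_store(&stop_mask, (1u << NR_WORKERS) - 1);
        for (unsigned long i = 0; i < NR_WORKERS; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);

        /* Control side: wait until every worker has reported. */
        while (atomic_load_explicit(&stop_mask, memory_order_acquire))
                ;
        puts("all workers reported stop");

        for (unsigned long i = 0; i < NR_WORKERS; i++)
                pthread_join(t[i], NULL);
        return 0;
}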
*/ init_hypervisor_platform(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 3a5b0c9c4fcc..2eabccde94fb 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -12,6 +12,9 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) +#else +#undef WARN +#define WARN(condition, format...) (!!(condition)) #endif /* I/O parameters for CPUID-related helpers */ @@ -991,3 +994,103 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) cpuid_ext_range_max = fn->eax; } } + +static void pvalidate_pages(struct snp_psc_desc *desc) +{ + struct psc_entry *e; + unsigned long vaddr; + unsigned int size; + unsigned int i; + bool validate; + int rc; + + for (i = 0; i <= desc->hdr.end_entry; i++) { + e = &desc->entries[i]; + + vaddr = (unsigned long)pfn_to_kaddr(e->gfn); + size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; + validate = e->operation == SNP_PAGE_STATE_PRIVATE; + + rc = pvalidate(vaddr, size, validate); + if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { + unsigned long vaddr_end = vaddr + PMD_SIZE; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (rc) + break; + } + } + + if (rc) { + WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + } + } +} + +static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct es_em_ctxt ctxt; + + vc_ghcb_invalidate(ghcb); + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. 
+ */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + return ret; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index b031244d6d2d..1ee7bed453de 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -113,13 +113,23 @@ struct ghcb_state { }; static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); - static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); struct sev_config { __u64 debug : 1, - __reserved : 63; + + /* + * A flag used by __set_pages_state() that indicates when the + * per-CPU GHCB has been created and registered and thus can be + * used by the BSP instead of the early boot GHCB. + * + * For APs, the per-CPU GHCB is created before they are started + * and registered upon startup, so this flag can be used globally + * for the BSP and APs. + */ + ghcbs_initialized : 1, + + __reserved : 62; }; static struct sev_config sev_cfg __read_mostly; @@ -645,32 +655,26 @@ static u64 __init get_jump_table_addr(void) return ret; } -static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) -{ - unsigned long vaddr_end; - int rc; - - vaddr = vaddr & PAGE_MASK; - vaddr_end = vaddr + (npages << PAGE_SHIFT); - - while (vaddr < vaddr_end) { - rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); - if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); - - vaddr = vaddr + PAGE_SIZE; - } -} - -static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +static void early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op) { unsigned long paddr_end; u64 val; + int ret; + + vaddr = vaddr & PAGE_MASK; paddr = paddr & PAGE_MASK; paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { + if (op == SNP_PAGE_STATE_SHARED) { + /* Page validation must be rescinded before changing to shared */ + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) + goto e_term; + } + /* * Use the MSR protocol because this function can be called before * the GHCB is established. @@ -691,7 +695,15 @@ static void __init early_set_pages_state(unsigned long paddr, unsigned int npage paddr, GHCB_MSR_PSC_RESP_VAL(val))) goto e_term; - paddr = paddr + PAGE_SIZE; + if (op == SNP_PAGE_STATE_PRIVATE) { + /* Page validation must be performed after changing to private */ + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) + goto e_term; + } + + vaddr += PAGE_SIZE; + paddr += PAGE_SIZE; } return; @@ -701,7 +713,7 @@ e_term: } void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned int npages) + unsigned long npages) { /* * This can be invoked in early boot while running identity mapped, so @@ -716,14 +728,11 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * Ask the hypervisor to mark the memory pages as private in the RMP * table. */ - early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); - - /* Validate the memory pages after they've been added in the RMP table. 
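The early_set_pages_state() change above folds PVALIDATE into the per-page MSR-protocol loop so the required ordering lives in one place: validation is rescinded before a page is made shared, and performed after a page is made private. A standalone sketch of just that ordering, with hypothetical stubs standing in for pvalidate() and the GHCB MSR protocol request:

/*
 * Ordering model only; the two stub helpers are not kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

enum psc_op { PSC_PRIVATE, PSC_SHARED };

static void pvalidate_stub(unsigned long vaddr, bool validate)
{
        printf("  pvalidate   0x%lx validate=%d\n", vaddr, validate);
}

static void msr_psc_request_stub(unsigned long paddr, enum psc_op op)
{
        printf("  psc request 0x%lx -> %s\n", paddr,
               op == PSC_PRIVATE ? "private" : "shared");
}

static void set_page_state(unsigned long vaddr, unsigned long paddr, enum psc_op op)
{
        if (op == PSC_SHARED)
                pvalidate_stub(vaddr, false);     /* rescind before going shared */

        msr_psc_request_stub(paddr, op);          /* RMP state change */

        if (op == PSC_PRIVATE)
                pvalidate_stub(vaddr, true);      /* validate after going private */
}

int main(void)
{
        puts("to private:");
        set_page_state(0x1000, 0x1000, PSC_PRIVATE);
        puts("to shared:");
        set_page_state(0x2000, 0x2000, PSC_SHARED);
        return 0;
}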
*/ - pvalidate_pages(vaddr, npages, true); + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); } void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned int npages) + unsigned long npages) { /* * This can be invoked in early boot while running identity mapped, so @@ -734,11 +743,8 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) return; - /* Invalidate the memory pages before they are marked shared in the RMP table. */ - pvalidate_pages(vaddr, npages, false); - /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); } void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) @@ -756,96 +762,16 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op WARN(1, "invalid memory op %d\n", op); } -static int vmgexit_psc(struct snp_psc_desc *desc) +static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) { - int cur_entry, end_entry, ret = 0; - struct snp_psc_desc *data; struct ghcb_state state; - struct es_em_ctxt ctxt; - unsigned long flags; - struct ghcb *ghcb; - - /* - * __sev_get_ghcb() needs to run with IRQs disabled because it is using - * a per-CPU GHCB. - */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - if (!ghcb) { - ret = 1; - goto out_unlock; - } - - /* Copy the input desc into GHCB shared buffer */ - data = (struct snp_psc_desc *)ghcb->shared_buffer; - memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - - /* - * As per the GHCB specification, the hypervisor can resume the guest - * before processing all the entries. Check whether all the entries - * are processed. If not, then keep retrying. Note, the hypervisor - * will update the data memory directly to indicate the status, so - * reference the data->hdr everywhere. - * - * The strategy here is to wait for the hypervisor to change the page - * state in the RMP table before guest accesses the memory pages. If the - * page state change was not successful, then later memory access will - * result in a crash. - */ - cur_entry = data->hdr.cur_entry; - end_entry = data->hdr.end_entry; - - while (data->hdr.cur_entry <= data->hdr.end_entry) { - ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - - /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); - - /* - * Page State Change VMGEXIT can pass error code through - * exit_info_2. - */ - if (WARN(ret || ghcb->save.sw_exit_info_2, - "SNP: PSC failed ret=%d exit_info_2=%llx\n", - ret, ghcb->save.sw_exit_info_2)) { - ret = 1; - goto out; - } - - /* Verify that reserved bit is not set */ - if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { - ret = 1; - goto out; - } - - /* - * Sanity check that entry processing is not going backwards. - * This will happen only if hypervisor is tricking us. 
- */ - if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, -"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", - end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { - ret = 1; - goto out; - } - } - -out: - __sev_put_ghcb(&state); - -out_unlock: - local_irq_restore(flags); - - return ret; -} - -static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, - unsigned long vaddr_end, int op) -{ + bool use_large_entry; struct psc_hdr *hdr; struct psc_entry *e; + unsigned long flags; unsigned long pfn; + struct ghcb *ghcb; int i; hdr = &data->hdr; @@ -854,74 +780,104 @@ static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, memset(data, 0, sizeof(*data)); i = 0; - while (vaddr < vaddr_end) { - if (is_vmalloc_addr((void *)vaddr)) + while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { + hdr->end_entry = i; + + if (is_vmalloc_addr((void *)vaddr)) { pfn = vmalloc_to_pfn((void *)vaddr); - else + use_large_entry = false; + } else { pfn = __pa(vaddr) >> PAGE_SHIFT; + use_large_entry = true; + } e->gfn = pfn; e->operation = op; - hdr->end_entry = i; - /* - * Current SNP implementation doesn't keep track of the RMP page - * size so use 4K for simplicity. - */ - e->pagesize = RMP_PG_SIZE_4K; + if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && + (vaddr_end - vaddr) >= PMD_SIZE) { + e->pagesize = RMP_PG_SIZE_2M; + vaddr += PMD_SIZE; + } else { + e->pagesize = RMP_PG_SIZE_4K; + vaddr += PAGE_SIZE; + } - vaddr = vaddr + PAGE_SIZE; e++; i++; } - if (vmgexit_psc(data)) + /* Page validation must be rescinded before changing to shared */ + if (op == SNP_PAGE_STATE_SHARED) + pvalidate_pages(data); + + local_irq_save(flags); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else + ghcb = boot_ghcb; + + /* Invoke the hypervisor to perform the page state changes */ + if (!ghcb || vmgexit_psc(ghcb, data)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Page validation must be performed after changing to private */ + if (op == SNP_PAGE_STATE_PRIVATE) + pvalidate_pages(data); + + return vaddr; } -static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) { - unsigned long vaddr_end, next_vaddr; - struct snp_psc_desc *desc; + struct snp_psc_desc desc; + unsigned long vaddr_end; - desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); - if (!desc) - panic("SNP: failed to allocate memory for PSC descriptor\n"); + /* Use the MSR protocol when a GHCB is not available. */ + if (!boot_ghcb) + return early_set_pages_state(vaddr, __pa(vaddr), npages, op); vaddr = vaddr & PAGE_MASK; vaddr_end = vaddr + (npages << PAGE_SHIFT); - while (vaddr < vaddr_end) { - /* Calculate the last vaddr that fits in one struct snp_psc_desc. 
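__set_pages_state() above now emits a 2M RMP entry whenever the linear address is 2M-aligned, at least 2M remains, and the mapping is not vmalloc-backed, falling back to 4K otherwise. A small standalone model of that chunking decision (constants and helper are illustrative):

/* Entry-size choice: 2M when aligned and enough room remains, else 4K.
 * vmalloc addresses always take the 4K path in the real code because their
 * physical pages are not contiguous. */
#include <stdio.h>

#define SZ_4K   0x1000UL
#define SZ_2M   0x200000UL

static unsigned long next_chunk(unsigned long vaddr, unsigned long vaddr_end,
                                int large_ok)
{
        if (large_ok && !(vaddr & (SZ_2M - 1)) && vaddr_end - vaddr >= SZ_2M)
                return SZ_2M;
        return SZ_4K;
}

int main(void)
{
        unsigned long vaddr = 0x1ff000, end = 0x600000;

        while (vaddr < end) {
                unsigned long step = next_chunk(vaddr, end, 1);

                printf("entry: vaddr=0x%lx size=%s\n", vaddr,
                       step == SZ_2M ? "2M" : "4K");
                vaddr += step;
        }
        return 0;
}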
*/ - next_vaddr = min_t(unsigned long, vaddr_end, - (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); - - __set_pages_state(desc, vaddr, next_vaddr, op); - - vaddr = next_vaddr; - } - - kfree(desc); + while (vaddr < vaddr_end) + vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op); } -void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; - pvalidate_pages(vaddr, npages, false); - set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); } -void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ + unsigned long vaddr; + unsigned int npages; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + vaddr = (unsigned long)__va(start); + npages = (end - start) >> PAGE_SHIFT; - pvalidate_pages(vaddr, npages, true); + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); } static int snp_set_vmsa(void *va, bool vmsa) @@ -1267,6 +1223,8 @@ void setup_ghcb(void) if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) snp_register_per_cpu_ghcb(); + sev_cfg.ghcbs_initialized = true; + return; } @@ -1328,7 +1286,7 @@ static void sev_es_play_dead(void) * If we get here, the VCPU was woken up again. Jump to CPU * startup code to get it back online. */ - start_cpu0(); + soft_restart_cpu(); } #else /* CONFIG_HOTPLUG_CPU */ #define sev_es_play_dead native_play_dead @@ -1395,9 +1353,6 @@ void __init sev_es_init_vc_handling(void) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } - /* Enable SEV-ES special handling */ - static_branch_enable(&sev_es_enable_key); - /* Initialize per-cpu GHCB pages */ for_each_possible_cpu(cpu) { alloc_runtime_data(cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 004cb30b7419..cfeec3ee877e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -182,7 +182,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, static unsigned long __ro_after_init max_frame_size; static unsigned int __ro_after_init fpu_default_state_size; -void __init init_sigframe_size(void) +static int __init init_sigframe_size(void) { fpu_default_state_size = fpu__get_fpstate_size(); @@ -194,7 +194,9 @@ void __init init_sigframe_size(void) max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); pr_info("max sigframe size: %lu\n", max_frame_size); + return 0; } +early_initcall(init_sigframe_size); unsigned long get_sigframe_size(void) { diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 375b33ecafa2..7eb18ca7bd45 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -21,12 +21,14 @@ #include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/gfp.h> +#include <linux/kexec.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/cpu.h> #include <asm/idtentry.h> #include <asm/nmi.h> #include <asm/mce.h> @@ -129,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) } /* - * this function calls the 'stop' function on all other CPUs in the system. + * Disable virtualization, APIC etc. 
and park the CPU in a HLT loop */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { @@ -146,61 +148,96 @@ static int register_stop_handler(void) static void native_stop_other_cpus(int wait) { - unsigned long flags; - unsigned long timeout; + unsigned int cpu = smp_processor_id(); + unsigned long flags, timeout; if (reboot_force) return; - /* - * Use an own vector here because smp_call_function - * does lots of things not suitable in a panic situation. - */ + /* Only proceed if this is the first CPU to reach this code */ + if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) + return; + + /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ + if (kexec_in_progress) + smp_kick_mwait_play_dead(); /* - * We start by using the REBOOT_VECTOR irq. - * The irq is treated as a sync point to allow critical - * regions of code on other cpus to release their spin locks - * and re-enable irqs. Jumping straight to an NMI might - * accidentally cause deadlocks with further shutdown/panic - * code. By syncing, we give the cpus up to one second to - * finish their work before we force them off with the NMI. + * 1) Send an IPI on the reboot vector to all other CPUs. + * + * The other CPUs should react on it after leaving critical + * sections and re-enabling interrupts. They might still hold + * locks, but there is nothing which can be done about that. + * + * 2) Wait for all other CPUs to report that they reached the + * HLT loop in stop_this_cpu() + * + * 3) If the system uses INIT/STARTUP for CPU bringup, then + * send all present CPUs an INIT vector, which brings them + * completely out of the way. + * + * 4) If #3 is not possible and #2 timed out send an NMI to the + * CPUs which did not yet report + * + * 5) Wait for all other CPUs to report that they reached the + * HLT loop in stop_this_cpu() + * + * #4 can obviously race against a CPU reaching the HLT loop late. + * That CPU will have reported already and the "have all CPUs + * reached HLT" condition will be true despite the fact that the + * other CPU is still handling the NMI. Again, there is no + * protection against that as "disabled" APICs still respond to + * NMIs. */ - if (num_online_cpus() > 1) { - /* did someone beat us here? */ - if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) - return; - - /* sync above data before sending IRQ */ - wmb(); + cpumask_copy(&cpus_stop_mask, cpu_online_mask); + cpumask_clear_cpu(cpu, &cpus_stop_mask); + if (!cpumask_empty(&cpus_stop_mask)) { apic_send_IPI_allbutself(REBOOT_VECTOR); /* * Don't wait longer than a second for IPI completion. The * wait request is not checked here because that would - * prevent an NMI shutdown attempt in case that not all + * prevent an NMI/INIT shutdown in case that not all * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && timeout--) + while (!cpumask_empty(&cpus_stop_mask) && timeout--) udelay(1); } - /* if the REBOOT_VECTOR didn't work, try with the NMI */ - if (num_online_cpus() > 1) { + /* + * Park all other CPUs in INIT including "offline" CPUs, if + * possible. That's a safe place where they can't resume execution + * of HLT and then execute the HLT loop from overwritten text or + * page tables. + * + * The only downside is a broadcast MCE, but up to the point where + * the kexec() kernel brought all APs online again an MCE will just + * make HLT resume and handle the MCE. The machine crashes and burns + * due to overwritten text, page tables and data. So there is a + * choice between fire and frying pan. 
The result is pretty much + * the same. Chose frying pan until x86 provides a sane mechanism + * to park a CPU. + */ + if (smp_park_other_cpus_in_init()) + goto done; + + /* + * If park with INIT was not possible and the REBOOT_VECTOR didn't + * take all secondary CPUs offline, try with the NMI. + */ + if (!cpumask_empty(&cpus_stop_mask)) { /* * If NMI IPI is enabled, try to register the stop handler * and send the IPI. In any case try to wait for the other * CPUs to stop. */ if (!smp_no_nmi_ipi && !register_stop_handler()) { - /* Sync above data before sending IRQ */ - wmb(); - pr_emerg("Shutting down cpus with NMI\n"); - apic_send_IPI_allbutself(NMI_VECTOR); + for_each_cpu(cpu, &cpus_stop_mask) + apic->send_IPI(cpu, NMI_VECTOR); } /* * Don't wait longer than 10 ms if the caller didn't @@ -208,14 +245,21 @@ static void native_stop_other_cpus(int wait) * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; - while (num_online_cpus() > 1 && (wait || timeout--)) + while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--)) udelay(1); } +done: local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); local_irq_restore(flags); + + /* + * Ensure that the cpus_stop_mask cache lines are invalidated on + * the other CPUs. See comment vs. SME in stop_this_cpu(). + */ + cpumask_clear(&cpus_stop_mask); } /* @@ -268,8 +312,7 @@ struct smp_ops smp_ops = { #endif .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, + .kick_ap_alive = native_kick_ap, .cpu_disable = native_cpu_disable, .play_dead = native_play_dead, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 352f0ce1ece4..ed2d51960a7d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -53,10 +53,13 @@ #include <linux/tboot.h> #include <linux/gfp.h> #include <linux/cpuidle.h> +#include <linux/kexec.h> #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> #include <linux/stackprotector.h> +#include <linux/cpuhotplug.h> +#include <linux/mc146818rtc.h> #include <asm/acpi.h> #include <asm/cacheinfo.h> @@ -74,7 +77,7 @@ #include <asm/fpu/api.h> #include <asm/setup.h> #include <asm/uv/uv.h> -#include <linux/mc146818rtc.h> +#include <asm/microcode.h> #include <asm/i8259.h> #include <asm/misc.h> #include <asm/qspinlock.h> @@ -101,6 +104,26 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* CPUs which are the primary SMT threads */ +struct cpumask __cpu_primary_thread_mask __read_mostly; + +/* Representing CPUs for which sibling maps can be computed */ +static cpumask_var_t cpu_sibling_setup_mask; + +struct mwait_cpu_dead { + unsigned int control; + unsigned int status; +}; + +#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF +#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD + +/* + * Cache line aligned data for mwait_play_dead(). Separate on purpose so + * that it's unlikely to be touched by other CPUs. + */ +static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); + /* Logical package management. 
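native_stop_other_cpus() above now starts with atomic_cmpxchg(&stopping_cpu, -1, cpu), so exactly one CPU wins the right to run the shutdown sequence and any later caller returns immediately. The same claim-once pattern in a standalone C11 sketch (names are illustrative):

/* Whoever flips stopping_cpu from -1 to its own id proceeds; everyone else
 * backs off. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int stopping_cpu = -1;

static int try_become_stopper(int cpu)
{
        int expected = -1;

        return atomic_compare_exchange_strong(&stopping_cpu, &expected, cpu);
}

int main(void)
{
        printf("cpu1 wins: %d\n", try_become_stopper(1));   /* 1 */
        printf("cpu3 wins: %d\n", try_become_stopper(3));   /* 0 */
        return 0;
}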
We might want to allocate that dynamically */ unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); @@ -121,7 +144,6 @@ int arch_update_cpu_topology(void) return retval; } - static unsigned int smpboot_warm_reset_vector_count; static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) @@ -154,66 +176,63 @@ static inline void smpboot_restore_warm_reset_vector(void) } -/* - * Report back to the Boot Processor during boot time or to the caller processor - * during CPU online. - */ -static void smp_callin(void) +/* Run the next set of setup steps for the upcoming CPU */ +static void ap_starting(void) { - int cpuid; + int cpuid = smp_processor_id(); - /* - * If waken up by an INIT in an 82489DX configuration - * cpu_callout_mask guarantees we don't get here before - * an INIT_deassert IPI reaches our local APIC, so it is - * now safe to touch our local APIC. - */ - cpuid = smp_processor_id(); + /* Mop up eventual mwait_play_dead() wreckage */ + this_cpu_write(mwait_cpu_dead.status, 0); + this_cpu_write(mwait_cpu_dead.control, 0); /* - * the boot CPU has finished the init stage and is spinning - * on callin_map until we finish. We are free to set up this - * CPU, first the APIC. (this is probably redundant on most - * boards) + * If woken up by an INIT in an 82489DX configuration the alive + * synchronization guarantees that the CPU does not reach this + * point before an INIT_deassert IPI reaches the local APIC, so it + * is now safe to touch the local APIC. + * + * Set up this CPU, first the APIC, which is probably redundant on + * most boards. */ apic_ap_setup(); - /* - * Save our processor parameters. Note: this information - * is needed for clock calibration. - */ + /* Save the processor parameters. */ smp_store_cpu_info(cpuid); /* * The topology information must be up to date before - * calibrate_delay() and notify_cpu_starting(). + * notify_cpu_starting(). */ - set_cpu_sibling_map(raw_smp_processor_id()); + set_cpu_sibling_map(cpuid); ap_init_aperfmperf(); - /* - * Get our bogomips. - * Update loops_per_jiffy in cpu_data. Previous call to - * smp_store_cpu_info() stored a value that is close but not as - * accurate as the value just calculated. - */ - calibrate_delay(); - cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; pr_debug("Stack at about %p\n", &cpuid); wmb(); + /* + * This runs the AP through all the cpuhp states to its target + * state CPUHP_ONLINE. + */ notify_cpu_starting(cpuid); +} +static void ap_calibrate_delay(void) +{ /* - * Allow the master to continue. + * Calibrate the delay loop and update loops_per_jiffy in cpu_data. + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. + * + * As this is invoked after the TSC synchronization check, + * calibrate_delay_is_known() will skip the calibration routine + * when TSC is synchronized across sockets. */ - cpumask_set_cpu(cpuid, cpu_callin_mask); + calibrate_delay(); + cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy; } -static int cpu0_logical_apicid; -static int enable_start_cpu0; /* * Activate a secondary processor. */ @@ -226,24 +245,63 @@ static void notrace start_secondary(void *unused) */ cr4_init(); -#ifdef CONFIG_X86_32 - /* switch away from the initial page table */ - load_cr3(swapper_pg_dir); - __flush_tlb_all(); -#endif - cpu_init_secondary(); + /* + * 32-bit specific. 64-bit reaches this code with the correct page + * table established. Yet another historical divergence. 
+ */ + if (IS_ENABLED(CONFIG_X86_32)) { + /* switch away from the initial page table */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + } + + cpu_init_exception_handling(); + + /* + * 32-bit systems load the microcode from the ASM startup code for + * historical reasons. + * + * On 64-bit systems load it before reaching the AP alive + * synchronization point below so it is not part of the full per + * CPU serialized bringup part when "parallel" bringup is enabled. + * + * That's even safe when hyperthreading is enabled in the CPU as + * the core code starts the primary threads first and leaves the + * secondary threads waiting for SIPI. Loading microcode on + * physical cores concurrently is a safe operation. + * + * This covers both the Intel specific issue that concurrent + * microcode loading on SMT siblings must be prohibited and the + * vendor independent issue that microcode loading which changes + * CPUID, MSRs etc. must be strictly serialized to maintain + * software state correctness. + */ + if (IS_ENABLED(CONFIG_X86_64)) + load_ucode_ap(); + + /* + * Synchronization point with the hotplug core. Sets this CPU's + * synchronization state to ALIVE and spin-waits for the control CPU to + * release this CPU for further bringup. + */ + cpuhp_ap_sync_alive(); + + cpu_init(); + fpu__init_cpu(); rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); - smp_callin(); - enable_start_cpu0 = 0; + ap_starting(); + + /* Check TSC synchronization with the control CPU. */ + check_tsc_sync_target(); - /* otherwise gcc will move up smp_processor_id before the cpu_init */ - barrier(); /* - * Check TSC synchronization with the boot CPU: + * Calibrate the delay loop after the TSC synchronization check. + * This allows skipping the calibration when TSC is synchronized + * across sockets. 
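start_secondary() now pauses at cpuhp_ap_sync_alive(): the AP publishes that it is alive and spin-waits until the control CPU releases it for the rest of bringup. A toy two-thread model of that synchronization point (states and helpers are invented for the example; the real implementation lives in the generic hotplug core):

/* AP publishes ALIVE then waits for RELEASED; the control side waits for
 * ALIVE and then releases. Spinning replaces the kernel's wait machinery. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { SYNC_DEAD, SYNC_ALIVE, SYNC_RELEASED };

static atomic_int sync_state = SYNC_DEAD;

static void *ap_thread(void *arg)
{
        (void)arg;
        /* ...early CPU init done; publish ALIVE and wait for release... */
        atomic_store_explicit(&sync_state, SYNC_ALIVE, memory_order_release);
        while (atomic_load_explicit(&sync_state, memory_order_acquire) != SYNC_RELEASED)
                ;
        puts("AP: released, continuing bringup");
        return NULL;
}

int main(void)
{
        pthread_t ap;

        pthread_create(&ap, NULL, ap_thread, NULL);

        /* Control CPU: wait for the AP to report in, then let it continue. */
        while (atomic_load_explicit(&sync_state, memory_order_acquire) != SYNC_ALIVE)
                ;
        puts("control: AP is alive, releasing it");
        atomic_store_explicit(&sync_state, SYNC_RELEASED, memory_order_release);

        pthread_join(ap, NULL);
        return 0;
}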
*/ - check_tsc_sync_target(); + ap_calibrate_delay(); speculative_store_bypass_ht_init(); @@ -257,7 +315,6 @@ static void notrace start_secondary(void *unused) set_cpu_online(smp_processor_id(), true); lapic_online(); unlock_vector_lock(); - cpu_set_state_online(smp_processor_id()); x86_platform.nmi_init(); /* enable local interrupts */ @@ -270,15 +327,6 @@ static void notrace start_secondary(void *unused) } /** - * topology_is_primary_thread - Check whether CPU is the primary SMT thread - * @cpu: CPU to check - */ -bool topology_is_primary_thread(unsigned int cpu) -{ - return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); -} - -/** * topology_smt_supported - Check whether SMT is supported by the CPUs */ bool topology_smt_supported(void) @@ -288,6 +336,7 @@ bool topology_smt_supported(void) /** * topology_phys_to_logical_pkg - Map a physical package id to a logical + * @phys_pkg: The physical package id to map * * Returns logical package id or -1 if not found */ @@ -304,15 +353,17 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) return -1; } EXPORT_SYMBOL(topology_phys_to_logical_pkg); + /** * topology_phys_to_logical_die - Map a physical die id to logical + * @die_id: The physical die id to map + * @cur_cpu: The CPU for which the mapping is done * * Returns logical die id or -1 if not found */ -int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) +static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) { - int cpu; - int proc_id = cpu_data(cur_cpu).phys_proc_id; + int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id; for_each_possible_cpu(cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -323,7 +374,6 @@ int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) } return -1; } -EXPORT_SYMBOL(topology_phys_to_logical_die); /** * topology_update_package_map - Update the physical to logical package map @@ -398,7 +448,7 @@ void smp_store_cpu_info(int id) c->cpu_index = id; /* * During boot time, CPU0 has this setup already. Save the info when - * bringing up AP or offlined CPU0. + * bringing up an AP. */ identify_secondary_cpu(c); c->initialized = true; @@ -552,7 +602,7 @@ static int x86_core_flags(void) #ifdef CONFIG_SCHED_SMT static int x86_smt_flags(void) { - return cpu_smt_flags() | x86_sched_itmt_flags(); + return cpu_smt_flags(); } #endif #ifdef CONFIG_SCHED_CLUSTER @@ -563,50 +613,57 @@ static int x86_cluster_flags(void) #endif #endif -static struct sched_domain_topology_level x86_numa_in_package_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { NULL, }, -}; +/* + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours, Intel Cluster-on-Die, and Intel + * Sub-NUMA Clustering have this. 
+ */ +static bool x86_has_numa_in_package; -static struct sched_domain_topology_level x86_hybrid_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; +static struct sched_domain_topology_level x86_topology[6]; + +static void __init build_sched_topology(void) +{ + int i = 0; -static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + }; #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, + /* + * For now, skip the cluster domain on Hybrid. + */ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) + }; + } #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) + }; #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; + /* + * When there is NUMA topology inside the package skip the DIE domain + * since the NUMA domains will auto-magically create the right spanning + * domains based on the SLIT. + */ + if (!x86_has_numa_in_package) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, SD_INIT_NAME(DIE) + }; + } -/* - * Set if a package/die has multiple NUMA nodes inside. - * AMD Magny-Cours, Intel Cluster-on-Die, and Intel - * Sub-NUMA Clustering have this. - */ -static bool x86_has_numa_in_package; + /* + * There must be one trailing NULL entry left. + */ + BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + + set_sched_topology(x86_topology); +} void set_cpu_sibling_map(int cpu) { @@ -706,9 +763,9 @@ static void impress_friends(void) * Allow the user to impress friends. */ pr_debug("Before bogomips\n"); - for_each_possible_cpu(cpu) - if (cpumask_test_cpu(cpu, cpu_callout_mask)) - bogosum += cpu_data(cpu).loops_per_jiffy; + for_each_online_cpu(cpu) + bogosum += cpu_data(cpu).loops_per_jiffy; + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", num_online_cpus(), bogosum/(500000/HZ), @@ -795,86 +852,42 @@ static void __init smp_quirk_init_udelay(void) } /* - * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal - * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this - * won't ... remember to clear down the APIC, etc later. + * Wake up AP by INIT, INIT, STARTUP sequence. */ -int -wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) +static void send_init_sequence(int phys_apicid) { - u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; - unsigned long send_status, accept_status = 0; - int maxlvt; + int maxlvt = lapic_get_maxlvt(); - /* Target chip */ - /* Boot on the stack */ - /* Kick the second */ - apic_icr_write(APIC_DM_NMI | dm, apicid); - - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); + /* Be paranoid about clearing APIC errors. 
*/ if (APIC_INTEGRATED(boot_cpu_apic_version)) { - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + /* Due to the Pentium erratum 3AP. */ + if (maxlvt > 3) apic_write(APIC_ESR, 0); - accept_status = (apic_read(APIC_ESR) & 0xEF); + apic_read(APIC_ESR); } - pr_debug("NMI sent\n"); - if (send_status) - pr_err("APIC never delivered???\n"); - if (accept_status) - pr_err("APIC delivery error (%lx)\n", accept_status); + /* Assert INIT on the target CPU */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); + safe_apic_wait_icr_idle(); - return (send_status | accept_status); + udelay(init_udelay); + + /* Deassert INIT on the target CPU */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + safe_apic_wait_icr_idle(); } -static int -wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ +static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; - int maxlvt, num_starts, j; + int num_starts, j, maxlvt; + preempt_disable(); maxlvt = lapic_get_maxlvt(); - - /* - * Be paranoid about clearing APIC errors. - */ - if (APIC_INTEGRATED(boot_cpu_apic_version)) { - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } - - pr_debug("Asserting INIT\n"); - - /* - * Turn INIT on target chip - */ - /* - * Send IPI - */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, - phys_apicid); - - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); - - udelay(init_udelay); - - pr_debug("Deasserting INIT\n"); - - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + send_init_sequence(phys_apicid); mb(); @@ -945,15 +958,16 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) if (accept_status) pr_err("APIC delivery error (%lx)\n", accept_status); + preempt_enable(); return (send_status | accept_status); } /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { + static int width, node_width, first = 1; static int current_node = NUMA_NO_NODE; int node = early_cpu_to_node(cpu); - static int width, node_width; if (!width) width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ @@ -961,10 +975,10 @@ static void announce_cpu(int cpu, int apicid) if (!node_width) node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ - if (cpu == 1) - printk(KERN_INFO "x86: Booting SMP configuration:\n"); - if (system_state < SYSTEM_RUNNING) { + if (first) + pr_info("x86: Booting SMP configuration:\n"); + if (node != current_node) { if (current_node > (-1)) pr_cont("\n"); @@ -975,77 +989,16 @@ static void announce_cpu(int cpu, int apicid) } /* Add padding for the BSP */ - if (cpu == 1) + if (first) pr_cont("%*s", width + 1, " "); + first = 0; pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); - } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", node, cpu, apicid); } -static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) -{ - int cpu; - - cpu = smp_processor_id(); - if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) - return NMI_HANDLED; - - return NMI_DONE; -} - -/* - * Wake up AP by INIT, INIT, STARTUP sequence. 
- * - * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS - * boot-strap code which is not a desired behavior for waking up BSP. To - * void the boot-strap code, wake up CPU0 by NMI instead. - * - * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined - * (i.e. physically hot removed and then hot added), NMI won't wake it up. - * We'll change this code in the future to wake up hard offlined CPU0 if - * real platform and request are available. - */ -static int -wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, - int *cpu0_nmi_registered) -{ - int id; - int boot_error; - - preempt_disable(); - - /* - * Wake up AP by INIT, INIT, STARTUP sequence. - */ - if (cpu) { - boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); - goto out; - } - - /* - * Wake up BSP by nmi. - * - * Register a NMI handler to help wake up CPU0. - */ - boot_error = register_nmi_handler(NMI_LOCAL, - wakeup_cpu0_nmi, 0, "wake_cpu0"); - - if (!boot_error) { - enable_start_cpu0 = 1; - *cpu0_nmi_registered = 1; - id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid; - boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); - } - -out: - preempt_enable(); - - return boot_error; -} - int common_cpu_up(unsigned int cpu, struct task_struct *idle) { int ret; @@ -1071,17 +1024,13 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. - * Returns zero if CPU booted OK, else error code from + * Returns zero if startup was successfully sent, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, - int *cpu0_nmi_registered) +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { - /* start_ip had better be page-aligned! */ unsigned long start_ip = real_mode_header->trampoline_start; - - unsigned long boot_error = 0; - unsigned long timeout; + int ret; #ifdef CONFIG_X86_64 /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ @@ -1094,7 +1043,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, if (IS_ENABLED(CONFIG_X86_32)) { early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_stack = idle->thread.sp; - } else { + } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) { smpboot_control = cpu; } @@ -1108,7 +1057,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, * This grunge runs the startup process for * the targeted processor. */ - if (x86_platform.legacy.warm_reset) { pr_debug("Setting warm reset code and vector.\n"); @@ -1123,13 +1071,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, } } - /* - * AP might wait on cpu_callout_mask in cpu_init() with - * cpu_initialized_mask set if previous attempt to online - * it timed-out. Clear cpu_initialized_mask so that after - * INIT/SIPI it could start with a clean state. - */ - cpumask_clear_cpu(cpu, cpu_initialized_mask); smp_mb(); /* @@ -1137,66 +1078,25 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, * - Use a method from the APIC driver if one defined, with wakeup * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, - * - Use an INIT boot APIC message for APs or NMI for BSP. 
+ * - Use an INIT boot APIC message */ if (apic->wakeup_secondary_cpu_64) - boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + ret = apic->wakeup_secondary_cpu_64(apicid, start_ip); else if (apic->wakeup_secondary_cpu) - boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); + ret = apic->wakeup_secondary_cpu(apicid, start_ip); else - boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, - cpu0_nmi_registered); - - if (!boot_error) { - /* - * Wait 10s total for first sign of life from AP - */ - boot_error = -1; - timeout = jiffies + 10*HZ; - while (time_before(jiffies, timeout)) { - if (cpumask_test_cpu(cpu, cpu_initialized_mask)) { - /* - * Tell AP to proceed with initialization - */ - cpumask_set_cpu(cpu, cpu_callout_mask); - boot_error = 0; - break; - } - schedule(); - } - } - - if (!boot_error) { - /* - * Wait till AP completes initial initialization - */ - while (!cpumask_test_cpu(cpu, cpu_callin_mask)) { - /* - * Allow other tasks to run while we wait for the - * AP to come online. This also gives a chance - * for the MTRR work(triggered by the AP coming online) - * to be completed in the stop machine context. - */ - schedule(); - } - } + ret = wakeup_secondary_cpu_via_init(apicid, start_ip); - if (x86_platform.legacy.warm_reset) { - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); - } - - return boot_error; + /* If the wakeup mechanism failed, cleanup the warm reset vector */ + if (ret) + arch_cpuhp_cleanup_kick_cpu(cpu); + return ret; } -int native_cpu_up(unsigned int cpu, struct task_struct *tidle) +int native_kick_ap(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); - int cpu0_nmi_registered = 0; - unsigned long flags; - int err, ret = 0; + int err; lockdep_assert_irqs_enabled(); @@ -1210,24 +1110,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) } /* - * Already booted CPU? - */ - if (cpumask_test_cpu(cpu, cpu_callin_mask)) { - pr_debug("do_boot_cpu %d Already started\n", cpu); - return -ENOSYS; - } - - /* * Save current MTRR state in case it was changed since early boot * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: */ mtrr_save_state(); - /* x86 CPUs take themselves offline, so delayed offline is OK. */ - err = cpu_check_up_prepare(cpu); - if (err && err != -EBUSY) - return err; - /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; @@ -1235,41 +1122,44 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) if (err) return err; - err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); - if (err) { + err = do_boot_cpu(apicid, cpu, tidle); + if (err) pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); - ret = -EIO; - goto unreg_nmi; - } - /* - * Check TSC synchronization with the AP (keep irqs disabled - * while doing so): - */ - local_irq_save(flags); - check_tsc_sync_source(cpu); - local_irq_restore(flags); + return err; +} - while (!cpu_online(cpu)) { - cpu_relax(); - touch_nmi_watchdog(); - } +int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle) +{ + return smp_ops.kick_ap_alive(cpu, tidle); +} -unreg_nmi: - /* - * Clean up the nmi handler. Do this after the callin and callout sync - * to avoid impact of possible long unregister time. - */ - if (cpu0_nmi_registered) - unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); +void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) +{ + /* Cleanup possible dangling ends... 
*/ + if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset) + smpboot_restore_warm_reset_vector(); +} - return ret; +void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) +{ + if (smp_ops.cleanup_dead_cpu) + smp_ops.cleanup_dead_cpu(cpu); + + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); +} + +void arch_cpuhp_sync_state_poll(void) +{ + if (smp_ops.poll_sync_state) + smp_ops.poll_sync_state(); } /** - * arch_disable_smp_support() - disables SMP support for x86 at runtime + * arch_disable_smp_support() - Disables SMP support for x86 at boottime */ -void arch_disable_smp_support(void) +void __init arch_disable_smp_support(void) { disable_ioapic_support(); } @@ -1361,14 +1251,6 @@ static void __init smp_cpu_index_default(void) } } -static void __init smp_get_logical_apicid(void) -{ - if (x2apic_mode) - cpu0_logical_apicid = apic_read(APIC_LDR); - else - cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); -} - void __init smp_prepare_cpus_common(void) { unsigned int i; @@ -1379,7 +1261,6 @@ void __init smp_prepare_cpus_common(void) * Setup boot CPU information */ smp_store_boot_cpu_info(); /* Final full version of the data */ - cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); for_each_possible_cpu(i) { @@ -1390,18 +1271,24 @@ void __init smp_prepare_cpus_common(void) zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } - /* - * Set 'default' x86 topology, this matches default_topology() in that - * it has NUMA nodes as a topology level. See also - * native_smp_cpus_done(). - * - * Must be done before set_cpus_sibling_map() is ran. - */ - set_sched_topology(x86_topology); - set_cpu_sibling_map(0); } +#ifdef CONFIG_X86_64 +/* Establish whether parallel bringup can be supported. */ +bool __init arch_cpuhp_init_parallel_bringup(void) +{ + if (!x86_cpuinit.parallel_bringup) { + pr_info("Parallel CPU startup disabled by the platform\n"); + return false; + } + + smpboot_control = STARTUP_READ_APICID; + pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control); + return true; +} +#endif + /* * Prepare for SMP bootup. * @max_cpus: configured maximum number of CPUs, It is a legacy parameter @@ -1431,8 +1318,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); - smp_get_logical_apicid(); - pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); @@ -1455,6 +1340,25 @@ void arch_thaw_secondary_cpus_end(void) cache_aps_init(); } +bool smp_park_other_cpus_in_init(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + unsigned int apicid; + + if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu) + return false; + + for_each_present_cpu(cpu) { + if (cpu == this_cpu) + continue; + apicid = apic->cpu_present_to_apicid(cpu); + if (apicid == BAD_APICID) + continue; + send_init_sequence(apicid); + } + return true; +} + /* * Early setup to make printk work. 
*/ @@ -1466,9 +1370,6 @@ void __init native_smp_prepare_boot_cpu(void) if (!IS_ENABLED(CONFIG_SMP)) switch_gdt_and_percpu_base(me); - /* already set me in cpu_online_mask in boot_cpu_init() */ - cpumask_set_cpu(me, cpu_callout_mask); - cpu_set_state_online(me); native_pv_lock_init(); } @@ -1490,13 +1391,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done\n"); calculate_max_logical_packages(); - - /* XXX for now assume numa-in-package and hybrid don't overlap */ - if (x86_has_numa_in_package) - set_sched_topology(x86_numa_in_package_topology); - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - set_sched_topology(x86_hybrid_topology); - + build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init(); @@ -1592,6 +1487,12 @@ __init void prefill_possible_map(void) set_cpu_possible(i, true); } +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} + #ifdef CONFIG_HOTPLUG_CPU /* Recompute SMT state for all CPUs on offline */ @@ -1650,10 +1551,6 @@ static void remove_siblinginfo(int cpu) static void remove_cpu_from_maps(int cpu) { set_cpu_online(cpu, false); - cpumask_clear_cpu(cpu, cpu_callout_mask); - cpumask_clear_cpu(cpu, cpu_callin_mask); - /* was set by cpu_init() */ - cpumask_clear_cpu(cpu, cpu_initialized_mask); numa_remove_cpu(cpu); } @@ -1704,64 +1601,27 @@ int native_cpu_disable(void) return 0; } -int common_cpu_die(unsigned int cpu) -{ - int ret = 0; - - /* We don't do anything here: idle task is faking death itself. */ - - /* They ack this in play_dead() by setting CPU_DEAD */ - if (cpu_wait_death(cpu, 5)) { - if (system_state == SYSTEM_RUNNING) - pr_info("CPU %u is now offline\n", cpu); - } else { - pr_err("CPU %u didn't die...\n", cpu); - ret = -1; - } - - return ret; -} - -void native_cpu_die(unsigned int cpu) -{ - common_cpu_die(cpu); -} - void play_dead_common(void) { idle_task_exit(); - /* Ack it */ - (void)cpu_report_death(); - + cpuhp_ap_report_dead(); /* * With physical CPU hotplug, we should halt the cpu */ local_irq_disable(); } -/** - * cond_wakeup_cpu0 - Wake up CPU0 if needed. - * - * If NMI wants to wake up CPU0, start CPU0. - */ -void cond_wakeup_cpu0(void) -{ - if (smp_processor_id() == 0 && enable_start_cpu0) - start_cpu0(); -} -EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); - /* * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. */ static inline void mwait_play_dead(void) { + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; - void *mwait_ptr; int i; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || @@ -1796,12 +1656,9 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } - /* - * This should be a memory location in a cache line which is - * unlikely to be touched by other processors. The actual - * content is immaterial as it is not actually modified in any way. - */ - mwait_ptr = ¤t_thread_info()->flags; + /* Set up state for the kexec() hack below */ + md->status = CPUDEAD_MWAIT_WAIT; + md->control = CPUDEAD_MWAIT_WAIT; wbinvd(); @@ -1814,13 +1671,58 @@ static inline void mwait_play_dead(void) * case where we return around the loop. 
*/ mb(); - clflush(mwait_ptr); + clflush(md); mb(); - __monitor(mwait_ptr, 0, 0); + __monitor(md, 0, 0); mb(); __mwait(eax, 0); - cond_wakeup_cpu0(); + if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { + /* + * Kexec is about to happen. Don't go back into mwait() as + * the kexec kernel might overwrite text and data including + * page tables and stack. So mwait() would resume when the + * monitor cache line is written to and then the CPU goes + * south due to overwritten text, page tables and stack. + * + * Note: This does _NOT_ protect against a stray MCE, NMI, + * SMI. They will resume execution at the instruction + * following the HLT instruction and run into the problem + * which this is trying to prevent. + */ + WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT); + while(1) + native_halt(); + } + } +} + +/* + * Kick all "offline" CPUs out of mwait on kexec(). See comment in + * mwait_play_dead(). + */ +void smp_kick_mwait_play_dead(void) +{ + u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT; + struct mwait_cpu_dead *md; + unsigned int cpu, i; + + for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) { + md = per_cpu_ptr(&mwait_cpu_dead, cpu); + + /* Does it sit in mwait_play_dead() ? */ + if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT) + continue; + + /* Wait up to 5ms */ + for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) { + /* Bring it out of mwait */ + WRITE_ONCE(md->control, newstate); + udelay(5); + } + + if (READ_ONCE(md->status) != newstate) + pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu); } } @@ -1829,11 +1731,8 @@ void __noreturn hlt_play_dead(void) if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); - while (1) { + while (1) native_halt(); - - cond_wakeup_cpu0(); - } } void native_play_dead(void) @@ -1852,12 +1751,6 @@ int native_cpu_disable(void) return -ENOSYS; } -void native_cpu_die(unsigned int cpu) -{ - /* We said "no" in __cpu_disable */ - BUG(); -} - void native_play_dead(void) { BUG(); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 1b83377274b8..ca004e2e4469 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -38,102 +38,12 @@ static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); #ifdef CONFIG_HOTPLUG_CPU - -#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0 -static int cpu0_hotpluggable = 1; -#else -static int cpu0_hotpluggable; -static int __init enable_cpu0_hotplug(char *str) -{ - cpu0_hotpluggable = 1; - return 1; -} - -__setup("cpu0_hotplug", enable_cpu0_hotplug); -#endif - -#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 -/* - * This function offlines a CPU as early as possible and allows userspace to - * boot up without the CPU. The CPU can be onlined back by user after boot. - * - * This is only called for debugging CPU offline/online feature. 
- */ -int _debug_hotplug_cpu(int cpu, int action) +int arch_register_cpu(int cpu) { - int ret; - - if (!cpu_is_hotpluggable(cpu)) - return -EINVAL; + struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu); - switch (action) { - case 0: - ret = remove_cpu(cpu); - if (!ret) - pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); - else - pr_debug("Can't offline CPU%d.\n", cpu); - break; - case 1: - ret = add_cpu(cpu); - if (ret) - pr_debug("Can't online CPU%d.\n", cpu); - - break; - default: - ret = -EINVAL; - } - - return ret; -} - -static int __init debug_hotplug_cpu(void) -{ - _debug_hotplug_cpu(0, 0); - return 0; -} - -late_initcall_sync(debug_hotplug_cpu); -#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ - -int arch_register_cpu(int num) -{ - struct cpuinfo_x86 *c = &cpu_data(num); - - /* - * Currently CPU0 is only hotpluggable on Intel platforms. Other - * vendors can add hotplug support later. - * Xen PV guests don't support CPU0 hotplug at all. - */ - if (c->x86_vendor != X86_VENDOR_INTEL || - cpu_feature_enabled(X86_FEATURE_XENPV)) - cpu0_hotpluggable = 0; - - /* - * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate - * depends on BSP. PIC interrupts depend on BSP. - * - * If the BSP dependencies are under control, one can tell kernel to - * enable BSP hotplug. This basically adds a control file and - * one can attempt to offline BSP. - */ - if (num == 0 && cpu0_hotpluggable) { - unsigned int irq; - /* - * We won't take down the boot processor on i386 if some - * interrupts only are able to be serviced by the BSP in PIC. - */ - for_each_active_irq(irq) { - if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) { - cpu0_hotpluggable = 0; - break; - } - } - } - if (num || cpu0_hotpluggable) - per_cpu(cpu_devices, num).cpu.hotpluggable = 1; - - return register_cpu(&per_cpu(cpu_devices, num).cpu, num); + xc->cpu.hotpluggable = cpu > 0; + return register_cpu(&xc->cpu, cpu); } EXPORT_SYMBOL(arch_register_cpu); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 344698852146..3425c6a943e4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(char *buf) } early_param("tsc_early_khz", tsc_early_khz_setup); -__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +__always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; - preempt_disable_notrace(); - do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; @@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +{ + preempt_disable_notrace(); + __cyc2ns_read(data); +} + __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); @@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
*/ -static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; - cyc2ns_read_begin(&data); + __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_read_end(); + return ns; +} +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = __cycles_2_ns(cyc); + preempt_enable_notrace(); return ns; } @@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void) u64 tsc_now = rdtsc(); /* return the value in ns */ - return cycles_2_ns(tsc_now); + return __cycles_2_ns(tsc_now); } /* @@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -noinstr u64 sched_clock(void) +noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } @@ -260,11 +271,20 @@ bool using_native_sched_clock(void) return static_call_query(pv_sched_clock) == native_sched_clock; } #else -u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif +notrace u64 sched_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = sched_clock_noinstr(); + preempt_enable_notrace(); + return now; +} + int check_tsc_unstable(void) { return tsc_unstable; @@ -1598,10 +1618,7 @@ void __init tsc_init(void) #ifdef CONFIG_SMP /* - * If we have a constant TSC and are using the TSC for the delay loop, - * we can skip clock calibration if another cpu in the same socket has already - * been calibrated. This assumes that CONSTANT_TSC applies to all - * cpus in the socket - this should be a safe assumption. + * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { @@ -1609,6 +1626,21 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); + /* + * If TSC has constant frequency and TSC is synchronized across + * sockets then reuse CPU0 calibration. + */ + if (constant_tsc && !tsc_unstable) + return cpu_data(0).loops_per_jiffy; + + /* + * If TSC has constant frequency and TSC is not synchronized across + * sockets and this is not the first CPU in the socket, then reuse + * the calibration value of an already online CPU on that socket. + * + * This assumes that CONSTANT_TSC is consistent for all CPUs in a + * socket. + */ if (!constant_tsc || !mask) return 0; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9452dc9664b5..bbc440c93e08 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -245,7 +245,6 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) */ static atomic_t start_count; static atomic_t stop_count; -static atomic_t skip_test; static atomic_t test_runs; /* @@ -344,21 +343,14 @@ static inline unsigned int loop_timeout(int cpu) } /* - * Source CPU calls into this - it waits for the freshly booted - * target CPU to arrive and then starts the measurement: + * The freshly booted CPU initiates this via an async SMP function call. 
*/ -void check_tsc_sync_source(int cpu) +static void check_tsc_sync_source(void *__cpu) { + unsigned int cpu = (unsigned long)__cpu; int cpus = 2; /* - * No need to check if we already know that the TSC is not - * synchronized or if we have no TSC. - */ - if (unsynchronized_tsc()) - return; - - /* * Set the maximum number of test runs to * 1 if the CPU does not provide the TSC_ADJUST MSR * 3 if the MSR is available, so the target can try to adjust @@ -368,16 +360,9 @@ void check_tsc_sync_source(int cpu) else atomic_set(&test_runs, 3); retry: - /* - * Wait for the target to start or to skip the test: - */ - while (atomic_read(&start_count) != cpus - 1) { - if (atomic_read(&skip_test) > 0) { - atomic_set(&skip_test, 0); - return; - } + /* Wait for the target to start. */ + while (atomic_read(&start_count) != cpus - 1) cpu_relax(); - } /* * Trigger the target to continue into the measurement too: @@ -397,14 +382,14 @@ retry: if (!nr_warps) { atomic_set(&test_runs, 0); - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n", smp_processor_id(), cpu); } else if (atomic_dec_and_test(&test_runs) || random_warps) { /* Force it to 0 if random warps brought us here */ atomic_set(&test_runs, 0); - pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n", + pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n", smp_processor_id(), cpu); pr_warn("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); @@ -457,11 +442,12 @@ void check_tsc_sync_target(void) * SoCs the TSC is frequency synchronized, but still the TSC ADJUST * register might have been wreckaged by the BIOS.. */ - if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { - atomic_inc(&skip_test); + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) return; - } + /* Kick the control CPU into the TSC synchronization function */ + smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source, + (unsigned long *)(unsigned long)cpu, 0); retry: /* * Register this CPU's participation and wait for the diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 3ac50b7298d1..4d8e518365f4 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -7,6 +7,9 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> +#include <asm/orc_header.h> + +ORC_HEADER; #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 25f155205770..03c885d3640f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -508,4 +508,8 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif +#ifdef CONFIG_RETHUNK +. 
= ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned"); +#endif + #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d82f4fa2f1bf..a37ebd3b4773 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -126,12 +126,13 @@ struct x86_init_ops x86_init __initdata = { struct x86_cpuinit_ops x86_cpuinit = { .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, + .parallel_bringup = true, }; static void default_nmi_init(void) { }; -static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { } -static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } +static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } +static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } static bool is_private_mmio_noop(u64 addr) {return false; } |
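
The tsc.c hunks above split cyc2ns_read_begin() and cycles_2_ns() into preemption-safe wrappers around __cyc2ns_read()/__cycles_2_ns(), but the conversion itself is still the same multiply-shift: ns = cyc2ns_offset + (cyc * cyc2ns_mul) >> cyc2ns_shift. A minimal standalone sketch of that arithmetic (plain user-space C; the mul/shift/offset values are made up for illustration rather than taken from the kernel's per-CPU cyc2ns data, and mul_u64_u32_shr() below is a local stand-in for the kernel helper of the same name):

/*
 * Illustration of the multiply-shift conversion used by cycles_2_ns():
 *   ns = offset + (cyc * mul) >> shift
 * Values here are hypothetical; the kernel derives mul/shift per CPU
 * from the measured TSC frequency.
 */
#include <stdint.h>
#include <stdio.h>

static inline uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned int shift)
{
	/* 128-bit intermediate so the multiply cannot overflow */
	return (uint64_t)(((unsigned __int128)a * mul) >> shift);
}

int main(void)
{
	/* Hypothetical 3 GHz TSC: mul/shift chosen so (cyc * mul) >> shift ~= cyc / 3 */
	uint32_t mul = 1431655765;		/* ~ (1 << 32) / 3 */
	unsigned int shift = 32;
	uint64_t offset = 0;
	uint64_t cyc = 3000000000ULL;		/* one second's worth of cycles at 3 GHz */

	printf("%llu cycles ~= %llu ns\n",
	       (unsigned long long)cyc,
	       (unsigned long long)(offset + mul_u64_u32_shr(cyc, mul, shift)));
	return 0;
}

Only the 128-bit multiply-then-shift pattern is the point; keeping the shift-based scaling (rather than a divide) is what lets native_sched_clock() stay cheap and usable from the noinstr paths touched in this patch.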