From 8b766b0f8eece55155146f7628610ce54a065e0f Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 25 Feb 2022 21:51:34 +0100 Subject: sysfb: Enable boot time VESA graphic mode selection Since switch to simplefb/simpledrm VESA graphic mode selection with vga= kernel parameter is no longer available with legacy BIOS. The x86 realmode boot code enables the VESA graphic modes when option FB_BOOT_VESA_SUPPORT is enabled. This option is selected by vesafb but not simplefb/simpledrm. To enable use of VESA modes with simplefb in legacy BIOS boot mode drop dependency of BOOT_VESA_SUPPORT on FB, also drop the FB_ prefix. Select the option from sysfb rather than the drivers that depend on it. The BOOT_VESA_SUPPORT is not specific to framebuffer but rather to x86 platform, move it from fbdev to x86 Kconfig. Fixes: e3263ab389a7 ("x86: provide platform-devices for boot-framebuffers") Signed-off-by: Michal Suchanek Reviewed-by: Javier Martinez Canillas Acked-by: Borislav Petkov Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/948c39940a4e99f5b43bdbcbe537faae71a43e1d.1645822213.git.msuchanek@suse.de --- arch/x86/Kconfig | 6 ++++++ arch/x86/boot/video-vesa.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9f5bd41bf660..cceb1aab0486 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -942,6 +942,12 @@ config GART_IOMMU If unsure, say Y. +config BOOT_VESA_SUPPORT + bool + help + If true, at least one selected framebuffer driver can take advantage + of VESA video modes set at an early boot stage via the vga= parameter. + config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 7e185977a984..c2c6d35e3a43 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -83,7 +83,7 @@ static int vesa_probe(void) (vminfo.memory_layout == 4 || vminfo.memory_layout == 6) && vminfo.memory_planes == 1) { -#ifdef CONFIG_FB_BOOT_VESA_SUPPORT +#ifdef CONFIG_BOOT_VESA_SUPPORT /* Graphics mode, color, linear frame buffer supported. Only register the mode if if framebuffer is configured, however, @@ -121,7 +121,7 @@ static int vesa_set_mode(struct mode_info *mode) if ((vminfo.mode_attr & 0x15) == 0x05) { /* It's a supported text mode */ is_graphic = 0; -#ifdef CONFIG_FB_BOOT_VESA_SUPPORT +#ifdef CONFIG_BOOT_VESA_SUPPORT } else if ((vminfo.mode_attr & 0x99) == 0x99) { /* It's a graphics mode with linear frame buffer */ is_graphic = 1; -- cgit From 5f1b97cb9af6c5471825b16306ad7da419cda563 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 11 Mar 2022 12:06:38 +0200 Subject: x86/gpu: include drm/i915_pciids.h directly in early quirks early-quirks.c is the only user of drm/i915_drm.h that also needs drm/i915_pciids.h. Include the masses of PCI ID macros only where needed. 
Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Jani Nikula Acked-by: Bjorn Helgaas Link: https://patchwork.freedesktop.org/patch/msgid/20220311100639.114685-1-jani.nikula@intel.com --- arch/x86/kernel/early-quirks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bd6dad83c65b..805596736e20 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit From e87f4152e542610d0b4c6c8548964a68a59d2040 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Mar 2022 20:02:41 +0100 Subject: task_stack, x86/cea: Force-inline stack helpers Force-inline two stack helpers to fix the following objtool warnings: vmlinux.o: warning: objtool: in_task_stack()+0xc: call to task_stack_page() leaves .noinstr.text section vmlinux.o: warning: objtool: in_entry_stack()+0x10: call to cpu_entry_stack() leaves .noinstr.text section Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220324183607.31717-2-bp@alien8.de --- arch/x86/include/asm/cpu_entry_area.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index dd5ea1bdf04c..75efc4c6f076 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -143,7 +143,7 @@ extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); extern struct cpu_entry_area *get_cpu_entry_area(int cpu); -static inline struct entry_stack *cpu_entry_stack(int cpu) +static __always_inline struct entry_stack *cpu_entry_stack(int cpu) { return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } -- cgit From 6b91ec4ad290a808a894e7d3448eb82d0b8ef8d5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Mar 2022 21:52:44 +0100 Subject: x86/kvm/svm: Force-inline GHCB accessors In order to fix: vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0x4c: call to ghcb_set_sw_exit_code() leaves .noinstr.text section Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220324183607.31717-3-bp@alien8.de --- arch/x86/include/asm/svm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f70a5108d464..e255039b36b6 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -444,23 +444,23 @@ struct vmcb { (offsetof(struct vmcb_save_area, field) / sizeof(u64)) #define DEFINE_GHCB_ACCESSORS(field) \ - static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ + static __always_inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ { \ return test_bit(GHCB_BITMAP_IDX(field), \ (unsigned long *)&ghcb->save.valid_bitmap); \ } \ \ - static inline u64 ghcb_get_##field(struct ghcb *ghcb) \ + static __always_inline u64 ghcb_get_##field(struct ghcb *ghcb) \ { \ return ghcb->save.field; \ } \ \ - static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ + static __always_inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ { \ return ghcb_##field##_is_valid(ghcb) ? 
ghcb->save.field : 0; \ } \ \ - static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ + static __always_inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ { \ __set_bit(GHCB_BITMAP_IDX(field), \ (unsigned long *)&ghcb->save.valid_bitmap); \ -- cgit From ace1a98519270c586c0d4179419292df67441cd1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Mar 2022 23:24:12 +0100 Subject: x86/mm: Force-inline __phys_addr_nodebug() Fix: vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0x8b: call to __phys_addr_nodebug() leaves .noinstr.text section Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220324183607.31717-4-bp@alien8.de --- arch/x86/include/asm/page_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index e9c86299b835..baa70451b8df 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -16,7 +16,7 @@ extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; -static inline unsigned long __phys_addr_nodebug(unsigned long x) +static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { unsigned long y = x - __START_KERNEL_map; -- cgit From 1625c833db93516faaac5feedadf8d19c14238b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:21 +0100 Subject: x86/cpu: Allow feature bit names from /proc/cpuinfo in clearcpuid= Having to give the X86_FEATURE array indices in order to disable a feature bit for testing is not really user-friendly. So accept the feature bit names too. Some feature bits don't have names so there the array indices are still accepted, of course. Clearing CPUID flags is not something which should be done in production so taint the kernel too. An exemplary cmdline would then be something like: clearcpuid=de,440,smca,succory,bmi1,3dnow ("succory" is wrong on purpose). And it says: [ ... 
] Clearing CPUID bits: de 13:24 smca (unknown: succory) bmi1 3dnow [ Fix CONFIG_X86_FEATURE_NAMES=n build error as reported by the 0day robot: https://lore.kernel.org/r/202203292206.ICsY2RKX-lkp@intel.com ] Signed-off-by: Borislav Petkov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-2-bp@alien8.de --- arch/x86/include/asm/cpufeature.h | 7 +++-- arch/x86/kernel/cpu/common.c | 64 +++++++++++++++++++++++++++++++-------- 2 files changed, 57 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1261842d006c..66d3e3b1d24d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -34,14 +34,17 @@ enum cpuid_leafs CPUID_8000_001F_EAX, }; +#define X86_CAP_FMT_NUM "%d:%d" +#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) + #ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] #else -#define X86_CAP_FMT "%d:%d" -#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31) +#define X86_CAP_FMT X86_CAP_FMT_NUM +#define x86_cap_flag x86_cap_flag_num #endif /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ed4417500700..69c7ea84b005 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1368,8 +1368,8 @@ static void detect_nopl(void) static void __init cpu_parse_early_param(void) { char arg[128]; - char *argptr = arg; - int arglen, res, bit; + char *argptr = arg, *opt; + int arglen, taint = 0; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1397,21 +1397,61 @@ static void __init cpu_parse_early_param(void) return; pr_info("Clearing CPUID bits:"); - do { - res = get_option(&argptr, &bit); - if (res == 0 || res == 3) - break; - /* If the argument was too long, the last bit may be cut off */ - if (res == 1 && arglen >= sizeof(arg)) - break; + while (argptr) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&argptr, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + +#ifdef CONFIG_X86_FEATURE_NAMES + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + else +#endif + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + + setup_clear_cpu_cap(bit); + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. 
+ */ + continue; + } + +#ifdef CONFIG_X86_FEATURE_NAMES + for (bit = 0; bit < 32 * NCAPINTS; bit++) { + if (!x86_cap_flag(bit)) + continue; - if (bit >= 0 && bit < NCAPINTS * 32) { - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + if (strcmp(x86_cap_flag(bit), opt)) + continue; + + pr_cont(" %s", opt); setup_clear_cpu_cap(bit); + taint++; + found = true; + break; } - } while (res == 2); + + if (!found) + pr_cont(" (unknown: %s)", opt); +#endif + } pr_cont("\n"); + + if (taint) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); } /* -- cgit From c949110ef4e31cb5d3387bd8273fd5de66b5227b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:22 +0100 Subject: x86/cpu: Remove "nosep" That chicken bit was added by 4f88651125e2 ("[PATCH] i386: allow disabling X86_FEATURE_SEP at boot") but measuring int80 vsyscall performance on 32-bit doesn't matter anymore. If still needed, one can boot with clearcpuid=sep to disable that feature for testing. Signed-off-by: Borislav Petkov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-3-bp@alien8.de --- arch/x86/kernel/cpu/common.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 69c7ea84b005..c71d1075db93 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -298,13 +298,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - /* Standard macro to see if a specific flag is changeable */ static inline int flag_is_changeable_p(u32 flag) { -- cgit From dbae0a934f09208075ec3e73491bd0844e1397b3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:23 +0100 Subject: x86/cpu: Remove CONFIG_X86_SMAP and "nosmap" Those were added as part of the SMAP enablement but SMAP is currently an integral part of kernel proper and there's no need to disable it anymore. Rip out that functionality. Leave --uaccess default on for objtool as this is what objtool should do by default anyway. If still needed - clearcpuid=smap. Signed-off-by: Borislav Petkov Reviewed-by: Lai Jiangshan Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-4-bp@alien8.de --- arch/x86/Kconfig | 11 ----------- arch/x86/include/asm/disabled-features.h | 8 +------- arch/x86/include/asm/smap.h | 24 ------------------------ arch/x86/kernel/cpu/common.c | 15 +-------------- 4 files changed, 2 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..5bc8bee64bb0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1816,17 +1816,6 @@ config ARCH_RANDOM If supported, this is a high bandwidth, cryptographically secure hardware random number generator. -config X86_SMAP - def_bool y - prompt "Supervisor Mode Access Prevention" if EXPERT - help - Supervisor Mode Access Prevention (SMAP) is a security - feature in newer Intel processors. There is a small - performance cost if this enabled and turned on; there is - also a small increase in the kernel size if this is enabled. - - If unsure, say Y. 
- config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 1231d63f836d..1ae0fab7d902 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -10,12 +10,6 @@ * cpu_feature_enabled(). */ -#ifdef CONFIG_X86_SMAP -# define DISABLE_SMAP 0 -#else -# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) -#endif - #ifdef CONFIG_X86_UMIP # define DISABLE_UMIP 0 #else @@ -80,7 +74,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) +#define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index d17b39893b79..bab490379c65 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -19,25 +19,14 @@ #ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_SMAP - #define ASM_CLAC \ ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP #define ASM_STAC \ ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP -#else /* CONFIG_X86_SMAP */ - -#define ASM_CLAC -#define ASM_STAC - -#endif /* CONFIG_X86_SMAP */ - #else /* __ASSEMBLY__ */ -#ifdef CONFIG_X86_SMAP - static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ @@ -76,19 +65,6 @@ static __always_inline void smap_restore(unsigned long flags) #define ASM_STAC \ ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) -#else /* CONFIG_X86_SMAP */ - -static inline void clac(void) { } -static inline void stac(void) { } - -static inline unsigned long smap_save(void) { return 0; } -static inline void smap_restore(unsigned long flags) { } - -#define ASM_CLAC -#define ASM_STAC - -#endif /* CONFIG_X86_SMAP */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMAP_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c71d1075db93..747df070fb5e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -382,13 +382,6 @@ static __always_inline void setup_smep(struct cpuinfo_x86 *c) cr4_set_bits(X86_CR4_SMEP); } -static __init int setup_disable_smap(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMAP); - return 1; -} -__setup("nosmap", setup_disable_smap); - static __always_inline void setup_smap(struct cpuinfo_x86 *c) { unsigned long eflags = native_save_fl(); @@ -396,14 +389,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) /* This should have been cleared long ago */ BUG_ON(eflags & X86_EFLAGS_AC); - if (cpu_has(c, X86_FEATURE_SMAP)) { -#ifdef CONFIG_X86_SMAP + if (cpu_has(c, X86_FEATURE_SMAP)) cr4_set_bits(X86_CR4_SMAP); -#else - clear_cpu_cap(c, X86_FEATURE_SMAP); - cr4_clear_bits(X86_CR4_SMAP); -#endif - } } static __always_inline void setup_umip(struct cpuinfo_x86 *c) -- cgit From 385d2ae0a1b5efacb30e13a0f0e521490441d9bb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:24 +0100 Subject: x86/cpu: Remove "nosmep" There should be no need to disable SMEP anymore. 
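If disabling SMEP is still needed for testing, the generic switch from this series can be used instead; assuming the usual /proc/cpuinfo spelling of the flag, an exemplary cmdline would be:

  clearcpuid=smep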
Signed-off-by: Borislav Petkov Reviewed-by: Lai Jiangshan Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-5-bp@alien8.de --- arch/x86/kernel/cpu/common.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 747df070fb5e..5791f692d67e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -369,13 +369,6 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static __init int setup_disable_smep(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMEP); - return 1; -} -__setup("nosmep", setup_disable_smep); - static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) -- cgit From 76ea0025a214cdf0d2c204f4c21cbffa9fb57c32 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:25 +0100 Subject: x86/cpu: Remove "noexec" It doesn't make any sense to disable non-executable mappings - security-wise or else. So rip out that switch and move the remaining code into setup.c and delete setup_nx.c Signed-off-by: Borislav Petkov Reviewed-by: Lai Jiangshan Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-6-bp@alien8.de --- arch/x86/include/asm/proto.h | 1 - arch/x86/kernel/setup.c | 28 +++++++++++++++++--- arch/x86/mm/Makefile | 3 +-- arch/x86/mm/init_64.c | 1 - arch/x86/mm/setup_nx.c | 62 -------------------------------------------- 5 files changed, 26 insertions(+), 69 deletions(-) delete mode 100644 arch/x86/mm/setup_nx.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index feed36d44d04..0f899c8d7a4e 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -35,7 +35,6 @@ void xen_entry_INT80_compat(void); #endif void x86_configure_nx(void); -void x86_report_nx(void); extern int reboot_force; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c95b9ac5a457..249981bf3d8a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -756,6 +756,30 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +static void __init x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -896,9 +920,7 @@ void __init setup_arch(char **cmdline_p) /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. + * console setup can safely call set_fixmap()). 
*/ x86_configure_nx(); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fe3d3061fc11..d957dc15b371 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,13 +20,12 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ - pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o + pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o obj-y += pat/ # Make sure __phys_addr has no stackprotector CFLAGS_physaddr.o := -fno-stack-protector -CFLAGS_setup_nx.o := -fno-stack-protector CFLAGS_mem_encrypt_identity.o := -fno-stack-protector CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 96d34ebb20a9..d2e484efdfa1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -110,7 +110,6 @@ int force_personality32; /* * noexec32=on|off * Control non executable heap for 32bit processes. - * To control the stack too use noexec=off * * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) * off PROT_READ implies PROT_EXEC diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c deleted file mode 100644 index ed5667f5169f..000000000000 --- a/arch/x86/mm/setup_nx.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -#include -#include - -static int disable_nx; - -/* - * noexec = on|off - * - * Control non-executable mappings for processes. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - } - x86_configure_nx(); - return 0; -} -early_param("noexec", noexec_setup); - -void x86_configure_nx(void) -{ - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else - __supported_pte_mask &= ~_PAGE_NX; -} - -void __init x86_report_nx(void) -{ - if (!boot_cpu_has(X86_FEATURE_NX)) { - printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU!\n"); - } else { -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) - if (disable_nx) { - printk(KERN_INFO "NX (Execute Disable) protection: " - "disabled by kernel command line option\n"); - } else { - printk(KERN_INFO "NX (Execute Disable) protection: " - "active\n"); - } -#else - /* 32bit non-PAE kernel, NX cannot be used */ - printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "cannot be enabled: non-PAE kernel!\n"); -#endif - } -} -- cgit From f8858b5eff30d1b2be15ef1ea6285964013b95e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:26 +0100 Subject: x86/cpu: Remove "noclflush" Not really needed anymore and there's clearcpuid=. 
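Assuming the /proc/cpuinfo spellings "clflush" and "clflushopt" for the two feature bits the removed option used to clear, an exemplary replacement cmdline for testing would be:

  clearcpuid=clflush,clflushopt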
Signed-off-by: Borislav Petkov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-7-bp@alien8.de --- arch/x86/kernel/cpu/common.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5791f692d67e..7dd7604cf46a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1870,14 +1870,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) update_srbds_msr(); } -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); - setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); - return 1; -} -__setup("noclflush", setup_noclflush); - void print_cpu_info(struct cpuinfo_x86 *c) { const char *vendor = NULL; -- cgit From 93d256cd3c1e93c4093e8015b371e832de4c4146 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 3 Mar 2022 18:04:43 -0600 Subject: x86/PCI: Eliminate remove_e820_regions() common subexpressions Add local variables to reduce repetition later. No functional change intended. Link: https://lore.kernel.org/r/20220304035110.988712-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Hans de Goede Reviewed-by: Mika Westerberg Acked-by: Rafael J. Wysocki --- arch/x86/kernel/resource.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 9b9fb7882c20..8ffe68437744 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -27,12 +27,14 @@ static void remove_e820_regions(struct resource *avail) { int i; struct e820_entry *entry; + u64 e820_start, e820_end; for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; + e820_start = entry->addr; + e820_end = entry->addr + entry->size - 1; - resource_clip(avail, entry->addr, - entry->addr + entry->size - 1); + resource_clip(avail, e820_start, e820_end); } } -- cgit From 944fad4583bc8a6d7dd80fbe39db50141da95793 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 1 Feb 2022 15:40:55 +0100 Subject: x86/fault: Cast an argument to the proper address space in prefetch() Commit in Fixes uses accessors based on the access mode, i.e., it distinguishes its access if instr carries a user address or a kernel address. Since that commit, sparse complains about passing an argument without __user annotation to get_user(), which expects a pointer of the __user address space: arch/x86/mm/fault.c:152:29: warning: incorrect type in argument 1 (different address spaces) arch/x86/mm/fault.c:152:29: expected void const volatile [noderef] __user *ptr arch/x86/mm/fault.c:152:29: got unsigned char *[assigned] instr Cast instr to __user when accessing user memory. No functional change. No change in the generated object code. [ bp: Simplify commit message. 
] Fixes: 35f1c89b0cce ("x86/fault: Fix AMD erratum #91 errata fixup for user code") Signed-off-by: Lukas Bulwahn Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220201144055.5670-1-lukas.bulwahn@gmail.com --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d0074c6ed31a..fad8faa29d04 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -149,7 +149,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) unsigned char opcode; if (user_mode(regs)) { - if (get_user(opcode, instr)) + if (get_user(opcode, (unsigned char __user *) instr)) break; } else { if (get_kernel_nofault(opcode, instr)) -- cgit From 70431c63d7ed31baf18b0083ba5473274363a174 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 31 Mar 2022 11:05:54 -0700 Subject: x86/pkeys: Clean up arch_set_user_pkey_access() declaration arch_set_user_pkey_access() was declared two times in the header. Remove the 2nd declaration. Suggested-by: "Edgecombe, Rick P" Signed-off-by: Ira Weiny Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220331180554.2945884-1-ira.weiny@intel.com --- arch/x86/include/asm/pkeys.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 1d5f14aff5f6..9c530530b9a7 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -118,8 +118,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey) return 0; } -extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); -- cgit From 5a0893088a20252cc268cbeabb25e883c2b6f94f Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 31 Mar 2022 11:06:55 -0700 Subject: x86/pkeys: Remove __arch_set_user_pkey_access() declaration In the x86 code __arch_set_user_pkey_access() is not used and is not defined. Remove the dead declaration. Signed-off-by: Ira Weiny Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220331180655.2946086-1-ira.weiny@intel.com --- arch/x86/include/asm/pkeys.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 9c530530b9a7..2e6c04d8a45b 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -41,9 +41,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, return __arch_override_mprotect_pkey(vma, prot, pkey); } -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) @@ -118,9 +115,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey) return 0; } -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | -- cgit From bfe4daf850f45d92dcd3da477f0b0456620294c3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:05 -0700 Subject: perf/core: Add perf_clear_branch_entry_bitfields() helper Make it simpler to reset all the info fields on the perf_branch_entry by adding a helper inline function. 
The goal is to centralize the initialization to avoid missing a field in case more are added. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-2-eranian@google.com --- arch/x86/events/intel/lbr.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index fe1742c4ca49..13179f31fe10 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -769,6 +769,7 @@ void intel_pmu_lbr_disable_all(void) void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -784,15 +785,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; cpuc->lbr_stack.hw_idx = tos; @@ -807,6 +804,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -878,15 +876,14 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; @@ -951,6 +948,8 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, to = rdlbr_to(i, lbr); info = rdlbr_info(i, lbr); + perf_clear_branch_entry_bitfields(e); + e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); @@ -959,7 +958,6 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = get_lbr_br_type(info); - e->reserved = 0; } cpuc->lbr_stack.nr = i; -- cgit From a77d41ac3a0f41c80120ec5b8b08ab284fec950a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:06 -0700 Subject: x86/cpufeatures: Add AMD Fam19h Branch Sampling feature Add a cpu feature for AMD Fam19h Branch Sampling feature as bit 31 of EBX on CPUID leaf function 0x80000008. 
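As an illustrative sketch only (not part of the patch), the new bit is meant to be consumed through the usual accessor, e.g.:

	if (boot_cpu_has(X86_FEATURE_BRS))
		pr_info("AMD Fam19h Branch Sampling supported\n");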
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-3-eranian@google.com --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..0d62afd525e3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -315,6 +315,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -- cgit From ada543459cab7f653dcacdaba4011a8bb19c627c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:07 -0700 Subject: perf/x86/amd: Add AMD Fam19h Branch Sampling support Add support for the AMD Fam19h 16-deep branch sampling feature as described in the AMD PPR Fam19h Model 01h Revision B1. This is a model specific extension. It is not an architected AMD feature. The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR registers. There is no branch type filtering. All control flow changes are captured. BRS relies on specific programming of the core PMU of Fam19h. In particular, the following requirements must be met: - the sampling period be greater than 16 (BRS depth) - the sampling period must use a fixed and not frequency mode BRS interacts with the NMI interrupt as well. Because enabling BRS is expensive, it is only activated after P event occurrences, where P is the desired sampling period. At P occurrences of the event, the counter overflows, the CPU catches the interrupt, activates BRS for 16 branches until it saturates, and then delivers the NMI to the kernel. Between the overflow and the time BRS activates more branches may be executed skewing the period. All along, the sampling event keeps counting. The skid may be attenuated by reducing the sampling period by 16 (subsequent patch). BRS is integrated into perf_events seamlessly via the same PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry records in the sampling buffer. No prediction information is supported. The branches are stored in reverse order of execution. The most recent branch is the first entry in each record. No modification to the perf tool is necessary. BRS can be used with any sampling event. However, it is recommended to use the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS captures. $ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test $ perf report -D 56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0 ... branch stack: nr:16 ..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0 ..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0 ..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0 ..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0 ..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0 ..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0 ..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0 ..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0 ..... 
8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0 ..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0 ..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0 ..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0 ..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0 ..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0 ..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0 ..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0 ... thread: test:18230 ...... dso: test Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/brs.c | 317 +++++++++++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 233 +++++++++++++++++++++++++++- arch/x86/events/core.c | 10 +- arch/x86/events/perf_event.h | 101 +++++++++++-- arch/x86/include/asm/msr-index.h | 4 + 6 files changed, 645 insertions(+), 22 deletions(-) create mode 100644 arch/x86/events/amd/brs.c (limited to 'arch/x86') diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index 6cbe38d5fd9d..cf323ffab5cd 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..3c13c484c637 --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian + */ +#include +#include +#include + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!boot_cpu_has(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. 
Software filtering is not supported because it would not correlate well + * with a sampling period. + */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + /* can only capture at all priv levels due to the way BRS works */ + if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. 
+ * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + rdmsrl(brs_from(brs_idx), from); + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9687a8aef01c..c7ac70d8ed9a 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -325,8 +325,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -343,7 +351,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. 
+ */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -366,7 +433,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -555,6 +622,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_brs_reset(); } static void amd_pmu_cpu_dead(int cpu) @@ -610,6 +679,8 @@ static void amd_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; + amd_brs_disable_all(); + x86_pmu_disable_all(); /* @@ -634,6 +705,30 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + hwc = &cpuc->events[idx]->hw; + + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +746,18 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -671,11 +778,31 @@ static void amd_pmu_disable_event(struct perf_event *event) */ static int amd_pmu_handle_irq(struct pt_regs *regs) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int handled; + int pmu_enabled; + + /* + * Save the PMU 
state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS is in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. @@ -897,6 +1024,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with an inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by chosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? 
&amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -905,12 +1077,19 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -920,6 +1099,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -938,6 +1119,37 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { u64 even_ctr_mask = 0ULL; @@ -989,6 +1201,19 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. 
+ */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + /* + * put_event_constraints callback same as Fam17h, set above + */ + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index eef816fc216d..7ada9172b074 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1338,6 +1338,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1704,11 +1708,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 150261d929b9..6f1265163c9f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -67,22 +67,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ - -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ + +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 
0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -325,6 +326,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -1105,6 +1108,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -1210,6 +1218,50 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); + +/* + * check if BRS is activated on the CPU + * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 + */ +static inline bool amd_brs_active(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return cpuc->brs_active; +} #else /* CONFIG_CPU_SUP_AMD */ @@ -1218,6 +1270,23 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} + #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0eb90d21049e..8179ea351bd8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -688,6 +688,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 -- cgit From 44175993efbae04e8b2d7f7795ff512c3a726db0 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:08 -0700 Subject: perf/x86/amd: Add branch-brs helper event for Fam19h BRS Add a pseudo event called branch-brs to help use the FAM Fam19h Branch Sampling feature (BRS). BRS samples taken branches, so it is best used when sampling on a retired taken branch event (0xc4) which is what BRS captures. 
Instead of trying to remember the event code or actual event name, users can simply do: $ perf record -b -e cpu/branch-brs/ -c 1000037 ..... Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-5-eranian@google.com --- arch/x86/events/amd/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index c7ac70d8ed9a..f7bce8364fe4 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1145,8 +1145,23 @@ static struct attribute_group group_caps_amd_brs = { .is_visible = amd_brs_is_visible, }; +EVENT_ATTR_STR(branch-brs, amd_branch_brs, + "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); + +static struct attribute *amd_brs_events_attrs[] = { + EVENT_PTR(amd_branch_brs), + NULL, +}; + +static struct attribute_group group_events_amd_brs = { + .name = "events", + .attrs = amd_brs_events_attrs, + .is_visible = amd_brs_is_visible, +}; + static const struct attribute_group *amd_attr_update[] = { &group_caps_amd_brs, + &group_events_amd_brs, NULL, }; -- cgit From 8910075d61a37e5b0d82e6c83ed9a0a31fe9ea08 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:09 -0700 Subject: perf/x86/amd: Enable branch sampling priv level filtering The AMD Branch Sampling features does not provide hardware filtering by privilege level. The associated PMU counter does but not the branch sampling by itself. Given how BRS operates there is a possibility that BRS captures kernel level branches even though the event is programmed to count only at the user level. Implement a workaround in software by removing the branches which belong to the wrong privilege level. The privilege level is evaluated on the target of the branch and not the source so as to be compatible with other architectures. As a consequence of this patch, the number of entries in the PERF_RECORD_BRANCH_STACK buffer may be less than the maximum (16). It could even be zero. Another consequence is that consecutive entries in the branch stack may not reflect actual code path and may have discontinuities, in case kernel branches were suppressed. But this is no different than what happens on other architectures. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-6-eranian@google.com --- arch/x86/events/amd/brs.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 3c13c484c637..40461c3ce714 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -92,10 +92,6 @@ int amd_brs_setup_filter(struct perf_event *event) if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) return -EINVAL; - /* can only capture at all priv levels due to the way BRS works */ - if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) - return -EINVAL; - return 0; } @@ -195,6 +191,21 @@ void amd_brs_disable_all(void) amd_brs_disable(); } +static bool amd_brs_match_plm(struct perf_event *event, u64 to) +{ + int type = event->attr.branch_sample_type; + int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV; + int plm_u = PERF_SAMPLE_BRANCH_USER; + + if (!(type & plm_k) && kernel_ip(to)) + return 0; + + if (!(type & plm_u) && !kernel_ip(to)) + return 0; + + return 1; +} + /* * Caller must ensure amd_brs_inuse() is true before calling * return: @@ -252,8 +263,6 @@ void amd_brs_drain(void) if (to == BRS_POISON) break; - rdmsrl(brs_from(brs_idx), from); - /* * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. * Necessary to generate proper virtual addresses suitable for @@ -261,6 +270,11 @@ void amd_brs_drain(void) */ to = (u64)(((s64)to << shift) >> shift); + if (!amd_brs_match_plm(event, to)) + continue; + + rdmsrl(brs_from(brs_idx), from); + perf_clear_branch_entry_bitfields(br+nr); br[nr].from = from; -- cgit From ba2fe7500845a30fc845a72081999cf632051862 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:10 -0700 Subject: perf/x86/amd: Add AMD branch sampling period adjustment Add code to adjust the sampling event period when used with the Branch Sampling feature (BRS). Given the depth of the BRS (16), the period is reduced by that depth such that in the best case scenario, BRS saturates at the desired sampling period. In practice, though, the processor may execute more branches. Given a desired period P and a depth D, the kernel programs the actual period at P - D. After P occurrences of the sampling event, the counter overflows. It then may take X branches (skid) before the NMI is caught and held by the hardware and BRS activates. Then, after D branches, BRS saturates and the NMI is delivered. With no skid, the effective period would be (P - D) + D = P. In practice, however, it will likely be (P - D) + X + D. There is no way to eliminate X or predict X. 
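As a rough illustration of the arithmetic above (only the P - D adjustment itself comes from this patch; the concrete period and skid values below are made up):

  /* Illustration only: the P/D/X terms from the description above. */
  #include <stdio.h>

  int main(void)
  {
          long long P = 1000037;  /* requested sampling period */
          long long D = 16;       /* BRS depth: branches needed to saturate */
          long long X = 10;       /* NMI skid, unknown and unpredictable */

          long long programmed = P - D;           /* period the kernel programs */
          long long effective  = (P - D) + X + D; /* period actually observed */

          printf("programmed=%lld effective=%lld\n", programmed, effective);
          return 0;
  }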
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-7-eranian@google.com --- arch/x86/events/core.c | 7 +++++++ arch/x86/events/perf_event.h | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7ada9172b074..54f992e65252 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1374,6 +1374,13 @@ int x86_perf_event_set_period(struct perf_event *event) x86_pmu.set_topdown_event_period) return x86_pmu.set_topdown_event_period(event); + /* + * decrease period by the depth of the BRS feature to get + * the last N taken branches and approximate the desired period + */ + if (has_branch_stack(event)) + period = amd_brs_adjust_period(period); + /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6f1265163c9f..d91ff2c6cefe 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1263,6 +1263,14 @@ static inline bool amd_brs_active(void) return cpuc->brs_active; } +static inline s64 amd_brs_adjust_period(s64 period) +{ + if (period > x86_pmu.lbr_nr) + return period - x86_pmu.lbr_nr; + + return period; +} + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) @@ -1287,6 +1295,10 @@ static inline void amd_brs_disable_all(void) { } +static inline s64 amd_brs_adjust_period(s64 period) +{ + return period; +} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) -- cgit From cc37e520a236069c0de0e7ea455082fa11c73b12 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:11 -0700 Subject: perf/x86/amd: Make Zen3 branch sampling opt-in Add a kernel config option CONFIG_PERF_EVENTS_AMD_BRS to make the support for AMD Zen3 Branch Sampling (BRS) an opt-in compile time option. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-8-eranian@google.com --- arch/x86/events/Kconfig | 8 ++++++++ arch/x86/events/amd/Makefile | 3 ++- arch/x86/events/perf_event.h | 49 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 49 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index d6cdfe631674..09c56965750a 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -44,4 +44,12 @@ config PERF_EVENTS_AMD_UNCORE To compile this driver as a module, choose M here: the module will be called 'amd-uncore'. + +config PERF_EVENTS_AMD_BRS + depends on PERF_EVENTS && CPU_SUP_AMD + bool "AMD Zen3 Branch Sampling support" + help + Enable AMD Zen3 branch sampling support (BRS) which samples up to + 16 consecutive taken branches in registers. 
+ endmenu diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index cf323ffab5cd..b9f5d4610256 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o brs.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index d91ff2c6cefe..ef27aee04b13 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1218,6 +1218,8 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); + +#ifdef CONFIG_PERF_EVENTS_AMD_BRS int amd_brs_init(void); void amd_brs_disable(void); void amd_brs_enable(void); @@ -1252,25 +1254,52 @@ static inline void amd_pmu_brs_del(struct perf_event *event) void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); -/* - * check if BRS is activated on the CPU - * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 - */ -static inline bool amd_brs_active(void) +static inline s64 amd_brs_adjust_period(s64 period) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (period > x86_pmu.lbr_nr) + return period - x86_pmu.lbr_nr; - return cpuc->brs_active; + return period; +} +#else +static inline int amd_brs_init(void) +{ + return 0; } +static inline void amd_brs_disable(void) {} +static inline void amd_brs_enable(void) {} +static inline void amd_brs_drain(void) {} +static inline void amd_brs_lopwr_init(void) {} +static inline void amd_brs_disable_all(void) {} +static inline int amd_brs_setup_filter(struct perf_event *event) +{ + return 0; +} +static inline void amd_brs_reset(void) {} -static inline s64 amd_brs_adjust_period(s64 period) +static inline void amd_pmu_brs_add(struct perf_event *event) { - if (period > x86_pmu.lbr_nr) - return period - x86_pmu.lbr_nr; +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ +} +static inline s64 amd_brs_adjust_period(s64 period) +{ return period; } +static inline void amd_brs_enable_all(void) +{ +} + +#endif + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) -- cgit From d5616bac7adadbf42a3b63b8717e75eb82a2cc2c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:13 -0700 Subject: perf/x86/amd: Add idle hooks for branch sampling On AMD Fam19h Zen3, the branch sampling (BRS) feature must be disabled before entering low power and re-enabled (if was active) when returning from low power. Otherwise, the NMI interrupt may be held up for too long and cause problems. Stopping BRS will cause the NMI to be delivered if it was held up. Define a perf_amd_brs_lopwr_cb() callback to stop/restart BRS. The callback is protected by a jump label which is enabled only when AMD BRS is detected. In all other cases, the callback is never called. 
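A sketch of the expected call site follows; the ACPI-side changes are not part of this patch, and example_idle_enter() and the MWAIT hints are only placeholders:

  #include <linux/perf_event.h>
  #include <asm/mwait.h>

  static void example_idle_enter(unsigned long eax, unsigned long ecx)
  {
  #ifdef PERF_NEEDS_LOPWR_CB
          perf_lopwr_cb(true);             /* stop BRS, let a held-up NMI fire */
  #endif
          mwait_idle_with_hints(eax, ecx); /* enter the low-power state */
  #ifdef PERF_NEEDS_LOPWR_CB
          perf_lopwr_cb(false);            /* restart BRS if it was active */
  #endif
  }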
Signed-off-by: Stephane Eranian [peterz: static_call() and build fixes] Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-10-eranian@google.com --- arch/x86/events/amd/brs.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 4 ++++ arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 23 +++++++++++++++++++++++ 4 files changed, 61 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 40461c3ce714..895c82165d85 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -7,6 +7,7 @@ * Contributed by Stephane Eranian */ #include +#include #include #include @@ -329,3 +330,35 @@ void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) if (sched_in) amd_brs_poison_buffer(); } + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. + * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f7bce8364fe4..8e1e818f8195 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -1225,6 +1226,9 @@ static int __init amd_core_pmu_init(void) /* * put_event_constraints callback same as Fam17h, set above */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); } x86_pmu.attr_update = amd_attr_update; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ef27aee04b13..3b0324584da3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1226,6 +1226,7 @@ void amd_brs_enable(void); void amd_brs_enable_all(void); void amd_brs_disable_all(void); void amd_brs_drain(void); +void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); int amd_brs_setup_filter(struct perf_event *event); void amd_brs_reset(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 58d9e4b1fa0a..8199fc5a37ea 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PERF_EVENT_H #define _ASM_X86_PERF_EVENT_H +#include + /* * Performance event hw details: */ @@ -513,6 +515,27 @@ static inline void intel_pt_handle_vmx(int on) #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) extern void amd_pmu_enable_virt(void); extern void amd_pmu_disable_virt(void); + +#if defined(CONFIG_PERF_EVENTS_AMD_BRS) + +#define PERF_NEEDS_LOPWR_CB 1 + +/* + * architectural low power callback impacts + * drivers/acpi/processor_idle.c + * drivers/acpi/acpi_pad.c + */ +extern void perf_amd_brs_lopwr_cb(bool lopwr_in); + +DECLARE_STATIC_CALL(perf_lopwr_cb, 
perf_amd_brs_lopwr_cb); + +static inline void perf_lopwr_cb(bool lopwr_in) +{ + static_call_mod(perf_lopwr_cb)(lopwr_in); +} + +#endif /* PERF_NEEDS_LOPWR_CB */ + #else static inline void amd_pmu_enable_virt(void) { } static inline void amd_pmu_disable_virt(void) { } -- cgit From 7bebfe9dd802b80abff5a43e00ab68d98893a22c Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Thu, 24 Mar 2022 11:19:57 +0800 Subject: perf/x86: Unify format of events sysfs show Sysfs show formats of files in /sys/devices/cpu/events/ are not unified, some end with "\n", and some do not. Modify sysfs show format of events defined by EVENT_ATTR_STR to end with "\n". Before: $ ls /sys/devices/cpu/events/* | xargs -i sh -c 'echo -n "{}: "; cat -A {}; echo' branch-instructions: event=0xc4$ branch-misses: event=0xc5$ bus-cycles: event=0x3c,umask=0x01$ cache-misses: event=0x2e,umask=0x41$ cache-references: event=0x2e,umask=0x4f$ cpu-cycles: event=0x3c$ instructions: event=0xc0$ ref-cycles: event=0x00,umask=0x03$ slots: event=0x00,umask=0x4 topdown-bad-spec: event=0x00,umask=0x81 topdown-be-bound: event=0x00,umask=0x83 topdown-fe-bound: event=0x00,umask=0x82 topdown-retiring: event=0x00,umask=0x80 After: $ ls /sys/devices/cpu/events/* | xargs -i sh -c 'echo -n "{}: "; cat -A {}; echo' /sys/devices/cpu/events/branch-instructions: event=0xc4$ /sys/devices/cpu/events/branch-misses: event=0xc5$ /sys/devices/cpu/events/bus-cycles: event=0x3c,umask=0x01$ /sys/devices/cpu/events/cache-misses: event=0x2e,umask=0x41$ /sys/devices/cpu/events/cache-references: event=0x2e,umask=0x4f$ /sys/devices/cpu/events/cpu-cycles: event=0x3c$ /sys/devices/cpu/events/instructions: event=0xc0$ /sys/devices/cpu/events/ref-cycles: event=0x00,umask=0x03$ /sys/devices/cpu/events/slots: event=0x00,umask=0x4$ Signed-off-by: Yang Jihong Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220324031957.135595-1-yangjihong1@huawei.com --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 54f992e65252..b08052b05db6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1852,7 +1852,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha /* string trumps id */ if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); + return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } -- cgit From 046f773be106ec8eb92b13414c90f8e279deffe0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:11 -0600 Subject: KVM: SVM: Define sev_features and VMPL field in the VMSA The hypervisor uses the sev_features field (offset 3B0h) in the Save State Area to control the SEV-SNP guest features such as SNPActive, vTOM, ReflectVC etc. An SEV-SNP guest can read the sev_features field through the SEV_STATUS MSR. While at it, update dump_vmcb() to log the VMPL level. See APM2 Table 15-34 and B-4 for more details. 
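For example, a guest-side check for the SNPActive feature could look like the sketch below. It is illustrative only: the bit layout follows the APM's SEV_STATUS definition, and BIT_ULL(2) is spelled out rather than relying on a named MSR bit macro.

  #include <linux/bits.h>
  #include <asm/msr.h>
  #include <asm/msr-index.h>

  static bool example_snp_active(void)
  {
          u64 status;

          rdmsrl(MSR_AMD64_SEV, status);  /* SEV_STATUS MSR */

          return status & BIT_ULL(2);     /* SNPActive per the APM */
  }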
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-2-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 6 ++++-- arch/x86/kvm/svm/svm.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f70a5108d464..f2d01f351e95 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -282,7 +282,8 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[43]; + u8 reserved_1[42]; + u8 vmpl; u8 cpl; u8 reserved_2[4]; u64 efer; @@ -347,7 +348,8 @@ struct vmcb_save_area { u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_11[56]; + u64 sev_features; + u8 reserved_11[48]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd4c64b362d2..81cb518170a8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3117,8 +3117,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "tr:", save01->tr.selector, save01->tr.attrib, save01->tr.limit, save01->tr.base); - pr_err("cpl: %d efer: %016llx\n", - save->cpl, save->efer); + pr_err("vmpl: %d cpl: %d efer: %016llx\n", + save->vmpl, save->cpl, save->efer); pr_err("%-15s %016llx %-13s %016llx\n", "cr0:", save->cr0, "cr2:", save->cr2); pr_err("%-15s %016llx %-13s %016llx\n", -- cgit From e1907d37514b8564ba18b4a768a35beee71cb011 Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Thu, 24 Mar 2022 17:57:29 +0530 Subject: x86/amd_nb: Unexport amd_cache_northbridges() amd_cache_northbridges() is exported by amd_nb.c and is called by amd64-agp.c and amd64_edac.c modules at module_init() time so that NB descriptors are properly cached before those drivers can use them. However, the init_amd_nbs() initcall already does call amd_cache_northbridges() unconditionally and thus makes sure the NB descriptors are enumerated. That initcall is a fs_initcall type which is on the 5th group (starting from 0) of initcalls that gets run in increasing numerical order by the init code. The module_init() call is turned into an __initcall() in the MODULE=n case and those are device-level initcalls, i.e., group 6. Therefore, the northbridges caching is already finished by the time module initialization starts and thus the correct initialization order is retained. Unexport amd_cache_northbridges(), update dependent modules to call amd_nb_num() instead. While at it, simplify the checks in amd_cache_northbridges(). [ bp: Heavily massage and *actually* explain why the change is ok. 
] Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324122729.221765-1-nchatrad@amd.com --- arch/x86/include/asm/amd_nb.h | 1 - arch/x86/kernel/amd_nb.c | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00d1a400b7a1..ed0eaf65c437 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -16,7 +16,6 @@ extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; extern bool early_is_amd_nb(u32 value); extern struct resource *amd_get_mmconfig_range(struct resource *res); -extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); extern int amd_get_subcaches(int); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 020c906f7934..190e0f763375 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -188,7 +188,7 @@ int amd_smn_write(u16 node, u32 address, u32 value) EXPORT_SYMBOL_GPL(amd_smn_write); -int amd_cache_northbridges(void) +static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *link_ids = amd_nb_link_ids; @@ -210,14 +210,14 @@ int amd_cache_northbridges(void) } misc = NULL; - while ((misc = next_northbridge(misc, misc_ids)) != NULL) + while ((misc = next_northbridge(misc, misc_ids))) misc_count++; if (!misc_count) return -ENODEV; root = NULL; - while ((root = next_northbridge(root, root_ids)) != NULL) + while ((root = next_northbridge(root, root_ids))) root_count++; if (root_count) { @@ -290,7 +290,6 @@ int amd_cache_northbridges(void) return 0; } -EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* * Ignores subdevice/subvendor but as far as I can figure out -- cgit From b86eb74098a92afd789da02699b4b0dd3f73b889 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:47:04 +0700 Subject: x86/delay: Fix the wrong asm constraint in delay_loop() The asm constraint does not reflect the fact that the asm statement can modify the value of the local variable loops. Which it does. Specifying the wrong constraint may lead to undefined behavior, it may clobber random stuff (e.g. local variable, important temporary value in regs, etc.). This is especially dangerous when the compiler decides to inline the function and since it doesn't know that the value gets modified, it might decide to use it from a register directly without reloading it. Change the constraint to "+a" to denote that the first argument is an input and an output argument. [ bp: Fix typo, massage commit message. 
] Fixes: e01b70ef3eb3 ("x86: fix bug in arch/i386/lib/delay.c file, delay_loop function") Signed-off-by: Ammar Faizi Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220329104705.65256-2-ammarfaizi2@gnuweeb.org --- arch/x86/lib/delay.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 65d15df6212d..0e65d00e2339 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -54,8 +54,8 @@ static void delay_loop(u64 __loops) " jnz 2b \n" "3: dec %0 \n" - : /* we don't need output */ - :"a" (loops) + : "+a" (loops) + : ); } -- cgit From 9f1b19b977ee3cbd3fe9135ff63dbf221eac1d6a Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Fri, 25 Feb 2022 13:33:40 -0600 Subject: x86/mce: Avoid unnecessary padding in struct mce_bank Convert struct mce_bank member "init" from bool to a bitfield to get rid of unnecessary padding. $ pahole -C mce_bank arch/x86/kernel/cpu/mce/core.o before: /* size: 16, cachelines: 1, members: 2 */ /* padding: 7 */ /* last cacheline: 16 bytes */ after: /* size: 16, cachelines: 1, members: 3 */ /* last cacheline: 16 bytes */ No functional changes. Signed-off-by: Smita Koralahalli Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220225193342.215780-2-Smita.KoralahalliChannabasappa@amd.com --- arch/x86/kernel/cpu/mce/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 981496e6bc0e..d775fcd74e98 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -69,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); struct mce_bank { u64 ctl; /* subevents to enable */ - bool init; /* initialise bank? */ + + __u64 init : 1, /* initialise bank? */ + __reserved_1 : 63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); -- cgit From e5f28623ceb103e13fc3d7bd45edf9818b227fd0 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 29 Mar 2022 17:47:05 +0700 Subject: x86/MCE/AMD: Fix memory leak when threshold_create_bank() fails In mce_threshold_create_device(), if threshold_create_bank() fails, the previously allocated threshold banks array @bp will be leaked because the call to mce_threshold_remove_device() will not free it. This happens because mce_threshold_remove_device() fetches the pointer through the threshold_banks per-CPU variable but bp is written there only after the bank creation is successful, and not before, when threshold_create_bank() fails. Add a helper which unwinds all the bank creation work previously done and pass into it the previously allocated threshold banks array for freeing. [ bp: Massage. 
] Fixes: 6458de97fc15 ("x86/mce/amd: Straighten CPU hotplug path") Co-developed-by: Alviro Iskandar Setiawan Signed-off-by: Alviro Iskandar Setiawan Co-developed-by: Yazen Ghannam Signed-off-by: Yazen Ghannam Signed-off-by: Ammar Faizi Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/20220329104705.65256-3-ammarfaizi2@gnuweeb.org --- arch/x86/kernel/cpu/mce/amd.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1940d305db1c..1c87501e0fa3 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1294,10 +1294,23 @@ out_free: kfree(bank); } +static void __threshold_remove_device(struct threshold_bank **bp) +{ + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); + + for (bank = 0; bank < numbanks; bank++) { + if (!bp[bank]) + continue; + + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + kfree(bp); +} + int mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); - unsigned int bank, numbanks = this_cpu_read(mce_num_banks); if (!bp) return 0; @@ -1308,13 +1321,7 @@ int mce_threshold_remove_device(unsigned int cpu) */ this_cpu_write(threshold_banks, NULL); - for (bank = 0; bank < numbanks; bank++) { - if (bp[bank]) { - threshold_remove_bank(bp[bank]); - bp[bank] = NULL; - } - } - kfree(bp); + __threshold_remove_device(bp); return 0; } @@ -1351,15 +1358,14 @@ int mce_threshold_create_device(unsigned int cpu) if (!(this_cpu_read(bank_map) & (1 << bank))) continue; err = threshold_create_bank(bp, cpu, bank); - if (err) - goto out_err; + if (err) { + __threshold_remove_device(bp); + return err; + } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; return 0; -out_err: - mce_threshold_remove_device(cpu); - return err; } -- cgit From 0205f8a738ab9e62d849e88e543cfa6ce4c13163 Mon Sep 17 00:00:00 2001 From: Ricardo Cañuelo Date: Fri, 1 Apr 2022 09:45:17 +0200 Subject: x86/speculation/srbds: Do not try to turn mitigation off when not supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SRBDS is mitigated by TSX OFF, update_srbds_msr() will still read and write to MSR_IA32_MCU_OPT_CTRL even when that MSR is not supported due to not having loaded the appropriate microcode. Check for X86_FEATURE_SRBDS_CTRL which is set only when the respective microcode which adds MSR_IA32_MCU_OPT_CTRL is loaded. Based on a patch by Thadeu Lima de Souza Cascardo . [ bp: Massage commit message. ] Suggested-by: Pawan Gupta Signed-off-by: Ricardo Cañuelo Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220401074517.1848264-1-ricardo.canuelo@collabora.com --- arch/x86/kernel/cpu/bugs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6296e1ebed1d..d879a6c93609 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -446,6 +446,13 @@ void update_srbds_msr(void) if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) return; + /* + * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX + * being disabled and it hasn't received the SRBDS MSR microcode. 
+ */ + if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + return; + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { -- cgit From 3dd2775b74c9b1b01d19805877ab45bc47c4a5a5 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 5 Apr 2022 13:27:43 -0500 Subject: KVM: SVM: Create a separate mapping for the SEV-ES save area The save area for SEV-ES/SEV-SNP guests, as used by the hardware, is different from the save area of a non SEV-ES/SEV-SNP guest. This is the first step in defining the multiple save areas to keep them separate and ensuring proper operation amongst the different types of guests. Create an SEV-ES/SEV-SNP save area and adjust usage to the new save area definition where needed. Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220405182743.308853-1-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 87 +++++++++++++++++++++++++++++++++++----------- arch/x86/kvm/svm/sev.c | 22 ++++++------ arch/x86/kvm/svm/svm.c | 4 +-- arch/x86/kvm/svm/svm.h | 4 +-- 4 files changed, 82 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f2d01f351e95..788a43f99080 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -271,6 +271,7 @@ struct vmcb_seg { u64 base; } __packed; +/* Save area definition for legacy and SEV-MEM guests */ struct vmcb_save_area { struct vmcb_seg es; struct vmcb_seg cs; @@ -287,8 +288,58 @@ struct vmcb_save_area { u8 cpl; u8 reserved_2[4]; u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u64 s_cet; + u64 ssp; + u64 isst_addr; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_5[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; + u8 reserved_6[72]; + u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ +} __packed; + +/* Save area definition for SEV-ES and SEV-SNP guests */ +struct sev_es_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; u8 reserved_3[104]; - u64 xss; /* Valid for SEV-ES only */ + u64 xss; u64 cr4; u64 cr3; u64 cr0; @@ -316,22 +367,14 @@ struct vmcb_save_area { u64 br_to; u64 last_excp_from; u64 last_excp_to; - - /* - * The following part of the save area is valid only for - * SEV-ES guests when referenced through the GHCB or for - * saving to the host save area. 
- */ - u8 reserved_7[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ - u8 reserved_7b[4]; + u8 reserved_7[80]; u32 pkru; - u8 reserved_7a[20]; - u64 reserved_8; /* rax already available at 0x01f8 */ + u8 reserved_9[20]; + u64 reserved_10; /* rax already available at 0x01f8 */ u64 rcx; u64 rdx; u64 rbx; - u64 reserved_9; /* rsp already available at 0x01d8 */ + u64 reserved_11; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -343,23 +386,25 @@ struct vmcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_10[16]; + u8 reserved_12[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; u64 sev_features; - u8 reserved_11[48]; + u8 reserved_13[48]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; } __packed; +#define GHCB_SHARED_BUF_SIZE 2032 + struct ghcb { - struct vmcb_save_area save; - u8 reserved_save[2048 - sizeof(struct vmcb_save_area)]; + struct sev_es_save_area save; + u8 reserved_save[2048 - sizeof(struct sev_es_save_area)]; - u8 shared_buffer[2032]; + u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; u8 reserved_1[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ @@ -367,13 +412,15 @@ struct ghcb { } __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1032 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); } @@ -443,7 +490,7 @@ struct vmcb { /* GHCB Accessor functions */ #define GHCB_BITMAP_IDX(field) \ - (offsetof(struct vmcb_save_area, field) / sizeof(u64)) + (offsetof(struct sev_es_save_area, field) / sizeof(u64)) #define DEFINE_GHCB_ACCESSORS(field) \ static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75fa6dd268f0..6e18ec1839f0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -559,12 +559,20 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { - struct vmcb_save_area *save = &svm->vmcb->save; + struct sev_es_save_area *save = svm->sev_es.vmsa; /* Check some debug related fields before encrypting the VMSA */ - if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1)) + if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) return -EINVAL; + /* + * SEV-ES will use a VMSA that is pointed to by the VMCB, not + * the traditional VMSA that is part of the VMCB. Copy the + * traditional VMSA as it has been built so far (in prep + * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. + */ + memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); + /* Sync registgers */ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; @@ -592,14 +600,6 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - /* - * SEV-ES will use a VMSA that is pointed to by the VMCB, not - * the traditional VMSA that is part of the VMCB. Copy the - * traditional VMSA as it has been built so far (in prep - * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. 
- */ - memcpy(svm->sev_es.vmsa, save, sizeof(*save)); - return 0; } @@ -2932,7 +2932,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) sev_enc_bit)); } -void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa) +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) { /* * As an SEV-ES guest, hardware will restore the host state on VMEXIT, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 81cb518170a8..6ff595f74e3a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1270,8 +1270,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ vmsave(__sme_page_pa(sd->save_area)); if (sev_es_guest(vcpu->kvm)) { - struct vmcb_save_area *hostsa; - hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); + struct sev_es_save_area *hostsa; + hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); sev_es_prepare_switch_to_guest(hostsa); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f77a7d2d39dd..cc857deaee5e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -181,7 +181,7 @@ struct svm_nested_state { struct vcpu_sev_es_state { /* SEV-ES support */ - struct vmcb_save_area *vmsa; + struct sev_es_save_area *vmsa; struct ghcb *ghcb; struct kvm_host_map ghcb_map; bool received_first_sipi; @@ -620,7 +620,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa); +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ -- cgit From a4690359eaec985a1351786da887df1ba92440a0 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 7 Mar 2022 15:33:13 -0600 Subject: KVM: SVM: Create a separate mapping for the GHCB save area The initial implementation of the GHCB spec was based on trying to keep the register state offsets the same relative to the VM save area. However, the save area for SEV-ES has changed within the hardware causing the relation between the SEV-ES save area to change relative to the GHCB save area. This is the second step in defining the multiple save areas to keep them separate and ensuring proper operation amongst the different types of guests. Create a GHCB save area that matches the GHCB specification. 
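The kind of invariant this separation is meant to preserve can be expressed as compile-time checks like the following sketch (not part of the patch; 0x1d8 and 0x1f8 are the GHCB-specified locations of RSP and RAX, matching the "already available at" comments above):

  #include <linux/build_bug.h>
  #include <linux/stddef.h>
  #include <asm/svm.h>

  static inline void example_ghcb_offset_checks(void)
  {
          BUILD_BUG_ON(offsetof(struct ghcb_save_area, rsp) != 0x1d8);
          BUILD_BUG_ON(offsetof(struct ghcb_save_area, rax) != 0x1f8);
  }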
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-4-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 48 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 788a43f99080..0789ad83b2bd 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -398,11 +398,51 @@ struct sev_es_save_area { u64 x87_state_gpa; } __packed; +struct ghcb_save_area { + u8 reserved_1[203]; + u8 cpl; + u8 reserved_2[116]; + u64 xss; + u8 reserved_3[24]; + u64 dr7; + u8 reserved_4[16]; + u64 rip; + u8 reserved_5[88]; + u64 rsp; + u8 reserved_6[24]; + u64 rax; + u8 reserved_7[264]; + u64 rcx; + u64 rdx; + u64 rbx; + u8 reserved_8[8]; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + u8 reserved_9[16]; + u64 sw_exit_code; + u64 sw_exit_info_1; + u64 sw_exit_info_2; + u64 sw_scratch; + u8 reserved_10[56]; + u64 xcr0; + u8 valid_bitmap[16]; + u64 x87_state_gpa; +} __packed; + #define GHCB_SHARED_BUF_SIZE 2032 struct ghcb { - struct sev_es_save_area save; - u8 reserved_save[2048 - sizeof(struct sev_es_save_area)]; + struct ghcb_save_area save; + u8 reserved_save[2048 - sizeof(struct ghcb_save_area)]; u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; @@ -413,6 +453,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1032 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE @@ -420,6 +461,7 @@ struct ghcb { static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); @@ -490,7 +532,7 @@ struct vmcb { /* GHCB Accessor functions */ #define GHCB_BITMAP_IDX(field) \ - (offsetof(struct sev_es_save_area, field) / sizeof(u64)) + (offsetof(struct ghcb_save_area, field) / sizeof(u64)) #define DEFINE_GHCB_ACCESSORS(field) \ static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ -- cgit From 6d3b3d34e39eb4ee9a7dbe7e28e2588e160f9c0f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 7 Mar 2022 15:33:14 -0600 Subject: KVM: SVM: Update the SEV-ES save area mapping This is the final step in defining the multiple save areas to keep them separate and ensuring proper operation amongst the different types of guests. Update the SEV-ES/SEV-SNP save area to match the APM. This save area will be used for the upcoming SEV-SNP AP Creation NAE event support. 
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-5-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 66 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0789ad83b2bd..ec623e7da33d 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -334,7 +334,13 @@ struct sev_es_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[43]; + u64 vmpl0_ssp; + u64 vmpl1_ssp; + u64 vmpl2_ssp; + u64 vmpl3_ssp; + u64 u_cet; + u8 reserved_1[2]; + u8 vmpl; u8 cpl; u8 reserved_2[4]; u64 efer; @@ -347,9 +353,19 @@ struct sev_es_save_area { u64 dr6; u64 rflags; u64 rip; - u8 reserved_4[88]; + u64 dr0; + u64 dr1; + u64 dr2; + u64 dr3; + u64 dr0_addr_mask; + u64 dr1_addr_mask; + u64 dr2_addr_mask; + u64 dr3_addr_mask; + u8 reserved_4[24]; u64 rsp; - u8 reserved_5[24]; + u64 s_cet; + u64 ssp; + u64 isst_addr; u64 rax; u64 star; u64 lstar; @@ -360,7 +376,7 @@ struct sev_es_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_6[32]; + u8 reserved_5[32]; u64 g_pat; u64 dbgctl; u64 br_from; @@ -369,12 +385,12 @@ struct sev_es_save_area { u64 last_excp_to; u8 reserved_7[80]; u32 pkru; - u8 reserved_9[20]; - u64 reserved_10; /* rax already available at 0x01f8 */ + u8 reserved_8[20]; + u64 reserved_9; /* rax already available at 0x01f8 */ u64 rcx; u64 rdx; u64 rbx; - u64 reserved_11; /* rsp already available at 0x01d8 */ + u64 reserved_10; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -386,16 +402,34 @@ struct sev_es_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_12[16]; - u64 sw_exit_code; - u64 sw_exit_info_1; - u64 sw_exit_info_2; - u64 sw_scratch; + u8 reserved_11[16]; + u64 guest_exit_info_1; + u64 guest_exit_info_2; + u64 guest_exit_int_info; + u64 guest_nrip; u64 sev_features; - u8 reserved_13[48]; + u64 vintr_ctrl; + u64 guest_exit_code; + u64 virtual_tom; + u64 tlb_id; + u64 pcpu_id; + u64 event_inj; u64 xcr0; - u8 valid_bitmap[16]; - u64 x87_state_gpa; + u8 reserved_12[16]; + + /* Floating point area */ + u64 x87_dp; + u32 mxcsr; + u16 x87_ftw; + u16 x87_fsw; + u16 x87_fcw; + u16 x87_fop; + u16 x87_ds; + u16 x87_cs; + u64 x87_rip; + u8 fpreg_x87[80]; + u8 fpreg_xmm[256]; + u8 fpreg_ymm[256]; } __packed; struct ghcb_save_area { @@ -454,7 +488,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 740 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 -#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1032 +#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE -- cgit From 176db622573f028f85221873ea4577e096785315 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:09:59 -0600 Subject: x86/boot: Introduce helpers for MSR reads/writes The current set of helpers used throughout the run-time kernel have dependencies on code/facilities outside of the boot kernel, so there are a number of call-sites throughout the boot kernel where inline assembly is used instead. More will be added with subsequent patches that add support for SEV-SNP, so take the opportunity to provide a basic set of helpers that can be used by the boot kernel to reduce reliance on inline assembly. 
Use boot_* prefix so that it's clear these are helpers specific to the boot kernel to avoid any confusion with the various other MSR read/write helpers. [ bp: Disambiguate parameter names and trim comment. ] Suggested-by: Borislav Petkov Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-6-brijesh.singh@amd.com --- arch/x86/boot/msr.h | 26 ++++++++++++++++++++++++++ arch/x86/include/asm/msr.h | 11 +---------- arch/x86/include/asm/shared/msr.h | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 10 deletions(-) create mode 100644 arch/x86/boot/msr.h create mode 100644 arch/x86/include/asm/shared/msr.h (limited to 'arch/x86') diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h new file mode 100644 index 000000000000..aed66f7ae199 --- /dev/null +++ b/arch/x86/boot/msr.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Helpers/definitions related to MSR access. + */ + +#ifndef BOOT_MSR_H +#define BOOT_MSR_H + +#include + +/* + * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the + * boot kernel since they rely on tracepoint/exception handling infrastructure + * that's not available here. + */ +static inline void boot_rdmsr(unsigned int reg, struct msr *m) +{ + asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); +} + +static inline void boot_wrmsr(unsigned int reg, const struct msr *m) +{ + asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); +} + +#endif /* BOOT_MSR_H */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index d42e6c6b47b1..65ec1965cd28 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -10,16 +10,7 @@ #include #include #include - -struct msr { - union { - struct { - u32 l; - u32 h; - }; - u64 q; - }; -}; +#include struct msr_info { u32 msr_no; diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h new file mode 100644 index 000000000000..1e6ec10b3a15 --- /dev/null +++ b/arch/x86/include/asm/shared/msr.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_MSR_H +#define _ASM_X86_SHARED_MSR_H + +struct msr { + union { + struct { + u32 l; + u32 h; + }; + u64 q; + }; +}; + +#endif /* _ASM_X86_SHARED_MSR_H */ -- cgit From 950d00558a920227b5703d1fcc4751cfe03853cd Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:10:00 -0600 Subject: x86/boot: Use MSR read/write helpers instead of inline assembly Update all C code to use the new boot_rdmsr()/boot_wrmsr() helpers instead of relying on inline assembly. 
Suggested-by: Borislav Petkov Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-7-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 17 +++++++---------- arch/x86/boot/cpucheck.c | 30 +++++++++++++++--------------- 2 files changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 28bcf04c022e..4e82101b7d13 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -22,6 +22,7 @@ #include #include "error.h" +#include "../msr.h" struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); struct ghcb *boot_ghcb; @@ -56,23 +57,19 @@ static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) static inline u64 sev_es_rd_ghcb_msr(void) { - unsigned long low, high; + struct msr m; - asm volatile("rdmsr" : "=a" (low), "=d" (high) : - "c" (MSR_AMD64_SEV_ES_GHCB)); + boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); - return ((high << 32) | low); + return m.q; } static inline void sev_es_wr_ghcb_msr(u64 val) { - u32 low, high; + struct msr m; - low = val & 0xffffffffUL; - high = val >> 32; - - asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), - "a"(low), "d" (high) : "memory"); + m.q = val; + boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); } static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index e1478d32de1a..fed8d13ce252 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -27,6 +27,7 @@ #include #include #include "string.h" +#include "msr.h" static u32 err_flags[NCAPINTS]; @@ -130,12 +131,11 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) /* If this is an AMD and we're only missing SSE+SSE2, try to turn them on */ - u32 ecx = MSR_K7_HWCR; - u32 eax, edx; + struct msr m; - asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - eax &= ~(1 << 15); - asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + boot_rdmsr(MSR_K7_HWCR, &m); + m.l &= ~(1 << 15); + boot_wrmsr(MSR_K7_HWCR, &m); get_cpuflags(); /* Make sure it really did something */ err = check_cpuflags(); @@ -145,28 +145,28 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) /* If this is a VIA C3, we might have to enable CX8 explicitly */ - u32 ecx = MSR_VIA_FCR; - u32 eax, edx; + struct msr m; - asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - eax |= (1<<1)|(1<<7); - asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + boot_rdmsr(MSR_VIA_FCR, &m); + m.l |= (1 << 1) | (1 << 7); + boot_wrmsr(MSR_VIA_FCR, &m); set_bit(X86_FEATURE_CX8, cpu.flags); err = check_cpuflags(); } else if (err == 0x01 && is_transmeta()) { /* Transmeta might have masked feature bits in word 0 */ - u32 ecx = 0x80860004; - u32 eax, edx; + struct msr m, m_tmp; u32 level = 1; - asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); - asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx)); + boot_rdmsr(0x80860004, &m); + m_tmp = m; + m_tmp.l = ~0; + boot_wrmsr(0x80860004, &m_tmp); asm("cpuid" : "+a" (level), "=d" (cpu.flags[0]) : : "ecx", "ebx"); - asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + boot_wrmsr(0x80860004, &m); err = check_cpuflags(); } else if (err == 0x01 && -- cgit From ec1c66af3a30d45c2420da0974c01d3515dba26e Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:10:01 -0600 Subject: x86/compressed/64: Detect/setup SEV/SME features earlier during boot With upcoming SEV-SNP support, SEV-related features need to be initialized 
earlier during boot, at the same point the initial #VC handler is set up, so that the SEV-SNP CPUID table can be utilized during the initial feature checks. Also, SEV-SNP feature detection will rely on EFI helper functions to scan the EFI config table for the Confidential Computing blob, and so would need to be implemented at least partially in C. Currently set_sev_encryption_mask() is used to initialize the sev_status and sme_me_mask globals that advertise what SEV/SME features are available in a guest. Rename it to sev_enable() to better reflect that (SME is only enabled in the case of SEV guests in the boot/compressed kernel), and move it to just after the stage1 #VC handler is set up so that it can be used to initialize SEV-SNP as well in future patches. While at it, re-implement it as C code so that all SEV feature detection can be better consolidated with upcoming SEV-SNP feature detection, which will also be in C. The 32-bit entry path remains unchanged, as it never relied on the set_sev_encryption_mask() initialization to begin with. [ bp: Massage commit message. ] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-8-brijesh.singh@amd.com --- arch/x86/boot/compressed/head_64.S | 37 ++++++++++++++++++++-------------- arch/x86/boot/compressed/mem_encrypt.S | 36 --------------------------------- arch/x86/boot/compressed/misc.h | 4 ++-- arch/x86/boot/compressed/sev.c | 36 +++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index dea95301196b..4cd661165d4a 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -189,11 +189,11 @@ SYM_FUNC_START(startup_32) subl $32, %eax /* Encryption bit is always above bit 31 */ bts %eax, %edx /* Set encryption mask for page tables */ /* - * Mark SEV as active in sev_status so that startup32_check_sev_cbit() - * will do a check. The sev_status memory will be fully initialized - * with the contents of MSR_AMD_SEV_STATUS later in - * set_sev_encryption_mask(). For now it is sufficient to know that SEV - * is active. + * Set MSR_AMD64_SEV_ENABLED_BIT in sev_status so that + * startup32_check_sev_cbit() will do a check. sev_enable() will + * initialize sev_status with all the bits reported by + * MSR_AMD_SEV_STATUS later, but only MSR_AMD64_SEV_ENABLED_BIT + * needs to be set for now. */ movl $1, rva(sev_status)(%ebp) 1: @@ -447,6 +447,23 @@ SYM_CODE_START(startup_64) call load_stage1_idt popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Now that the stage1 interrupt handlers are set up, #VC exceptions from + * CPUID instructions can be properly handled for SEV-ES guests. + * + * For SEV-SNP, the CPUID table also needs to be set up in advance of any + * CPUID instructions being issued, so go ahead and do that now via + * sev_enable(), which will also handle the rest of the SEV-related + * detection/setup to ensure that has been done in advance of any dependent + * code. + */ + pushq %rsi + movq %rsi, %rdi /* real mode address */ + call sev_enable + popq %rsi +#endif + /* * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. @@ -558,17 +575,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) shrq $3, %rcx rep stosq -/* - * If running as an SEV guest, the encryption mask is required in the - * page-table setup code below. 
When the guest also has SEV-ES enabled - * set_sev_encryption_mask() will cause #VC exceptions, but the stage2 - * handler can't map its GHCB because the page-table is not set up yet. - * So set up the encryption mask here while still on the stage1 #VC - * handler. Then load stage2 IDT and switch to the kernel's own - * page-table. - */ pushq %rsi - call set_sev_encryption_mask call load_stage2_idt /* Pass boot_params to initialize_identity_maps() */ diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index a63424d13627..a73e4d783cae 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -187,42 +187,6 @@ SYM_CODE_END(startup32_vc_handler) .code64 #include "../../kernel/sev_verify_cbit.S" -SYM_FUNC_START(set_sev_encryption_mask) -#ifdef CONFIG_AMD_MEM_ENCRYPT - push %rbp - push %rdx - - movq %rsp, %rbp /* Save current stack pointer */ - - call get_sev_encryption_bit /* Get the encryption bit position */ - testl %eax, %eax - jz .Lno_sev_mask - - bts %rax, sme_me_mask(%rip) /* Create the encryption mask */ - - /* - * Read MSR_AMD64_SEV again and store it to sev_status. Can't do this in - * get_sev_encryption_bit() because this function is 32-bit code and - * shared between 64-bit and 32-bit boot path. - */ - movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */ - rdmsr - - /* Store MSR value in sev_status */ - shlq $32, %rdx - orq %rdx, %rax - movq %rax, sev_status(%rip) - -.Lno_sev_mask: - movq %rbp, %rsp /* Restore original stack pointer */ - - pop %rdx - pop %rbp -#endif - - xor %rax, %rax - RET -SYM_FUNC_END(set_sev_encryption_mask) .data diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 16ed360b6692..23e0e395084a 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -120,12 +120,12 @@ static inline void console_init(void) { } #endif -void set_sev_encryption_mask(void); - #ifdef CONFIG_AMD_MEM_ENCRYPT +void sev_enable(struct boot_params *bp); void sev_es_shutdown_ghcb(void); extern bool sev_es_check_ghcb_fault(unsigned long address); #else +static inline void sev_enable(struct boot_params *bp) { } static inline void sev_es_shutdown_ghcb(void) { } static inline bool sev_es_check_ghcb_fault(unsigned long address) { diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 4e82101b7d13..27ccd5a5ff60 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -201,3 +201,39 @@ finish: else if (result != ES_RETRY) sev_es_terminate(GHCB_SEV_ES_GEN_REQ); } + +void sev_enable(struct boot_params *bp) +{ + unsigned int eax, ebx, ecx, edx; + struct msr m; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + /* Check whether SEV is supported */ + if (!(eax & BIT(1))) + return; + + /* Set the SME mask if this is an SEV guest. 
*/ + boot_rdmsr(MSR_AMD64_SEV, &m); + sev_status = m.q; + if (!(sev_status & MSR_AMD64_SEV_ENABLED)) + return; + + sme_me_mask = BIT_ULL(ebx & 0x3f); +} -- cgit From bcce829083339bf862d66df602cbb111943da8fb Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:10:02 -0600 Subject: x86/sev: Detect/setup SEV/SME features earlier in boot sme_enable() handles feature detection for both SEV and SME. Future patches will also use it for SEV-SNP feature detection/setup, which will need to be done immediately after the first #VC handler is set up. Move it now in preparation. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-9-brijesh.singh@amd.com --- arch/x86/kernel/head64.c | 3 --- arch/x86/kernel/head_64.S | 13 +++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4f5ecbbaae77..cbc285ddc4ac 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -192,9 +192,6 @@ unsigned long __head __startup_64(unsigned long physaddr, if (load_delta & ~PMD_PAGE_MASK) for (;;); - /* Activate Secure Memory Encryption (SME) if supported and enabled */ - sme_enable(bp); - /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b8e3019547a5..6bf340c91165 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -69,6 +69,19 @@ SYM_CODE_START_NOALIGN(startup_64) call startup_64_setup_env popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Activate SEV/SME memory encryption if supported/enabled. This needs to + * be done now, since this also includes setup of the SEV-SNP CPUID table, + * which needs to be done before any CPUID instructions are executed in + * subsequent code. + */ + movq %rsi, %rdi + pushq %rsi + call sme_enable + popq %rsi +#endif + /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS leaq .Lon_kernel_cs(%rip), %rax -- cgit From f742b90e61bb53b27771f64bdae05db03a6ab1f2 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 24 Feb 2022 10:55:49 -0600 Subject: x86/mm: Extend cc_attr to include AMD SEV-SNP The CC_ATTR_GUEST_SEV_SNP can be used by the guest to query whether the SNP (Secure Nested Paging) feature is active. 
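For illustration only (not part of the patch itself): once amd_cc_platform_has() knows about the new attribute, any kernel code can gate SNP-specific work on the usual cc_platform_has() check. The function name snp_do_work() below is invented for the example.

  #include <linux/cc_platform.h>

  static void snp_do_work(void)
  {
          /* Bail out unless SEV-SNP is active in this guest. */
          if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
                  return;

          /* ... SNP-specific handling goes here ... */
  }

The same check is what the mem_encrypt.c hunk below uses to append "SEV-SNP" to the memory encryption feature banner.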
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-10-brijesh.singh@amd.com --- arch/x86/coco/core.c | 3 +++ arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/mm/mem_encrypt.c | 4 ++++ 3 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index fc1365dd927e..dafd4881ce29 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -57,6 +57,9 @@ static bool amd_cc_platform_has(enum cc_attr attr) return (sev_status & MSR_AMD64_SEV_ENABLED) && !(sev_status & MSR_AMD64_SEV_ES_ENABLED); + case CC_ATTR_GUEST_SEV_SNP: + return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + default: return false; } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0eb90d21049e..ef96f166b1b6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -502,8 +502,10 @@ #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 #define MSR_AMD64_SEV_ES_ENABLED_BIT 1 +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 50d209939c66..f85868c031c6 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -62,6 +62,10 @@ static void print_mem_encrypt_feature_info(void) if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) pr_cont(" SEV-ES"); + /* Secure Nested Paging */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + pr_cont(" SEV-SNP"); + pr_cont("\n"); } -- cgit From 6c0f74d678c94060932683738b3e227995b363d3 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:04 -0600 Subject: x86/sev: Define the Linux-specific guest termination reasons The GHCB specification defines the reason code for reason set 0. The reason codes defined in the set 0 do not cover all possible causes for a guest to request termination. The reason sets 1 to 255 are reserved for the vendor-specific codes. Reserve the reason set 1 for the Linux guest. Define the error codes for reason set 1 so that one can have meaningful termination reasons and thus better guest failure diagnosis. While at it, change sev_es_terminate() to accept a reason set parameter. [ bp: Massage commit message. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-11-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 6 +++--- arch/x86/include/asm/sev-common.h | 8 ++++++++ arch/x86/kernel/sev-shared.c | 11 ++++------- arch/x86/kernel/sev.c | 4 ++-- 4 files changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 27ccd5a5ff60..56e941d5e092 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -119,7 +119,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, static bool early_setup_sev_es(void) { if (!sev_es_negotiate_protocol()) - sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -172,7 +172,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) enum es_result result; if (!boot_ghcb && !early_setup_sev_es()) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -199,7 +199,7 @@ finish: if (result == ES_OK) vc_finish_insn(&ctxt); else if (result != ES_RETRY) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } void sev_enable(struct boot_params *bp) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1b2fd32b42fe..94f0ea574049 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -73,9 +73,17 @@ /* GHCBData[23:16] */ \ ((((u64)reason_val) & 0xff) << 16)) +/* Error codes from reason set 0 */ +#define SEV_TERM_SET_GEN 0 #define GHCB_SEV_ES_GEN_REQ 0 #define GHCB_SEV_ES_PROT_UNSUPPORTED 1 +/* Linux-specific reason codes (used with reason set 1) */ +#define SEV_TERM_SET_LINUX 1 +#define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */ +#define GHCB_TERM_PSC 1 /* Page State Change failure */ +#define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */ + #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) /* diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index ce987688bbc0..2abf8a7d75e5 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -24,15 +24,12 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void __noreturn sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; - /* - * Tell the hypervisor what went wrong - only reason-set 0 is - * currently supported. - */ - val |= GHCB_SEV_TERM_REASON(0, reason); + /* Tell the hypervisor what went wrong. 
*/ + val |= GHCB_SEV_TERM_REASON(set, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -221,7 +218,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) fail: /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index e6d316a01fdd..19ad09712902 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1337,7 +1337,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) show_regs(regs); /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); /* If that fails and we get here - just panic */ panic("Returned from Terminate-Request to Hypervisor\n"); @@ -1385,7 +1385,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) /* Do initial setup or terminate the guest */ if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); -- cgit From 2ea29c5abbc27147c2d9e2ab5e05436aca706b65 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:05 -0600 Subject: x86/sev: Save the negotiated GHCB version The SEV-ES guest calls sev_es_negotiate_protocol() to negotiate the GHCB protocol version before establishing the GHCB. Cache the negotiated GHCB version so that it can be used later. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-12-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 2 +- arch/x86/kernel/sev-shared.c | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ec060c433589..9b9c190e8c3b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -12,7 +12,7 @@ #include #include -#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 1ULL #define GHCB_DEFAULT_USAGE 0ULL diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 2abf8a7d75e5..91105f5a02a8 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -14,6 +14,15 @@ #define has_cpuflag(f) boot_cpu_has(f) #endif +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. 
+ */ +static u16 ghcb_version __ro_after_init; + static bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { @@ -51,10 +60,12 @@ static bool sev_es_negotiate_protocol(void) if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) return false; + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + return true; } @@ -127,7 +138,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, u64 exit_info_1, u64 exit_info_2) { /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->protocol_version = ghcb_version; ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; ghcb_set_sw_exit_code(ghcb, exit_code); -- cgit From cbd3d4f7c4e5a93edae68e5142a269368fde77d6 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:06 -0600 Subject: x86/sev: Check SEV-SNP features support Version 2 of the GHCB specification added the advertisement of features that are supported by the hypervisor. If the hypervisor supports SEV-SNP then it must set the SEV-SNP features bit to indicate that the base functionality is supported. Check that feature bit while establishing the GHCB; if failed, terminate the guest. Version 2 of the GHCB specification adds several new Non-Automatic Exits (NAEs), most of them are optional except the hypervisor feature. Now that the hypervisor feature NAE is implemented, bump the GHCB maximum supported protocol version. While at it, move the GHCB protocol negotiation check from the #VC exception handler to sev_enable() so that all feature detection happens before the first #VC exception. While at it, document why the GHCB page cannot be setup from load_stage2_idt(). [ bp: Massage commit message. ] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-13-brijesh.singh@amd.com --- arch/x86/boot/compressed/idt_64.c | 18 +++++++++++++++++- arch/x86/boot/compressed/sev.c | 20 +++++++++++++++----- arch/x86/include/asm/sev-common.h | 6 ++++++ arch/x86/include/asm/sev.h | 2 +- arch/x86/include/uapi/asm/svm.h | 2 ++ arch/x86/kernel/sev-shared.c | 20 ++++++++++++++++++++ arch/x86/kernel/sev.c | 14 ++++++++++++++ 7 files changed, 75 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 9b93567d663a..6debb816e83d 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -39,7 +39,23 @@ void load_stage1_idt(void) load_boot_idt(&boot_idt_desc); } -/* Setup IDT after kernel jumping to .Lrelocated */ +/* + * Setup IDT after kernel jumping to .Lrelocated. + * + * initialize_identity_maps() needs a #PF handler to be setup + * in order to be able to fault-in identity mapping ranges; see + * do_boot_page_fault(). + * + * This #PF handler setup needs to happen in load_stage2_idt() where the + * IDT is loaded and there the #VC IDT entry gets setup too. + * + * In order to be able to handle #VCs, one needs a GHCB which + * gets setup with an already set up pagetable, which is done in + * initialize_identity_maps(). 
And there's the catch 22: the boot #VC + * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself + * (and, especially set_page_decrypted()) because the SEV-ES setup code + * cannot initialize a GHCB as there's no #PF handler yet... + */ void load_stage2_idt(void) { boot_idt_desc.address = (unsigned long)boot_idt; diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 56e941d5e092..5b389310be87 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -116,11 +116,8 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, /* Include code for early handlers */ #include "../../kernel/sev-shared.c" -static bool early_setup_sev_es(void) +static bool early_setup_ghcb(void) { - if (!sev_es_negotiate_protocol()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); - if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -171,7 +168,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) struct es_em_ctxt ctxt; enum es_result result; - if (!boot_ghcb && !early_setup_sev_es()) + if (!boot_ghcb && !early_setup_ghcb()) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); @@ -235,5 +232,18 @@ void sev_enable(struct boot_params *bp) if (!(sev_status & MSR_AMD64_SEV_ENABLED)) return; + /* Negotiate the GHCB protocol version. */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) { + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); + } + + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED && !(get_hv_features() & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + sme_me_mask = BIT_ULL(ebx & 0x3f); } diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 94f0ea574049..6f037c29a46e 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -60,6 +60,11 @@ /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_RESP_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) + +#define GHCB_HV_FT_SNP BIT_ULL(0) #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 @@ -77,6 +82,7 @@ #define SEV_TERM_SET_GEN 0 #define GHCB_SEV_ES_GEN_REQ 0 #define GHCB_SEV_ES_PROT_UNSUPPORTED 1 +#define GHCB_SNP_UNSUPPORTED 2 /* Linux-specific reason codes (used with reason set 1) */ #define SEV_TERM_SET_LINUX 1 diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9b9c190e8c3b..17b75f6ee11a 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -13,7 +13,7 @@ #include #define GHCB_PROTOCOL_MIN 1ULL -#define GHCB_PROTOCOL_MAX 1ULL +#define GHCB_PROTOCOL_MAX 2ULL #define GHCB_DEFAULT_USAGE 0ULL #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index efa969325ede..b0ad00f4c1e1 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -108,6 +108,7 @@ #define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff /* Exit code reserved for hypervisor/software use */ @@ -218,6 +219,7 @@ { SVM_VMGEXIT_NMI_COMPLETE, 
"vmgexit_nmi_complete" }, \ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ + { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 91105f5a02a8..4a876e684f67 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -48,6 +48,26 @@ static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) asm volatile("hlt\n" : : : "memory"); } +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +static u64 get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + static bool sev_es_negotiate_protocol(void) { u64 val; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 19ad09712902..cb20fb0c608e 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -43,6 +43,9 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); */ static struct ghcb __initdata *boot_ghcb; +/* Bitmap of SEV features supported by the hypervisor */ +static u64 sev_hv_features __ro_after_init; + /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { struct ghcb ghcb_page; @@ -766,6 +769,17 @@ void __init sev_es_init_vc_handling(void) if (!sev_es_check_cpu_features()) panic("SEV-ES CPU Features missing"); + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + sev_hv_features = get_hv_features(); + + if (!(sev_hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } + /* Enable SEV-ES special handling */ static_branch_enable(&sev_es_enable_key); -- cgit From 0bd6f1e526070271dbe0f626b123b4f6b01dc79c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:07 -0600 Subject: x86/sev: Add a helper for the PVALIDATE instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An SNP-active guest uses the PVALIDATE instruction to validate or rescind the validation of a guest page’s RMP entry. Upon completion, a return code is stored in EAX and rFLAGS bits are set based on the return code. If the instruction completed successfully, the carry flag (CF) indicates if the content of the RMP were changed or not. See AMD APM Volume 3 for additional details. 
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-14-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 17b75f6ee11a..4ee98976aed8 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -60,6 +60,9 @@ extern void vc_no_ghcb(void); extern void vc_boot_ghcb(void); extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* Software defined (when rFlags.CF = 1) */ +#define PVALIDATE_FAIL_NOUPDATE 255 + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -87,12 +90,30 @@ extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, struct es_em_ctxt *ctxt, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) +{ + bool no_rmpupdate; + int rc; + + /* "pvalidate" mnemonic support in binutils 2.36 and newer */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t" + CC_SET(c) + : CC_OUT(c) (no_rmpupdate), "=a"(rc) + : "a"(vaddr), "c"(rmp_psize), "d"(validate) + : "memory", "cc"); + + if (no_rmpupdate) + return PVALIDATE_FAIL_NOUPDATE; + + return rc; +} #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } #endif #endif -- cgit From 81cc3df9a90e7817494421ecc48ede6bd5e8132b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:08 -0600 Subject: x86/sev: Check the VMPL level The Virtual Machine Privilege Level (VMPL) feature in the SEV-SNP architecture allows a guest VM to divide its address space into four levels. The level can be used to provide hardware isolated abstraction layers within a VM. VMPL0 is the highest privilege level, and VMPL3 is the least privilege level. Certain operations must be done by the VMPL0 software, such as: * Validate or invalidate memory range (PVALIDATE instruction) * Allocate VMSA page (RMPADJUST instruction when VMSA=1) The initial SNP support requires that the guest kernel is running at VMPL0. Add such a check to verify the guest is running at level 0 before continuing the boot. There is no easy method to query the current VMPL level, so use the RMPADJUST instruction to determine whether the guest is running at the VMPL0. [ bp: Massage commit message. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-15-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 28 ++++++++++++++++++++++++++-- arch/x86/include/asm/sev-common.h | 1 + arch/x86/include/asm/sev.h | 16 ++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 5b389310be87..eb421787d50a 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -199,6 +199,26 @@ finish: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } +static void enforce_vmpl0(void) +{ + u64 attrs; + int err; + + /* + * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically + * higher) privilege level. Here, clear the VMPL1 permission mask of the + * GHCB page. If the guest is not running at VMPL0, this will fail. + * + * If the guest is running at VMPL0, it will succeed. Even if that operation + * modifies permission bits, it is still ok to do so currently because Linux + * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks + * changing is a don't-care. + */ + attrs = 1; + if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); +} + void sev_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; @@ -242,8 +262,12 @@ void sev_enable(struct boot_params *bp) * SNP is supported in v2 of the GHCB spec which mandates support for HV * features. */ - if (sev_status & MSR_AMD64_SEV_SNP_ENABLED && !(get_hv_features() & GHCB_HV_FT_SNP)) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { + if (!(get_hv_features() & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + enforce_vmpl0(); + } sme_me_mask = BIT_ULL(ebx & 0x3f); } diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 6f037c29a46e..7ac5842e32b6 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -89,6 +89,7 @@ #define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */ #define GHCB_TERM_PSC 1 /* Page State Change failure */ #define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */ +#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 4ee98976aed8..e37451849165 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -63,6 +63,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* Software defined (when rFlags.CF = 1) */ #define PVALIDATE_FAIL_NOUPDATE 255 +/* RMP page size */ +#define RMP_PG_SIZE_4K 0 + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -90,6 +93,18 @@ extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, struct es_em_ctxt *ctxt, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) +{ + int rc; + + /* "rmpadjust" mnemonic support in binutils 2.36 and newer */ + asm volatile(".byte 0xF3,0x0F,0x01,0xFE\n\t" + : "=a"(rc) + : "a"(vaddr), "c"(rmp_psize), "d"(attrs) + : "memory", "cc"); + + return rc; +} static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { bool no_rmpupdate; @@ -114,6 
+129,7 @@ static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { ret static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } +static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } #endif #endif -- cgit From 4f9c403e44e5e88feb27d5e617d1adc9cc7ef684 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:09 -0600 Subject: x86/compressed: Add helper for validating pages in the decompression stage Many of the integrity guarantees of SEV-SNP are enforced through the Reverse Map Table (RMP). Each RMP entry contains the GPA at which a particular page of DRAM should be mapped. The VMs can request the hypervisor to add pages in the RMP table via the Page State Change VMGEXIT defined in the GHCB specification. Inside each RMP entry is a Validated flag; this flag is automatically cleared to 0 by the CPU hardware when a new RMP entry is created for a guest. Each VM page can be either validated or invalidated, as indicated by the Validated flag in the RMP entry. Memory access to a private page that is not validated generates a #VC. A VM must use the PVALIDATE instruction to validate a private page before using it. To maintain the security guarantee of SEV-SNP guests, when transitioning pages from private to shared, the guest must invalidate the pages before asking the hypervisor to change the page state to shared in the RMP table. After the pages are mapped private in the page table, the guest must issue a page state change VMGEXIT to mark the pages private in the RMP table and validate them. Upon boot, BIOS should have validated the entire system memory. During the kernel decompression stage, early_setup_ghcb() uses set_page_decrypted() to make the GHCB page shared (i.e. clear encryption attribute). And while exiting from the decompression, it calls set_page_encrypted() to make the page private. Add snp_set_page_{private,shared}() helpers that are used by set_page_{decrypted,encrypted}() to change the page state in the RMP table. [ bp: Massage commit message and comments. ] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-16-brijesh.singh@amd.com --- arch/x86/boot/compressed/ident_map_64.c | 18 ++++++++++++- arch/x86/boot/compressed/misc.h | 4 +++ arch/x86/boot/compressed/sev.c | 46 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/sev-common.h | 26 +++++++++++++++++++ 4 files changed, 93 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index f7213d0943b8..613367e7e09e 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -275,15 +275,31 @@ static int set_clr_page_flags(struct x86_mapping_info *info, * Changing encryption attributes of a page requires to flush it from * the caches. */ - if ((set | clr) & _PAGE_ENC) + if ((set | clr) & _PAGE_ENC) { clflush_page(address); + /* + * If the encryption attribute is being cleared, change the page state + * to shared in the RMP table. 
+ */ + if (clr) + snp_set_page_shared(__pa(address & PAGE_MASK)); + } + /* Update PTE */ pte = *ptep; pte = pte_set_flags(pte, set); pte = pte_clear_flags(pte, clr); set_pte(ptep, pte); + /* + * If the encryption attribute is being set, then change the page state to + * private in the RMP entry. The page state change must be done after the PTE + * is updated. + */ + if (set & _PAGE_ENC) + snp_set_page_private(__pa(address & PAGE_MASK)); + /* Flush TLB after changing encryption attribute */ write_cr3(top_level_pgt); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 23e0e395084a..01cc13c12059 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -124,6 +124,8 @@ static inline void console_init(void) void sev_enable(struct boot_params *bp); void sev_es_shutdown_ghcb(void); extern bool sev_es_check_ghcb_fault(unsigned long address); +void snp_set_page_private(unsigned long paddr); +void snp_set_page_shared(unsigned long paddr); #else static inline void sev_enable(struct boot_params *bp) { } static inline void sev_es_shutdown_ghcb(void) { } @@ -131,6 +133,8 @@ static inline bool sev_es_check_ghcb_fault(unsigned long address) { return false; } +static inline void snp_set_page_private(unsigned long paddr) { } +static inline void snp_set_page_shared(unsigned long paddr) { } #endif /* acpi.c */ diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index eb421787d50a..5f2c26860df7 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -116,6 +116,52 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, /* Include code for early handlers */ #include "../../kernel/sev-shared.c" +static inline bool sev_snp_enabled(void) +{ + return sev_status & MSR_AMD64_SEV_SNP_ENABLED; +} + +static void __page_state_change(unsigned long paddr, enum psc_op op) +{ + u64 val; + + if (!sev_snp_enabled()) + return; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (op == SNP_PAGE_STATE_SHARED && pvalidate(paddr, RMP_PG_SIZE_4K, 0)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. 
+ */ + if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); +} + +void snp_set_page_private(unsigned long paddr) +{ + __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); +} + +void snp_set_page_shared(unsigned long paddr) +{ + __page_state_change(paddr, SNP_PAGE_STATE_SHARED); +} + static bool early_setup_ghcb(void) { if (set_page_decrypted((unsigned long)&boot_ghcb_page)) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 7ac5842e32b6..fe7fe16e5fd5 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -57,6 +57,32 @@ #define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 #define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +/* + * SNP Page State Change Operation + * + * GHCBData[55:52] - Page operation: + * 0x0001 Page assignment, Private + * 0x0002 Page assignment, Shared + */ +enum psc_op { + SNP_PAGE_STATE_PRIVATE = 1, + SNP_PAGE_STATE_SHARED, +}; + +#define GHCB_MSR_PSC_REQ 0x014 +#define GHCB_MSR_PSC_REQ_GFN(gfn, op) \ + /* GHCBData[55:52] */ \ + (((u64)((op) & 0xf) << 52) | \ + /* GHCBData[51:12] */ \ + ((u64)((gfn) & GENMASK_ULL(39, 0)) << 12) | \ + /* GHCBData[11:0] */ \ + GHCB_MSR_PSC_REQ) + +#define GHCB_MSR_PSC_RESP 0x015 +#define GHCB_MSR_PSC_RESP_VAL(val) \ + /* GHCBData[63:32] */ \ + (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) + /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 -- cgit From 87294bdb7b4b73555b0fba45da1cdecdc6a0d5a8 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:10 -0600 Subject: x86/compressed: Register GHCB memory when SEV-SNP is active The SEV-SNP guest is required by the GHCB spec to register the GHCB's Guest Physical Address (GPA). This is because the hypervisor may prefer that a guest use a consistent and/or specific GPA for the GHCB associated with a vCPU. For more information, see the GHCB specification section "GHCB GPA Registration". If hypervisor can not work with the guest provided GPA then terminate the guest boot. 
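Condensed, the MSR-protocol exchange this patch performs is the request/response pair below. It restates the helper added in the diff that follows, with ghcb_paddr standing in for the GHCB's physical address and only the terminate path kept as error handling.

  unsigned long pfn = ghcb_paddr >> PAGE_SHIFT;
  u64 val;

  /*
   * Worked encoding: for a GHCB at physical address 0x7f8000, pfn is 0x7f8,
   * the request value is (0x7f8 << 12) | 0x012 = 0x7f8012 and the expected
   * response is 0x7f8013 with the same GFN echoed back in bits 63:12.
   */
  sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
  VMGEXIT();

  val = sev_es_rd_ghcb_msr();
  if (GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP ||
      GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)
          sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);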
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-17-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 4 ++++ arch/x86/include/asm/sev-common.h | 13 +++++++++++++ arch/x86/kernel/sev-shared.c | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 5f2c26860df7..f31b434e2ce4 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -175,6 +175,10 @@ static bool early_setup_ghcb(void) /* Initialize lookup tables for the instruction decoder */ inat_init_tables(); + /* SNP guest requires the GHCB GPA must be registered */ + if (sev_snp_enabled()) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); + return true; } diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index fe7fe16e5fd5..f077a6c95e67 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -57,6 +57,19 @@ #define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 #define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +/* GHCB GPA Register */ +#define GHCB_MSR_REG_GPA_REQ 0x012 +#define GHCB_MSR_REG_GPA_REQ_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)((v) & GENMASK_ULL(51, 0)) << 12) | \ + /* GHCBData[11:0] */ \ + GHCB_MSR_REG_GPA_REQ) + +#define GHCB_MSR_REG_GPA_RESP 0x013 +#define GHCB_MSR_REG_GPA_RESP_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) + /* * SNP Page State Change Operation * diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 4a876e684f67..e9ff13cd90b0 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -68,6 +68,22 @@ static u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } +static void __maybe_unused snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + static bool sev_es_negotiate_protocol(void) { u64 val; -- cgit From 95d33bfaa3e169cfec1926e0d0f0c6b0ea75d763 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:11 -0600 Subject: x86/sev: Register GHCB memory when SEV-SNP is active The SEV-SNP guest is required by the GHCB spec to register the GHCB's Guest Physical Address (GPA). This is because the hypervisor may prefer that a guest uses a consistent and/or specific GPA for the GHCB associated with a vCPU. For more information, see the GHCB specification section "GHCB GPA Registration". [ bp: Cleanup comments. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-18-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/cpu/common.c | 4 ++++ arch/x86/kernel/head64.c | 4 +++- arch/x86/kernel/sev-shared.c | 2 +- arch/x86/kernel/sev.c | 46 ++++++++++++++++++++++++++++++++------------ 5 files changed, 44 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index e37451849165..48df02713ee0 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -122,6 +122,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } +void setup_ghcb(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -130,6 +131,7 @@ static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } +static inline void setup_ghcb(void) { } #endif #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ed4417500700..9e4552133872 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "cpu.h" @@ -2124,6 +2125,9 @@ void cpu_init_exception_handling(void) load_TR_desc(); + /* GHCB needs to be setup to handle #VC. */ + setup_ghcb(); + /* Finally load the IDT */ load_current_idt(); } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cbc285ddc4ac..83514b9827e6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -597,8 +597,10 @@ static void startup_64_load_idt(unsigned long physbase) void early_setup_idt(void) { /* VMM Communication Exception */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + setup_ghcb(); set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + } bringup_idt_descr.address = (unsigned long)bringup_idt_table; native_load_idt(&bringup_idt_descr); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index e9ff13cd90b0..3aaef1a18ffe 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -68,7 +68,7 @@ static u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -static void __maybe_unused snp_register_ghcb_early(unsigned long paddr) +static void snp_register_ghcb_early(unsigned long paddr) { unsigned long pfn = paddr >> PAGE_SHIFT; u64 val; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index cb20fb0c608e..39c87469ee0b 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -41,7 +41,7 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); * Needs to be in the .data section because we need it NULL before bss is * cleared */ -static struct ghcb __initdata *boot_ghcb; +static struct ghcb *boot_ghcb __section(".data"); /* Bitmap of SEV features supported by the hypervisor */ static u64 sev_hv_features __ro_after_init; @@ -647,15 +647,39 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } -/* - * This function runs on the first #VC exception after the kernel - * switched to virtual addresses. 
- */ -static bool __init sev_es_setup_ghcb(void) +static void snp_register_per_cpu_ghcb(void) { + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + snp_register_ghcb_early(__pa(ghcb)); +} + +void setup_ghcb(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + /* First make sure the hypervisor talks a supported protocol. */ if (!sev_es_negotiate_protocol()) - return false; + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* + * Check whether the runtime #VC exception handler is active. It uses + * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). + * + * If SNP is active, register the per-CPU GHCB page so that the runtime + * exception handler can use it. + */ + if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_per_cpu_ghcb(); + + return; + } /* * Clear the boot_ghcb. The first exception comes in before the bss @@ -666,7 +690,9 @@ static bool __init sev_es_setup_ghcb(void) /* Alright - Make the boot-ghcb public */ boot_ghcb = &boot_ghcb_page; - return true; + /* SNP guest requires that GHCB GPA must be registered. */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); } #ifdef CONFIG_HOTPLUG_CPU @@ -1397,10 +1423,6 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) struct es_em_ctxt ctxt; enum es_result result; - /* Do initial setup or terminate the guest */ - if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); -- cgit From 5e5ccff60a2977142d39b987a8b90e422d9fc634 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:12 -0600 Subject: x86/sev: Add helper for validating pages in early enc attribute changes early_set_memory_{encrypted,decrypted}() are used for changing the page state from decrypted (shared) to encrypted (private) and vice versa. When SEV-SNP is active, the page state transition needs to go through additional steps. If the page is transitioned from shared to private, then perform the following after the encryption attribute is set in the page table: 1. Issue the page state change VMGEXIT to add the page as a private in the RMP table. 2. Validate the page after its successfully added in the RMP table. To maintain the security guarantees, if the page is transitioned from private to shared, then perform the following before clearing the encryption attribute from the page table. 1. Invalidate the page. 2. Issue the page state change VMGEXIT to make the page shared in the RMP table. early_set_memory_{encrypted,decrypted}() can be called before the GHCB is setup so use the SNP page state MSR protocol VMGEXIT defined in the GHCB specification to request the page state change in the RMP table. While at it, add a helper snp_prep_memory() which will be used in probe_roms(), in a later patch. [ bp: Massage commit message. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-19-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 10 +++++ arch/x86/kernel/sev.c | 99 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_amd.c | 58 +++++++++++++++++++++++-- 3 files changed, 163 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 48df02713ee0..f65d257e3d4a 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -123,6 +123,11 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } void setup_ghcb(void); +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages); +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages); +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -132,6 +137,11 @@ static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } static inline void setup_ghcb(void) { } +static inline void __init +early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +static inline void __init +early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } #endif #endif diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 39c87469ee0b..e3fca8615fe8 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -556,6 +556,105 @@ static u64 get_jump_table_addr(void) return ret; } +static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) +{ + unsigned long vaddr_end; + int rc; + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + vaddr = vaddr + PAGE_SIZE; + } +} + +static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, + "Wrong PSC response code: 0x%x\n", + (unsigned int)GHCB_RESP_CODE(val))) + goto e_term; + + if (WARN(GHCB_MSR_PSC_RESP_VAL(val), + "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", + op == SNP_PAGE_STATE_PRIVATE ? 
"private" : "shared", + paddr, GHCB_MSR_PSC_RESP_VAL(val))) + goto e_term; + + paddr = paddr + PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); + + /* Validate the memory pages after they've been added in the RMP table. */ + pvalidate_pages(vaddr, npages, true); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* Invalidate the memory pages before they are marked shared in the RMP table. */ + pvalidate_pages(vaddr, npages, false); + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); +} + +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) +{ + unsigned long vaddr, npages; + + vaddr = (unsigned long)__va(paddr); + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (op == SNP_PAGE_STATE_PRIVATE) + early_snp_set_memory_private(vaddr, paddr, npages); + else if (op == SNP_PAGE_STATE_SHARED) + early_snp_set_memory_shared(vaddr, paddr, npages); + else + WARN(1, "invalid memory op %d\n", op); +} + int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 6169053c2854..8539dd6f24ff 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -47,6 +48,36 @@ EXPORT_SYMBOL(sme_me_mask); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); +/* + * SNP-specific routine which needs to additionally change the page state from + * private to shared before copying the data from the source to destination and + * restore after the copy. + */ +static inline void __init snp_memcpy(void *dst, void *src, size_t sz, + unsigned long paddr, bool decrypt) +{ + unsigned long npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (decrypt) { + /* + * @paddr needs to be accessed decrypted, mark the page shared in + * the RMP table before copying it. + */ + early_snp_set_memory_shared((unsigned long)__va(paddr), paddr, npages); + + memcpy(dst, src, sz); + + /* Restore the page state after the memcpy. */ + early_snp_set_memory_private((unsigned long)__va(paddr), paddr, npages); + } else { + /* + * @paddr need to be accessed encrypted, no need for the page state + * change. + */ + memcpy(dst, src, sz); + } +} + /* * This routine does not change the underlying encryption setting of the * page(s) that map this memory. It assumes that eventually the memory is @@ -95,8 +126,13 @@ static void __init __sme_early_enc_dec(resource_size_t paddr, * Use a temporary buffer, of cache-line multiple size, to * avoid data corruption as documented in the APM. 
*/ - memcpy(sme_early_buffer, src, len); - memcpy(dst, sme_early_buffer, len); + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + snp_memcpy(sme_early_buffer, src, len, paddr, enc); + snp_memcpy(dst, sme_early_buffer, len, paddr, !enc); + } else { + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + } early_memunmap(dst, len); early_memunmap(src, len); @@ -322,14 +358,28 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) clflush_cache_range(__va(pa), size); /* Encrypt/decrypt the contents in-place */ - if (enc) + if (enc) { sme_early_encrypt(pa, size); - else + } else { sme_early_decrypt(pa, size); + /* + * ON SNP, the page state in the RMP table must happen + * before the page table updates. + */ + early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1); + } + /* Change the page encryption mask. */ new_pte = pfn_pte(pfn, new_prot); set_pte_atomic(kpte, new_pte); + + /* + * If page is set encrypted in the page table, then update the RMP table to + * add this page as private. + */ + if (enc) + early_snp_set_memory_private((unsigned long)__va(pa), pa, 1); } static int __init early_set_memory_enc_dec(unsigned long vaddr, -- cgit From efac0eedfab515e523cde5cb7a62289eb2ee58f8 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:13 -0600 Subject: x86/kernel: Mark the .bss..decrypted section as shared in the RMP table The encryption attribute for the .bss..decrypted section is cleared in the initial page table build. This is because the section contains the data that need to be shared between the guest and the hypervisor. When SEV-SNP is active, just clearing the encryption attribute in the page table is not enough. The page state needs to be updated in the RMP table. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-20-brijesh.singh@amd.com --- arch/x86/kernel/head64.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 83514b9827e6..656d2f3e2cf0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -143,7 +143,20 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv if (sme_get_me_mask()) { vaddr = (unsigned long)__start_bss_decrypted; vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use __pa() to get a *currently* valid virtual address. + */ + early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + i = pmd_index(vaddr); pmd[i] -= sme_get_me_mask(); } -- cgit From 9704c07bf9f7682a83aec4e66f2d9154dbd8577f Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:14 -0600 Subject: x86/kernel: Validate ROM memory before accessing when SEV-SNP is active probe_roms() accesses the memory range (0xc0000 - 0x10000) to probe various ROMs. The memory range is not part of the E820 system RAM range. The memory range is mapped as private (i.e encrypted) in the page table. When SEV-SNP is active, all the private memory must be validated before accessing. 
The ROM range was not part of the E820 map, so the guest BIOS did not validate it. An access to unvalidated memory will cause a #VC exception which the guest is not yet able to handle, so validate the ROM memory regions before they are accessed. [ bp: Massage commit message. ] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-21-brijesh.singh@amd.com --- arch/x86/kernel/probe_roms.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 36e84d904260..319fef37d9dc 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct resource system_rom_resource = { .name = "System ROM", @@ -197,11 +198,21 @@ static int __init romchecksum(const unsigned char *rom, unsigned long length) void __init probe_roms(void) { - const unsigned char *rom; unsigned long start, length, upper; + const unsigned char *rom; unsigned char c; int i; + /* + * The ROM memory range is not part of the e820 table and is therefore not + * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted + * memory, and SNP requires encrypted memory to be validated before access. + * Do that here. + */ + snp_prep_memory(video_rom_resource.start, + ((system_rom_resource.end + 1) - video_rom_resource.start), + SNP_PAGE_STATE_PRIVATE); + /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { -- cgit From dc3f3d2474b80eaee8be89f4c5eb344f10648f42 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 24 Feb 2022 10:56:01 -0600 Subject: x86/mm: Validate memory when changing the C-bit Add the functionality needed to change the page state from shared to private and vice versa using the Page State Change VMGEXIT as documented in the GHCB spec.
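[ Editorial illustration, not part of this commit: a minimal sketch of how one 4K page-state transition can be expressed with the psc_hdr/psc_entry/snp_psc_desc structures this patch introduces, before the descriptor is handed to the hypervisor via the SVM_VMGEXIT_PSC NAE event. The function name is invented for illustration and the address is assumed to be in the direct map; the in-tree equivalent is __set_pages_state() in the diff below. ]

static void example_fill_one_psc_entry(struct snp_psc_desc *desc,
					unsigned long vaddr, enum psc_op op)
{
	struct psc_entry *e = &desc->entries[0];

	memset(desc, 0, sizeof(*desc));

	/* Describe a single 4K page by its guest frame number. */
	e->gfn		= __pa(vaddr) >> PAGE_SHIFT;
	e->operation	= op;			/* SNP_PAGE_STATE_PRIVATE or _SHARED */
	e->pagesize	= RMP_PG_SIZE_4K;

	/* cur_entry is advanced by the hypervisor; end_entry marks the last valid entry. */
	desc->hdr.cur_entry = 0;
	desc->hdr.end_entry = 0;
}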
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-22-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 22 +++++ arch/x86/include/asm/sev.h | 4 + arch/x86/include/uapi/asm/svm.h | 2 + arch/x86/kernel/sev.c | 168 ++++++++++++++++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_amd.c | 13 +++ 5 files changed, 209 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index f077a6c95e67..1aa72b5c2490 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -105,6 +105,28 @@ enum psc_op { #define GHCB_HV_FT_SNP BIT_ULL(0) +/* SNP Page State Change NAE event */ +#define VMGEXIT_PSC_MAX_ENTRY 253 + +struct psc_hdr { + u16 cur_entry; + u16 end_entry; + u32 reserved; +} __packed; + +struct psc_entry { + u64 cur_page : 12, + gfn : 40, + operation : 4, + pagesize : 1, + reserved : 7; +} __packed; + +struct snp_psc_desc { + struct psc_hdr hdr; + struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; +} __packed; + #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index f65d257e3d4a..feeb93e6ec97 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -128,6 +128,8 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages); void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); +void snp_set_memory_private(unsigned long vaddr, unsigned int npages); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -142,6 +144,8 @@ early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned static inline void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } +static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } #endif #endif diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index b0ad00f4c1e1..64404b47b773 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -108,6 +108,7 @@ #define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 +#define SVM_VMGEXIT_PSC 0x80000010 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff @@ -219,6 +220,7 @@ { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ + { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index e3fca8615fe8..bf4b57835694 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -655,6 +655,174 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op WARN(1, "invalid memory op %d\n", op); } +static int 
vmgexit_psc(struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = 1; + goto out_unlock; + } + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + __sev_put_ghcb(&state); + +out_unlock: + local_irq_restore(flags); + + return ret; +} + +static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) +{ + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned long pfn; + int i; + + hdr = &data->hdr; + e = data->entries; + + memset(data, 0, sizeof(*data)); + i = 0; + + while (vaddr < vaddr_end) { + if (is_vmalloc_addr((void *)vaddr)) + pfn = vmalloc_to_pfn((void *)vaddr); + else + pfn = __pa(vaddr) >> PAGE_SHIFT; + + e->gfn = pfn; + e->operation = op; + hdr->end_entry = i; + + /* + * Current SNP implementation doesn't keep track of the RMP page + * size so use 4K for simplicity. 
+ */ + e->pagesize = RMP_PG_SIZE_4K; + + vaddr = vaddr + PAGE_SIZE; + e++; + i++; + } + + if (vmgexit_psc(data)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +{ + unsigned long vaddr_end, next_vaddr; + struct snp_psc_desc *desc; + + desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); + if (!desc) + panic("SNP: failed to allocate memory for PSC descriptor\n"); + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ + next_vaddr = min_t(unsigned long, vaddr_end, + (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); + + __set_pages_state(desc, vaddr, next_vaddr, op); + + vaddr = next_vaddr; + } + + kfree(desc); +} + +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + pvalidate_pages(vaddr, npages, false); + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); +} + +void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); + + pvalidate_pages(vaddr, npages, true); +} + int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 8539dd6f24ff..d3c88d9ef8d6 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -316,11 +316,24 @@ static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) { + /* + * To maintain the security guarantees of SEV-SNP guests, make sure + * to invalidate the memory before encryption attribute is cleared. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc) + snp_set_memory_shared(vaddr, npages); } /* Return true unconditionally: return value doesn't matter for the SEV side */ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc) { + /* + * After memory is mapped encrypted in the page table, validate it + * so that it is consistent with the page table updates. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && enc) + snp_set_memory_private(vaddr, npages); + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) enc_dec_hypercall(vaddr, npages, enc); -- cgit From 0afb6b660a6b58cb336d1175ed687bf9525849a4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 7 Mar 2022 15:33:32 -0600 Subject: x86/sev: Use SEV-SNP AP creation to start secondary CPUs To provide a more secure way to start APs under SEV-SNP, use the SEV-SNP AP Creation NAE event. This allows for guest control over the AP register state rather than trusting the hypervisor with the SEV-ES Jump Table address. During native_smp_prepare_cpus(), invoke an SEV-SNP function that, if SEV-SNP is active, will set/override apic->wakeup_secondary_cpu. This will allow the SEV-SNP AP Creation NAE event method to be used to boot the APs. As a result of installing the override when SEV-SNP is active, this method of starting the APs becomes the required method. The override function will fail to start the AP if the hypervisor does not have support for AP creation. [ bp: Work in forgotten review comments. 
] Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-23-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 1 + arch/x86/include/asm/sev.h | 4 + arch/x86/include/uapi/asm/svm.h | 5 + arch/x86/kernel/sev.c | 242 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 3 + 5 files changed, 255 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1aa72b5c2490..e9b6815b3b3d 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -104,6 +104,7 @@ enum psc_op { (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) #define GHCB_HV_FT_SNP BIT_ULL(0) +#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1) /* SNP Page State Change NAE event */ #define VMGEXIT_PSC_MAX_ENTRY 253 diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index feeb93e6ec97..a3203b2caaca 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -66,6 +66,8 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* RMP page size */ #define RMP_PG_SIZE_4K 0 +#define RMPADJUST_VMSA_PAGE_BIT BIT(16) + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -130,6 +132,7 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); +void snp_set_wakeup_secondary_cpu(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -146,6 +149,7 @@ early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned i static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_wakeup_secondary_cpu(void) { } #endif #endif diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 64404b47b773..cfea5e875088 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -109,6 +109,10 @@ #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 #define SVM_VMGEXIT_PSC 0x80000010 +#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 +#define SVM_VMGEXIT_AP_CREATE 1 +#define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff @@ -221,6 +225,7 @@ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \ + { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index bf4b57835694..d7915ae7729d 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -31,9 +32,26 @@ #include #include #include +#include #define DR7_RESET_VALUE 0x400 +/* AP INIT values as documented in the APM2 
section "Processor Initialization State" */ +#define AP_INIT_CS_LIMIT 0xffff +#define AP_INIT_DS_LIMIT 0xffff +#define AP_INIT_LDTR_LIMIT 0xffff +#define AP_INIT_GDTR_LIMIT 0xffff +#define AP_INIT_IDTR_LIMIT 0xffff +#define AP_INIT_TR_LIMIT 0xffff +#define AP_INIT_RFLAGS_DEFAULT 0x2 +#define AP_INIT_DR6_DEFAULT 0xffff0ff0 +#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define AP_INIT_XCR0_DEFAULT 0x1 +#define AP_INIT_X87_FTW_DEFAULT 0x5555 +#define AP_INIT_X87_FCW_DEFAULT 0x0040 +#define AP_INIT_CR0_DEFAULT 0x60000010 +#define AP_INIT_MXCSR_DEFAULT 0x1f80 + /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); @@ -90,6 +108,8 @@ struct ghcb_state { static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); +static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -823,6 +843,228 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages) pvalidate_pages(vaddr, npages, true); } +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) +#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) +#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) + +#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) +#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) + +static void *snp_alloc_vmsa_page(void) +{ + struct page *p; + + /* + * Allocate VMSA page to work around the SNP erratum where the CPU will + * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) + * collides with the RMP entry of VMSA page. The recommended workaround + * is to not use a large page. + * + * Allocate an 8k page which is also 8k-aligned. + */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ + __free_page(p); + + return page_address(p + 1); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +{ + struct sev_es_save_area *cur_vmsa, *vmsa; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u8 sipi_vector; + int cpu, ret; + u64 cr4; + + /* + * The hypervisor SNP feature support check has happened earlier, just check + * the AP_CREATION one here. + */ + if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) + return -EOPNOTSUPP; + + /* + * Verify the desired start IP against the known trampoline start IP + * to catch any future new trampolines that may be introduced that + * would require a new protected guest entry point. 
+ */ + if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, + "Unsupported SNP start_ip: %lx\n", start_ip)) + return -EINVAL; + + /* Override start_ip with known protected guest start IP */ + start_ip = real_mode_header->sev_es_trampoline_start; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apic_id)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; + + cur_vmsa = per_cpu(sev_vmsa, cpu); + + /* + * A new VMSA is created each time because there is no guarantee that + * the current VMSA is the kernels or that the vCPU is not running. If + * an attempt was done to use the current VMSA with a running vCPU, a + * #VMEXIT of that vCPU would wipe out all of the settings being done + * here. + */ + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + if (!vmsa) + return -ENOMEM; + + /* CR4 should maintain the MCE value */ + cr4 = native_read_cr4() & X86_CR4_MCE; + + /* Set the CS value based on the start_ip converted to a SIPI vector */ + sipi_vector = (start_ip >> 12); + vmsa->cs.base = sipi_vector << 12; + vmsa->cs.limit = AP_INIT_CS_LIMIT; + vmsa->cs.attrib = INIT_CS_ATTRIBS; + vmsa->cs.selector = sipi_vector << 8; + + /* Set the RIP value based on start_ip */ + vmsa->rip = start_ip & 0xfff; + + /* Set AP INIT defaults as documented in the APM */ + vmsa->ds.limit = AP_INIT_DS_LIMIT; + vmsa->ds.attrib = INIT_DS_ATTRIBS; + vmsa->es = vmsa->ds; + vmsa->fs = vmsa->ds; + vmsa->gs = vmsa->ds; + vmsa->ss = vmsa->ds; + + vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; + vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; + vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; + vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; + vmsa->tr.limit = AP_INIT_TR_LIMIT; + vmsa->tr.attrib = INIT_TR_ATTRIBS; + + vmsa->cr4 = cr4; + vmsa->cr0 = AP_INIT_CR0_DEFAULT; + vmsa->dr7 = DR7_RESET_VALUE; + vmsa->dr6 = AP_INIT_DR6_DEFAULT; + vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; + vmsa->g_pat = AP_INIT_GPAT_DEFAULT; + vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; + vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; + vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; + vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + + /* SVME must be set. 
*/ + vmsa->efer = EFER_SVME; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + /* Switch the page over to a VMSA page now that it is initialized */ + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("set VMSA page failed (%u)\n", ret); + free_page((unsigned long)vmsa); + + return -EINVAL; + } + + /* Issue VMGEXIT AP Creation NAE event */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_rax(ghcb, vmsa->sev_features); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP Creation error\n"); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Perform cleanup if there was an error */ + if (ret) { + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(sev_vmsa, cpu) = vmsa; + + return ret; +} + +void snp_set_wakeup_secondary_cpu(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Always set this override if SNP is enabled. This makes it the + * required method to start APs under SNP. If the hypervisor does + * not support AP creation, then no APs will be started. + */ + apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; +} + int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2ef14772dc04..85d7b1c5eb70 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -82,6 +82,7 @@ #include #include #include +#include /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -1430,6 +1431,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); + + snp_set_wakeup_secondary_cpu(); } void arch_thaw_secondary_cpus_begin(void) -- cgit From 469693d8f62299709e8ba56d8fb3da9ea990213c Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:10:17 -0600 Subject: x86/head/64: Re-enable stack protection Due to 103a4908ad4d ("x86/head/64: Disable stack protection for head$(BITS).o") kernel/head{32,64}.c are compiled with -fno-stack-protector to allow a call to set_bringup_idt_handler(), which would otherwise have stack protection enabled with CONFIG_STACKPROTECTOR_STRONG. While sufficient for that case, there may still be issues with calls to any external functions that were compiled with stack protection enabled that in-turn make stack-protected calls, or if the exception handlers set up by set_bringup_idt_handler() make calls to stack-protected functions. Subsequent patches for SEV-SNP CPUID validation support will introduce both such cases. Attempting to disable stack protection for everything in scope to address that is prohibitive since much of the code, like the SEV-ES #VC handler, is shared code that remains in use after boot and could benefit from having stack protection enabled. 
Attempting to inline calls is brittle and can quickly balloon out to library/helper code where that's not really an option. Instead, re-enable stack protection for head32.c/head64.c, and make the appropriate changes to ensure the segment used for the stack canary is initialized in advance of any stack-protected C calls. For head64.c: - The BSP will enter from startup_64() and call into C code (startup_64_setup_env()) shortly after setting up the stack, which may result in calls to stack-protected code. Set up %gs early to allow for this safely. - APs will enter from secondary_startup_64*(), and %gs will be set up soon after. There is one call to C code prior to %gs being setup (__startup_secondary_64()), but it is only to fetch 'sme_me_mask' global, so just load 'sme_me_mask' directly instead, and remove the now-unused __startup_secondary_64() function. For head32.c: - BSPs/APs will set %fs to __BOOT_DS prior to any C calls. In recent kernels, the compiler is configured to access the stack canary at %fs:__stack_chk_guard [1], which overlaps with the initial per-cpu '__stack_chk_guard' variable in the initial/"master" .data..percpu area. This is sufficient to allow access to the canary for use during initial startup, so no changes are needed there. [1] 3fb0fdb3bbe7 ("x86/stackprotector/32: Make the canary into a regular percpu variable") [ bp: Massage commit message. ] Suggested-by: Joerg Roedel #for 64-bit %gs set up Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-24-brijesh.singh@amd.com --- arch/x86/include/asm/setup.h | 1 - arch/x86/kernel/Makefile | 2 -- arch/x86/kernel/head64.c | 9 --------- arch/x86/kernel/head_64.S | 24 +++++++++++++++++++++--- 4 files changed, 21 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 896e48d45828..a1b107f2a12a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -50,7 +50,6 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); -extern unsigned long __startup_secondary_64(void); extern void startup_64_setup_env(unsigned long physbase); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c41ef42adbe8..1a2dc328cb5e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,8 +46,6 @@ endif # non-deterministic coverage. KCOV_INSTRUMENT := n -CFLAGS_head$(BITS).o += -fno-stack-protector - CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 656d2f3e2cf0..c185f4831498 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -318,15 +318,6 @@ unsigned long __head __startup_64(unsigned long physaddr, return sme_postprocess_startup(bp, pmd); } -unsigned long __startup_secondary_64(void) -{ - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. 
- */ - return sme_get_me_mask(); -} - /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6bf340c91165..7bac9a4bdf7c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,22 @@ SYM_CODE_START_NOALIGN(startup_64) leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp leaq _text(%rip), %rdi + + /* + * initial_gs points to initial fixed_percpu_data struct with storage for + * the stack protector canary. Global pointer fixups are needed at this + * stage, so apply them as is done in fixup_pointer(), and initialize %gs + * such that the canary can be accessed at %gs:40 for subsequent C calls. + */ + movl $MSR_GS_BASE, %ecx + movq initial_gs(%rip), %rax + movq $_text, %rdx + subq %rdx, %rax + addq %rdi, %rax + movq %rax, %rdx + shrq $32, %rdx + wrmsr + pushq %rsi call startup_64_setup_env popq %rsi @@ -147,9 +163,11 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ - pushq %rsi - call __startup_secondary_64 - popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + movq sme_me_mask, %rax +#else + xorq %rax, %rax +#endif /* Form the CR3 value being sure to include the CR3 modifier */ addq $(init_top_pgt - __START_KERNEL_map), %rax -- cgit From 7c4146e8885512719a50b641e9277a1712e052ff Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:10:18 -0600 Subject: x86/compressed/acpi: Move EFI detection to helper Future patches for SEV-SNP-validated CPUID will also require early parsing of the EFI configuration. Incrementally move the related code into a set of helpers that can be re-used for that purpose. First, carve out the functionality which determines the EFI environment type the machine is booting on. [ bp: Massage commit message. 
] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-25-brijesh.singh@amd.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/acpi.c | 28 ++++++++-------------- arch/x86/boot/compressed/efi.c | 50 +++++++++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/misc.h | 16 +++++++++++++ 4 files changed, 77 insertions(+), 18 deletions(-) create mode 100644 arch/x86/boot/compressed/efi.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6115274fe10f..e69c3d2e0628 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -103,6 +103,7 @@ endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o +vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a $(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 8bcbcee54aa1..db6c561920f0 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -87,7 +87,7 @@ static acpi_physical_address kexec_get_rsdp_addr(void) efi_system_table_64_t *systab; struct efi_setup_data *esd; struct efi_info *ei; - char *sig; + enum efi_type et; esd = (struct efi_setup_data *)get_kexec_setup_data_addr(); if (!esd) @@ -98,10 +98,9 @@ static acpi_physical_address kexec_get_rsdp_addr(void) return 0; } - ei = &boot_params->efi_info; - sig = (char *)&ei->efi_loader_signature; - if (strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { - debug_putstr("Wrong kexec EFI loader signature.\n"); + et = efi_get_type(boot_params); + if (et != EFI_TYPE_64) { + debug_putstr("Unexpected kexec EFI environment (expected 64-bit EFI).\n"); return 0; } @@ -122,29 +121,22 @@ static acpi_physical_address efi_get_rsdp_addr(void) unsigned long systab, config_tables; unsigned int nr_tables; struct efi_info *ei; + enum efi_type et; bool efi_64; - char *sig; - - ei = &boot_params->efi_info; - sig = (char *)&ei->efi_loader_signature; - if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + et = efi_get_type(boot_params); + if (et == EFI_TYPE_64) efi_64 = true; - } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { + else if (et == EFI_TYPE_32) efi_64 = false; - } else { - debug_putstr("Wrong EFI loader signature.\n"); + else return 0; - } /* Get systab from boot params. */ + ei = &boot_params->efi_info; #ifdef CONFIG_X86_64 systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); #else - if (ei->efi_systab_hi || ei->efi_memmap_hi) { - debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n"); - return 0; - } systab = ei->efi_systab; #endif if (!systab) diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c new file mode 100644 index 000000000000..bad0ce3e2ef6 --- /dev/null +++ b/arch/x86/boot/compressed/efi.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for early access to EFI configuration table. + * + * Originally derived from arch/x86/boot/compressed/acpi.c + */ + +#include "misc.h" +#include +#include + +/** + * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. + * + * @bp: pointer to boot_params + * + * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise. 
+ */ +enum efi_type efi_get_type(struct boot_params *bp) +{ + struct efi_info *ei; + enum efi_type et; + const char *sig; + + ei = &bp->efi_info; + sig = (char *)&ei->efi_loader_signature; + + if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_64; + } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_32; + } else { + debug_putstr("No EFI environment detected.\n"); + et = EFI_TYPE_NONE; + } + +#ifndef CONFIG_X86_64 + /* + * Existing callers like acpi.c treat this case as an indicator to + * fall-through to non-EFI, rather than an error, so maintain that + * functionality here as well. + */ + if (ei->efi_systab_hi || ei->efi_memmap_hi) { + debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n"); + et = EFI_TYPE_NONE; + } +#endif + + return et; +} diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 01cc13c12059..fede1afa39e9 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -176,4 +176,20 @@ void boot_stage2_vc(void); unsigned long sev_verify_cbit(unsigned long cr3); +enum efi_type { + EFI_TYPE_64, + EFI_TYPE_32, + EFI_TYPE_NONE, +}; + +#ifdef CONFIG_EFI +/* helpers for early EFI config table access */ +enum efi_type efi_get_type(struct boot_params *bp); +#else +static inline enum efi_type efi_get_type(struct boot_params *bp) +{ + return EFI_TYPE_NONE; +} +#endif /* CONFIG_EFI */ + #endif /* BOOT_COMPRESSED_MISC_H */ -- cgit From 58f3e6b71f42f99ab5d0ab26ddf6e7ee5631f5db Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:05 -0600 Subject: x86/compressed/acpi: Move EFI system table lookup to helper Future patches for SEV-SNP-validated CPUID will also require early parsing of the EFI configuration. Incrementally move the related code into a set of helpers that can be re-used for that purpose. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-26-brijesh.singh@amd.com --- arch/x86/boot/compressed/acpi.c | 21 +++++++-------------- arch/x86/boot/compressed/efi.c | 29 +++++++++++++++++++++++++++++ arch/x86/boot/compressed/misc.h | 6 ++++++ 3 files changed, 42 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index db6c561920f0..58a3d3f3e305 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -105,7 +105,7 @@ static acpi_physical_address kexec_get_rsdp_addr(void) } /* Get systab from boot params. */ - systab = (efi_system_table_64_t *) (ei->efi_systab | ((__u64)ei->efi_systab_hi << 32)); + systab = (efi_system_table_64_t *)efi_get_system_table(boot_params); if (!systab) error("EFI system table not found in kexec boot_params."); @@ -118,9 +118,8 @@ static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } static acpi_physical_address efi_get_rsdp_addr(void) { #ifdef CONFIG_EFI - unsigned long systab, config_tables; + unsigned long systab_pa, config_tables; unsigned int nr_tables; - struct efi_info *ei; enum efi_type et; bool efi_64; @@ -132,24 +131,18 @@ static acpi_physical_address efi_get_rsdp_addr(void) else return 0; - /* Get systab from boot params. 
*/ - ei = &boot_params->efi_info; -#ifdef CONFIG_X86_64 - systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); -#else - systab = ei->efi_systab; -#endif - if (!systab) - error("EFI system table not found."); + systab_pa = efi_get_system_table(boot_params); + if (!systab_pa) + error("EFI support advertised, but unable to locate system table."); /* Handle EFI bitness properly */ if (efi_64) { - efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab; + efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab_pa; config_tables = stbl->tables; nr_tables = stbl->nr_tables; } else { - efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab; + efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab_pa; config_tables = stbl->tables; nr_tables = stbl->nr_tables; diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index bad0ce3e2ef6..72a81ba1f87b 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -48,3 +48,32 @@ enum efi_type efi_get_type(struct boot_params *bp) return et; } + +/** + * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address + * of the EFI system table. + * + * @bp: pointer to boot_params + * + * Return: EFI system table address on success. On error, return 0. + */ +unsigned long efi_get_system_table(struct boot_params *bp) +{ + unsigned long sys_tbl_pa; + struct efi_info *ei; + enum efi_type et; + + /* Get systab from boot params. */ + ei = &bp->efi_info; +#ifdef CONFIG_X86_64 + sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); +#else + sys_tbl_pa = ei->efi_systab; +#endif + if (!sys_tbl_pa) { + debug_putstr("EFI system table not found."); + return 0; + } + + return sys_tbl_pa; +} diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index fede1afa39e9..b2acd3ac6525 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -185,11 +185,17 @@ enum efi_type { #ifdef CONFIG_EFI /* helpers for early EFI config table access */ enum efi_type efi_get_type(struct boot_params *bp); +unsigned long efi_get_system_table(struct boot_params *bp); #else static inline enum efi_type efi_get_type(struct boot_params *bp) { return EFI_TYPE_NONE; } + +static inline unsigned long efi_get_system_table(struct boot_params *bp) +{ + return 0; +} #endif /* CONFIG_EFI */ #endif /* BOOT_COMPRESSED_MISC_H */ -- cgit From 61c14ceda8405f37545a0983d6b9bc45c6236793 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:06 -0600 Subject: x86/compressed/acpi: Move EFI config table lookup to helper Future patches for SEV-SNP-validated CPUID will also require early parsing of the EFI configuration. Incrementally move the related code into a set of helpers that can be re-used for that purpose. [ bp: Remove superfluous zeroing of a stack variable. 
] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-27-brijesh.singh@amd.com --- arch/x86/boot/compressed/acpi.c | 25 ++++++++---------------- arch/x86/boot/compressed/efi.c | 43 +++++++++++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/misc.h | 9 +++++++++ 3 files changed, 60 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 58a3d3f3e305..9a824af69961 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -118,10 +118,13 @@ static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } static acpi_physical_address efi_get_rsdp_addr(void) { #ifdef CONFIG_EFI - unsigned long systab_pa, config_tables; + unsigned long cfg_tbl_pa = 0; + unsigned int cfg_tbl_len; + unsigned long systab_pa; unsigned int nr_tables; enum efi_type et; bool efi_64; + int ret; et = efi_get_type(boot_params); if (et == EFI_TYPE_64) @@ -135,23 +138,11 @@ static acpi_physical_address efi_get_rsdp_addr(void) if (!systab_pa) error("EFI support advertised, but unable to locate system table."); - /* Handle EFI bitness properly */ - if (efi_64) { - efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab_pa; - - config_tables = stbl->tables; - nr_tables = stbl->nr_tables; - } else { - efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab_pa; - - config_tables = stbl->tables; - nr_tables = stbl->nr_tables; - } - - if (!config_tables) - error("EFI config tables not found."); + ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len); + if (ret || !cfg_tbl_pa) + error("EFI config table not found."); - return __efi_get_rsdp_addr(config_tables, nr_tables, efi_64); + return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len, efi_64); #else return 0; #endif diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index 72a81ba1f87b..350838ed80db 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -77,3 +77,46 @@ unsigned long efi_get_system_table(struct boot_params *bp) return sys_tbl_pa; } + +/** + * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical + * address of EFI configuration table. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: location to store physical address of config table + * @cfg_tbl_len: location to store number of config table entries + * + * Return: 0 on success. On error, return params are left unchanged. 
+ */ +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + unsigned long sys_tbl_pa; + enum efi_type et; + int ret; + + if (!cfg_tbl_pa || !cfg_tbl_len) + return -EINVAL; + + sys_tbl_pa = efi_get_system_table(bp); + if (!sys_tbl_pa) + return -EINVAL; + + /* Handle EFI bitness properly */ + et = efi_get_type(bp); + if (et == EFI_TYPE_64) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa; + + *cfg_tbl_pa = stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else if (et == EFI_TYPE_32) { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa; + + *cfg_tbl_pa = stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else { + return -EINVAL; + } + + return 0; +} diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b2acd3ac6525..8815af092a10 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -186,6 +186,8 @@ enum efi_type { /* helpers for early EFI config table access */ enum efi_type efi_get_type(struct boot_params *bp); unsigned long efi_get_system_table(struct boot_params *bp); +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len); #else static inline enum efi_type efi_get_type(struct boot_params *bp) { @@ -196,6 +198,13 @@ static inline unsigned long efi_get_system_table(struct boot_params *bp) { return 0; } + +static inline int efi_get_conf_table(struct boot_params *bp, + unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + return -ENOENT; +} #endif /* CONFIG_EFI */ #endif /* BOOT_COMPRESSED_MISC_H */ -- cgit From dee602dd5d1489b5aa4651c561dfbe90eaee1589 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:07 -0600 Subject: x86/compressed/acpi: Move EFI vendor table lookup to helper Future patches for SEV-SNP-validated CPUID will also require early parsing of the EFI configuration. Incrementally move the related code into a set of helpers that can be re-used for that purpose. [ bp: Unbreak unnecessarily broken lines. ] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-28-brijesh.singh@amd.com --- arch/x86/boot/compressed/acpi.c | 68 ++++++++++++++------------------------- arch/x86/boot/compressed/efi.c | 70 +++++++++++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/misc.h | 13 ++++++++ 3 files changed, 106 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 9a824af69961..b0c1dffc5510 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -20,48 +20,31 @@ */ struct mem_vector immovable_mem[MAX_NUMNODES*2]; -/* - * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and - * ACPI_TABLE_GUID are found, take the former, which has more features. - */ static acpi_physical_address -__efi_get_rsdp_addr(unsigned long config_tables, unsigned int nr_tables, - bool efi_64) +__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) { - acpi_physical_address rsdp_addr = 0; - #ifdef CONFIG_EFI - int i; - - /* Get EFI tables from systab. 
*/ - for (i = 0; i < nr_tables; i++) { - acpi_physical_address table; - efi_guid_t guid; - - if (efi_64) { - efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables + i; - - guid = tbl->guid; - table = tbl->table; - - if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { - debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); - return 0; - } - } else { - efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables + i; - - guid = tbl->guid; - table = tbl->table; - } + unsigned long rsdp_addr; + int ret; - if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) - rsdp_addr = table; - else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) - return table; - } + /* + * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to + * ACPI_TABLE_GUID because it has more features. + */ + rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + ACPI_20_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */ + rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + ACPI_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + debug_putstr("Error getting RSDP address.\n"); #endif - return rsdp_addr; + return 0; } /* EFI/kexec support is 64-bit only. */ @@ -109,7 +92,7 @@ static acpi_physical_address kexec_get_rsdp_addr(void) if (!systab) error("EFI system table not found in kexec boot_params."); - return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true); + return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables); } #else static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } @@ -123,15 +106,10 @@ static acpi_physical_address efi_get_rsdp_addr(void) unsigned long systab_pa; unsigned int nr_tables; enum efi_type et; - bool efi_64; int ret; et = efi_get_type(boot_params); - if (et == EFI_TYPE_64) - efi_64 = true; - else if (et == EFI_TYPE_32) - efi_64 = false; - else + if (et == EFI_TYPE_NONE) return 0; systab_pa = efi_get_system_table(boot_params); @@ -142,7 +120,7 @@ static acpi_physical_address efi_get_rsdp_addr(void) if (ret || !cfg_tbl_pa) error("EFI config table not found."); - return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len, efi_64); + return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len); #else return 0; #endif diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index 350838ed80db..4d363dfc2b33 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -120,3 +120,73 @@ int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, return 0; } + +/* Get vendor table address/guid from EFI config table at the given index */ +static int get_vendor_table(void *cfg_tbl, unsigned int idx, + unsigned long *vendor_tbl_pa, + efi_guid_t *vendor_tbl_guid, + enum efi_type et) +{ + if (et == EFI_TYPE_64) { + efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx; + + if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) { + debug_putstr("Error: EFI config table entry located above 4GB.\n"); + return -EINVAL; + } + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + + } else if (et == EFI_TYPE_32) { + efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx; + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + } else { + return -EINVAL; + } + + return 0; +} + +/** + * efi_find_vendor_table - Given EFI config table, search it for the 
physical + * address of the vendor table associated with GUID. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: pointer to EFI configuration table + * @cfg_tbl_len: number of entries in EFI configuration table + * @guid: GUID of vendor table + * + * Return: vendor table address on success. On error, return 0. + */ +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + enum efi_type et; + unsigned int i; + + et = efi_get_type(bp); + if (et == EFI_TYPE_NONE) + return 0; + + for (i = 0; i < cfg_tbl_len; i++) { + unsigned long vendor_tbl_pa; + efi_guid_t vendor_tbl_guid; + int ret; + + ret = get_vendor_table((void *)cfg_tbl_pa, i, + &vendor_tbl_pa, + &vendor_tbl_guid, et); + if (ret) + return 0; + + if (!efi_guidcmp(guid, vendor_tbl_guid)) + return vendor_tbl_pa; + } + + return 0; +} diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 8815af092a10..ba538af37e90 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -188,6 +189,10 @@ enum efi_type efi_get_type(struct boot_params *bp); unsigned long efi_get_system_table(struct boot_params *bp); int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, unsigned int *cfg_tbl_len); +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid); #else static inline enum efi_type efi_get_type(struct boot_params *bp) { @@ -205,6 +210,14 @@ static inline int efi_get_conf_table(struct boot_params *bp, { return -ENOENT; } + +static inline unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + return 0; +} #endif /* CONFIG_EFI */ #endif /* BOOT_COMPRESSED_MISC_H */ -- cgit From 824f37783189a48db914488fb41eba36ec57ebb7 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:08 -0600 Subject: x86/compressed/acpi: Move EFI kexec handling into common code Future patches for SEV-SNP-validated CPUID will also require early parsing of the EFI configuration. Incrementally move the related code into a set of helpers that can be re-used for that purpose. In this instance, the current acpi.c kexec handling is mainly used to get the alternative EFI config table address provided by kexec via a setup_data entry of type SETUP_EFI. If not present, the code then falls back to normal EFI config table address provided by EFI system table. This would need to be done by all call-sites attempting to access the EFI config table, so just have efi_get_conf_table() handle that automatically. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-29-brijesh.singh@amd.com --- arch/x86/boot/compressed/acpi.c | 59 ----------------------------------------- arch/x86/boot/compressed/efi.c | 46 +++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index b0c1dffc5510..64b172dabd5c 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -47,57 +47,6 @@ __efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) return 0; } -/* EFI/kexec support is 64-bit only. 
*/ -#ifdef CONFIG_X86_64 -static struct efi_setup_data *get_kexec_setup_data_addr(void) -{ - struct setup_data *data; - u64 pa_data; - - pa_data = boot_params->hdr.setup_data; - while (pa_data) { - data = (struct setup_data *)pa_data; - if (data->type == SETUP_EFI) - return (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); - - pa_data = data->next; - } - return NULL; -} - -static acpi_physical_address kexec_get_rsdp_addr(void) -{ - efi_system_table_64_t *systab; - struct efi_setup_data *esd; - struct efi_info *ei; - enum efi_type et; - - esd = (struct efi_setup_data *)get_kexec_setup_data_addr(); - if (!esd) - return 0; - - if (!esd->tables) { - debug_putstr("Wrong kexec SETUP_EFI data.\n"); - return 0; - } - - et = efi_get_type(boot_params); - if (et != EFI_TYPE_64) { - debug_putstr("Unexpected kexec EFI environment (expected 64-bit EFI).\n"); - return 0; - } - - /* Get systab from boot params. */ - systab = (efi_system_table_64_t *)efi_get_system_table(boot_params); - if (!systab) - error("EFI system table not found in kexec boot_params."); - - return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables); -} -#else -static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } -#endif /* CONFIG_X86_64 */ - static acpi_physical_address efi_get_rsdp_addr(void) { #ifdef CONFIG_EFI @@ -210,14 +159,6 @@ acpi_physical_address get_rsdp_addr(void) pa = boot_params->acpi_rsdp_addr; - /* - * Try to get EFI data from setup_data. This can happen when we're a - * kexec'ed kernel and kexec(1) has passed all the required EFI info to - * us. - */ - if (!pa) - pa = kexec_get_rsdp_addr(); - if (!pa) pa = efi_get_rsdp_addr(); diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index 4d363dfc2b33..09fa3b5d70b8 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -78,6 +78,46 @@ unsigned long efi_get_system_table(struct boot_params *bp) return sys_tbl_pa; } +/* + * EFI config table address changes to virtual address after boot, which may + * not be accessible for the kexec'd kernel. To address this, kexec provides + * the initial physical address via a struct setup_data entry, which is + * checked for here, along with some sanity checks. + */ +static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp, + enum efi_type et) +{ +#ifdef CONFIG_X86_64 + struct efi_setup_data *esd = NULL; + struct setup_data *data; + u64 pa_data; + + pa_data = bp->hdr.setup_data; + while (pa_data) { + data = (struct setup_data *)pa_data; + if (data->type == SETUP_EFI) { + esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); + break; + } + + pa_data = data->next; + } + + /* + * Original ACPI code falls back to attempting normal EFI boot in these + * cases, so maintain existing behavior by indicating non-kexec + * environment to the caller, but print them for debugging. + */ + if (esd && !esd->tables) { + debug_putstr("kexec EFI environment missing valid configuration table.\n"); + return NULL; + } + + return esd; +#endif + return NULL; +} + /** * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical * address of EFI configuration table. @@ -106,8 +146,12 @@ int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, et = efi_get_type(bp); if (et == EFI_TYPE_64) { efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa; + struct efi_setup_data *esd; - *cfg_tbl_pa = stbl->tables; + /* kexec provides an alternative EFI conf table, check for it. 
*/ + esd = get_kexec_setup_data(bp, et); + + *cfg_tbl_pa = esd ? esd->tables : stbl->tables; *cfg_tbl_len = stbl->nr_tables; } else if (et == EFI_TYPE_32) { efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa; -- cgit From 9b5a7f4a2a8dcda461f9c7a6671150f4a8a902e8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 31 Mar 2022 10:57:28 -0700 Subject: x86/configs: Add x86 debugging Kconfig fragment plus docs The kernel has a wide variety of debugging options to help catch and squash bugs. However, new debugging is added all the time and the existing options can be hard to find. Add a Kconfig fragment with the debugging options which tip maintainers expect to be used to test contributions. This should make it easier for contributors to test their code and find issues before submission. [ bp: Add to "make help" output, fix DEBUG_INFO selection as pointed out by Nathan Chancellor . ] Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220331175728.299103A0@davehans-spike.ostc.intel.com --- arch/x86/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 63d50f65b828..1abd7cc9d6cd 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -313,5 +313,6 @@ define archhelp echo '' echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest' echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest' + echo ' x86_debug.config - Enable tip tree debugging options for testing' endef -- cgit From 5ea98e01ab524cbc53dad8aebd27b434ebe5d074 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:39 -0600 Subject: x86/boot: Add Confidential Computing type to setup_data While launching encrypted guests, the hypervisor may need to provide some additional information during the guest boot. When booting under an EFI-based BIOS, the EFI configuration table contains an entry for the confidential computing blob that contains the required information. To support booting encrypted guests on non-EFI VMs, the hypervisor needs to pass this additional information to the guest kernel using a different method. For this purpose, introduce SETUP_CC_BLOB type in setup_data to hold the physical address of the confidential computing blob location. The boot loader or hypervisor may choose to use this method instead of an EFI configuration table. The CC blob location scanning should give preference to a setup_data blob over an EFI configuration table. In AMD SEV-SNP, the CC blob contains the address of the secrets and CPUID pages. The secrets page includes information such as a VM to PSP communication key and the CPUID page contains PSP-filtered CPUID values. Define the AMD SEV confidential computing blob structure. While at it, define the EFI GUID for the confidential computing blob. [ bp: Massage commit message, mark struct __packed. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220307213356.2797205-30-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 18 ++++++++++++++++++ arch/x86/include/uapi/asm/bootparam.h | 1 + 2 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index a3203b2caaca..8c934bda2a90 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -42,6 +42,24 @@ struct es_em_ctxt { struct es_fault_info fi; }; +/* + * AMD SEV Confidential computing blob structure. The structure is + * defined in OVMF UEFI firmware header: + * https://github.com/tianocore/edk2/blob/master/OvmfPkg/Include/Guid/ConfidentialComputingSevSnpBlob.h + */ +#define CC_BLOB_SEV_HDR_MAGIC 0x45444d41 +struct cc_blob_sev_info { + u32 magic; + u16 version; + u16 reserved; + u64 secrets_phys; + u32 secrets_len; + u32 rsvd1; + u64 cpuid_phys; + u32 cpuid_len; + u32 rsvd2; +} __packed; + void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); static inline u64 lower_bits(u64 val, unsigned int bits) diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index b25d3f82c2f3..1ac5acca72ce 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -10,6 +10,7 @@ #define SETUP_EFI 4 #define SETUP_APPLE_PROPERTIES 5 #define SETUP_JAILHOUSE 6 +#define SETUP_CC_BLOB 7 #define SETUP_INDIRECT (1<<31) -- cgit From b66370db9a90b3fa4c4a1a732af3e7e38d6d4c7c Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:10 -0600 Subject: KVM: x86: Move lookup of indexed CPUID leafs to helper Determining which CPUID leafs have significant ECX/index values is also needed by guest kernel code when doing SEV-SNP-validated CPUID lookups. Move this to common code to keep future updates in sync. 
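[ Editorial illustration, not part of the original patch: a minimal sketch of how a
  caller outside KVM might use the shared helper once it lives in common code.
  query_cpuid_leaf() is a hypothetical name; cpuid(), cpuid_count() and
  cpuid_function_is_indexed() are the existing/added kernel helpers. ]

	#include <asm/cpuid.h>
	#include <asm/processor.h>

	/* Pass a subfunction only for leaves where ECX is significant. */
	static void query_cpuid_leaf(u32 fn, u32 subfn,
				     u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
	{
		if (cpuid_function_is_indexed(fn))
			cpuid_count(fn, subfn, eax, ebx, ecx, edx);
		else
			cpuid(fn, eax, ebx, ecx, edx);
	}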
Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-31-brijesh.singh@amd.com --- arch/x86/include/asm/cpuid.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/cpuid.c | 19 ++----------------- 2 files changed, 36 insertions(+), 17 deletions(-) create mode 100644 arch/x86/include/asm/cpuid.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h new file mode 100644 index 000000000000..70b2db18165e --- /dev/null +++ b/arch/x86/include/asm/cpuid.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CPUID-related helpers/definitions + * + * Derived from arch/x86/kvm/cpuid.c + */ + +#ifndef _ASM_X86_CPUID_H +#define _ASM_X86_CPUID_H + +static __always_inline bool cpuid_function_is_indexed(u32 function) +{ + switch (function) { + case 4: + case 7: + case 0xb: + case 0xd: + case 0xf: + case 0x10: + case 0x12: + case 0x14: + case 0x17: + case 0x18: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x8000001d: + return true; + } + + return false; +} + +#endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b24ca7f4ed7c..4b62d80bb22f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -744,24 +745,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - switch (function) { - case 4: - case 7: - case 0xb: - case 0xd: - case 0xf: - case 0x10: - case 0x12: - case 0x14: - case 0x17: - case 0x18: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x8000001d: + if (cpuid_function_is_indexed(function)) entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - break; - } return entry; } -- cgit From 801baa693c1f6d7327475c39100c456db340cd3e Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:11 -0600 Subject: x86/sev: Move MSR-based VMGEXITs for CPUID to helper This code will also be used later for SEV-SNP-validated CPUID code in some cases, so move it to a common helper. While here, also add a check to terminate in cases where the CPUID function/subfunction is indexed and the subfunction is non-zero, since the GHCB MSR protocol does not support non-zero subfunctions. 
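[ Editorial illustration, not part of the original patch: a hypothetical caller showing
  the restriction described above. Leaf 0xD is indexed, so requesting subfunction 1
  through the MSR-based helper added here is rejected rather than silently returning
  subfunction-0 data; example_query_xsave_subleaf() is a made-up name and error
  handling is simplified. ]

	static int example_query_xsave_subleaf(void)
	{
		struct cpuid_leaf leaf = { .fn = 0xd, .subfn = 1 };

		/* Fails: the GHCB MSR protocol cannot carry a subfunction. */
		if (sev_cpuid_hv(&leaf))
			return -EINVAL;	/* use the GHCB-page protocol instead */

		return 0;
	}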
Suggested-by: Sean Christopherson Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-32-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 1 + arch/x86/kernel/sev-shared.c | 83 +++++++++++++++++++++++++++++------------- arch/x86/kernel/sev.c | 1 + 3 files changed, 59 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index f31b434e2ce4..7a9cfbc43f4b 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "error.h" #include "../msr.h" diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 3aaef1a18ffe..b4d5558c9d0a 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -14,6 +14,16 @@ #define has_cpuflag(f) boot_cpu_has(f) #endif +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + /* * Since feature negotiation related variables are set early in the boot * process they must reside in the .data section so as not to be zeroed @@ -194,6 +204,44 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, return verify_exception_info(ghcb, ctxt); } +static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) +{ + u64 val; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + return -EIO; + + *reg = (val >> 32); + + return 0; +} + +static int sev_cpuid_hv(struct cpuid_leaf *leaf) +{ + int ret; + + /* + * MSR protocol does not support fetching non-zero subfunctions, but is + * sufficient to handle current early-boot cases. Should that change, + * make sure to report an error rather than ignoring the index and + * grabbing random values. If this issue arises in the future, handling + * can be added here to use GHCB-page protocol for cases that occur late + * enough in boot that GHCB page is available. + */ + if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) + return -EINVAL; + + ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); + ret = ret ? 
: __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); + + return ret; +} + /* * Boot VC Handler - This is the first VC handler during boot, there is no GHCB * page yet, so it only supports the MSR based communication with the @@ -201,40 +249,23 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, */ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { + unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); - unsigned long val; + struct cpuid_leaf leaf; /* Only CPUID is supported via MSR protocol */ if (exit_code != SVM_EXIT_CPUID) goto fail; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->ax = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->bx = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + leaf.fn = fn; + leaf.subfn = subfn; + if (sev_cpuid_hv(&leaf)) goto fail; - regs->cx = val >> 32; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->dx = val >> 32; + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; /* * This is a VC handler and the #VC is only raised when SEV-ES is diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index d7915ae7729d..0c2bf39c4a8f 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -33,6 +33,7 @@ #include #include #include +#include #define DR7_RESET_VALUE 0x400 -- cgit From ee0bfa08a345370df28c07288e886abcbaac481f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:12 -0600 Subject: x86/compressed/64: Add support for SEV-SNP CPUID table in #VC handlers CPUID instructions generate a #VC exception for SEV-ES/SEV-SNP guests, for which early handlers are currently set up to handle. In the case of SEV-SNP, guests can use a configurable location in guest memory that has been pre-populated with a firmware-validated CPUID table to look up the relevant CPUID values rather than requesting them from hypervisor via a VMGEXIT. Add the various hooks in the #VC handlers to allow CPUID instructions to be handled via the table. The code to actually configure/enable the table will be added in a subsequent commit. 
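[ Editorial illustration, not part of the original patch: a hand-rolled table with a
  single entry, only to show the layout a guest expects to find at the CPUID page. In
  practice the table is populated and validated by the SNP firmware; the feature bits
  and C-bit position below are example values, not a statement about real hardware. ]

	static const struct snp_cpuid_table example_cpuid_table = {
		.count = 1,
		.fn = {
			{
				.eax_in = 0x8000001f,	/* SEV feature enumeration leaf */
				.ecx_in = 0,
				.eax    = 0x1a,		/* e.g. SEV, SEV-ES, SEV-SNP bits set */
				.ebx    = 0x33,		/* e.g. C-bit at position 51 */
			},
		},
	};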
Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-33-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 2 + arch/x86/kernel/sev-shared.c | 324 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index e9b6815b3b3d..0759af9b1acf 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -152,6 +152,8 @@ struct snp_psc_desc { #define GHCB_TERM_PSC 1 /* Page State Change failure */ #define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */ #define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */ +#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */ +#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index b4d5558c9d0a..0f1375164ff0 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -24,6 +24,36 @@ struct cpuid_leaf { u32 edx; }; +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + /* * Since feature negotiation related variables are set early in the boot * process they must reside in the .data section so as not to be zeroed @@ -33,6 +63,19 @@ struct cpuid_leaf { */ static u16 ghcb_version __ro_after_init; +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + static bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { @@ -242,6 +285,252 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf) return ret; } +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. 
+ */ +static const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + void *ptr; + + asm ("lea cpuid_table_copy(%%rip), %0" + : "=r" (ptr) + : "p" (&cpuid_table_copy)); + + return ptr; +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. + * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. + */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. 
+ */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv(struct cpuid_leaf *leaf) +{ + if (sev_cpuid_hv(leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + snp_cpuid_hv(&leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +static int snp_cpuid(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. + * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. 
So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(leaf); +} + /* * Boot VC Handler - This is the first VC handler during boot, there is no GHCB * page yet, so it only supports the MSR based communication with the @@ -252,6 +541,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); struct cpuid_leaf leaf; + int ret; /* Only CPUID is supported via MSR protocol */ if (exit_code != SVM_EXIT_CPUID) @@ -259,9 +549,18 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; + + ret = snp_cpuid(&leaf); + if (!ret) + goto cpuid_done; + + if (ret != -EOPNOTSUPP) + goto fail; + if (sev_cpuid_hv(&leaf)) goto fail; +cpuid_done: regs->ax = leaf.eax; regs->bx = leaf.ebx; regs->cx = leaf.ecx; @@ -556,12 +855,37 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static int vc_handle_cpuid_snp(struct pt_regs *regs) +{ + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(&leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + static enum es_result vc_handle_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; u32 cr4 = native_read_cr4(); enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(regs); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; ghcb_set_rax(ghcb, regs->ax); ghcb_set_rcx(ghcb, regs->cx); -- cgit From 8c9c509baf660f1062bc758c26008b7f9bbc39f3 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:13 -0600 Subject: x86/boot: Add a pointer to Confidential Computing blob in bootparams The previously defined Confidential Computing blob is provided to the kernel via a setup_data structure or EFI config table entry. Currently, these are both checked for by boot/compressed kernel to access the CPUID table address within it for use with SEV-SNP CPUID enforcement. To also enable that enforcement for the run-time kernel, similar access to the CPUID table is needed early on while it's still using the identity-mapped page table set up by boot/compressed, where global pointers need to be accessed via fixup_pointer(). This isn't much of an issue for accessing setup_data, and the EFI config table helper code currently used in boot/compressed *could* be used in this case as well since they both rely on identity-mapping. However, it has some reliance on EFI helpers/string constants that would need to be accessed via fixup_pointer(), and fixing it up while making it shareable between boot/compressed and run-time kernel is fragile and introduces a good bit of ugliness. 
Instead, add a boot_params->cc_blob_address pointer that the boot/compressed kernel can initialize so that the run-time kernel can access the CC blob from there instead of re-scanning the EFI config table. Also document these in Documentation/x86/zero-page.rst. While there, add missing documentation for the acpi_rsdp_addr field, which serves a similar purpose in providing the run-time kernel a pointer to the ACPI RSDP table so that it does not need to [re-]scan the EFI configuration table. [ bp: Fix typos, massage commit message. ] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-34-brijesh.singh@amd.com --- arch/x86/include/asm/bootparam_utils.h | 1 + arch/x86/include/uapi/asm/bootparam.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 981fe923a59f..53e9b0620d96 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -74,6 +74,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) BOOT_PARAM_PRESERVE(hdr), BOOT_PARAM_PRESERVE(e820_table), BOOT_PARAM_PRESERVE(eddbuf), + BOOT_PARAM_PRESERVE(cc_blob_address), }; memset(&scratch, 0, sizeof(scratch)); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 1ac5acca72ce..bea5cdcdf532 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -188,7 +188,8 @@ struct boot_params { __u32 ext_ramdisk_image; /* 0x0c0 */ __u32 ext_ramdisk_size; /* 0x0c4 */ __u32 ext_cmd_line_ptr; /* 0x0c8 */ - __u8 _pad4[116]; /* 0x0cc */ + __u8 _pad4[112]; /* 0x0cc */ + __u32 cc_blob_address; /* 0x13c */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ -- cgit From c01fce9cef8491974f7f007f90281f1608400768 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:14 -0600 Subject: x86/compressed: Add SEV-SNP feature detection/setup Initial/preliminary detection of SEV-SNP is done via the Confidential Computing blob. Check for it prior to the normal SEV/SME feature initialization, and add some sanity checks to confirm it agrees with SEV-SNP CPUID/MSR bits. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-35-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 112 ++++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/sev.h | 3 ++ 2 files changed, 114 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 7a9cfbc43f4b..2a94bb7879ae 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -274,6 +274,13 @@ void sev_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; struct msr m; + bool snp; + + /* + * Setup/preliminary detection of SNP. This will be sanity-checked + * against CPUID/MSR values later. + */ + snp = snp_init(bp); /* Check for the SME/SEV support leaf */ eax = 0x80000000; @@ -294,8 +301,11 @@ void sev_enable(struct boot_params *bp) ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); /* Check whether SEV is supported */ - if (!(eax & BIT(1))) + if (!(eax & BIT(1))) { + if (snp) + error("SEV-SNP support indicated by CC blob, but not CPUID."); return; + } /* Set the SME mask if this is an SEV guest. 
*/ boot_rdmsr(MSR_AMD64_SEV, &m); @@ -320,5 +330,105 @@ void sev_enable(struct boot_params *bp) enforce_vmpl0(); } + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); + sme_me_mask = BIT_ULL(ebx & 0x3f); } + +/* Search for Confidential Computing blob in the EFI config table. */ +static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) +{ + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + int ret; + + ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); + if (ret) + return NULL; + + return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, + cfg_table_len, + EFI_CC_BLOB_GUID); +} + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the boot kernel + * by firmware/bootloader in the following ways: + * + * - via an entry in the EFI config table + * - via a setup_data structure, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + cc_info = find_cc_blob_efi(bp); + if (cc_info) + goto found_cc_info; + + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + return cc_info; +} + +/* + * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks + * will verify the SNP CPUID/MSR bits. + */ +bool snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * Pass run-time kernel a pointer to CC info via boot_params so EFI + * config table doesn't need to be searched again during early startup + * phase. 
+ */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 8c934bda2a90..31b3b10323b6 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -11,6 +11,7 @@ #include #include #include +#include #define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 2ULL @@ -151,6 +152,7 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); +bool snp_init(struct boot_params *bp); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -168,6 +170,7 @@ static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } +static inline bool snp_init(struct boot_params *bp) { return false; } #endif #endif -- cgit From 5f211f4fc49622473667e6983bb57beab755f6f6 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:15 -0600 Subject: x86/compressed: Use firmware-validated CPUID leaves for SEV-SNP guests SEV-SNP guests will be provided the location of special 'secrets' 'CPUID' pages via the Confidential Computing blob. This blob is provided to the boot kernel either through an EFI config table entry, or via a setup_data structure as defined by the Linux Boot Protocol. Locate the Confidential Computing from these sources and, if found, use the provided CPUID page/table address to create a copy that the boot kernel will use when servicing CPUID instructions via a #VC CPUID handler. [ bp: s/cpuid/CPUID/ ] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-36-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 2a94bb7879ae..dd3cf55696d4 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -408,6 +408,43 @@ found_cc_info: return cc_info; } +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. + * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. 
+ */ +static void setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} + /* * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks * will verify the SNP CPUID/MSR bits. @@ -423,6 +460,15 @@ bool snp_init(struct boot_params *bp) if (!cc_info) return false; + /* + * If a SNP-specific Confidential Computing blob is present, then + * firmware/bootloader have indicated SNP support. Verifying this + * involves CPUID checks which will be more reliable if the SNP + * CPUID table is used. See comments over snp_setup_cpuid_table() for + * more details. + */ + setup_cpuid_table(cc_info); + /* * Pass run-time kernel a pointer to CC info via boot_params so EFI * config table doesn't need to be searched again during early startup -- cgit From a9ee679b1f8c3803490ed2eeffb688aaee56583f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:16 -0600 Subject: x86/compressed: Export and rename add_identity_map() SEV-specific code will need to add some additional mappings, but doing this within ident_map_64.c requires some SEV-specific helpers to be exported and some SEV-specific struct definitions to be pulled into ident_map_64.c. Instead, export add_identity_map() so SEV-specific (and other subsystem-specific) code can be better contained outside of ident_map_64.c. While at it, rename the function to kernel_add_identity_map(), similar to the kernel_ident_mapping_init() function it relies upon. No functional changes. Suggested-by: Borislav Petkov Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-37-brijesh.singh@amd.com --- arch/x86/boot/compressed/ident_map_64.c | 18 +++++++++--------- arch/x86/boot/compressed/misc.h | 1 + 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 613367e7e09e..99348ee7999b 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -90,7 +90,7 @@ static struct x86_mapping_info mapping_info; /* * Adds the specified range to the identity mappings. */ -static void add_identity_map(unsigned long start, unsigned long end) +void kernel_add_identity_map(unsigned long start, unsigned long end) { int ret; @@ -157,11 +157,11 @@ void initialize_identity_maps(void *rmode) * explicitly here in case the compressed kernel does not touch them, * or does not touch all the pages covering them. 
*/ - add_identity_map((unsigned long)_head, (unsigned long)_end); + kernel_add_identity_map((unsigned long)_head, (unsigned long)_end); boot_params = rmode; - add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); + kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); cmdline = get_cmd_line_ptr(); - add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); + kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); /* Load the new page-table. */ sev_verify_cbit(top_level_pgt); @@ -246,10 +246,10 @@ static int set_clr_page_flags(struct x86_mapping_info *info, * It should already exist, but keep things generic. * * To map the page just read from it and fault it in if there is no - * mapping yet. add_identity_map() can't be called here because that - * would unconditionally map the address on PMD level, destroying any - * PTE-level mappings that might already exist. Use assembly here so - * the access won't be optimized away. + * mapping yet. kernel_add_identity_map() can't be called here because + * that would unconditionally map the address on PMD level, destroying + * any PTE-level mappings that might already exist. Use assembly here + * so the access won't be optimized away. */ asm volatile("mov %[address], %%r9" :: [address] "g" (*(unsigned long *)address) @@ -363,5 +363,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) * Error code is sane - now identity map the 2M region around * the faulting address. */ - add_identity_map(address, end); + kernel_add_identity_map(address, end); } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index ba538af37e90..aae2722c6e9a 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -156,6 +156,7 @@ static inline int count_immovable_mem_regions(void) { return 0; } #ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; #endif +extern void kernel_add_identity_map(unsigned long start, unsigned long end); /* Used by PAGE_KERN* macros: */ extern pteval_t __default_kernel_pte_mask; -- cgit From 76f61e1e89b32f3e5d639f1b57413a919066da06 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:17 -0600 Subject: x86/compressed/64: Add identity mapping for Confidential Computing blob The run-time kernel will need to access the Confidential Computing blob very early during boot to access the CPUID table it points to. At that stage, it will be relying on the identity-mapped page table set up by the boot/compressed kernel, so make sure the blob and the CPUID table it points to are mapped in advance. [ bp: Massage. ] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-38-brijesh.singh@amd.com --- arch/x86/boot/compressed/ident_map_64.c | 3 ++- arch/x86/boot/compressed/misc.h | 2 ++ arch/x86/boot/compressed/sev.c | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 99348ee7999b..44c350d627c7 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -163,8 +163,9 @@ void initialize_identity_maps(void *rmode) cmdline = get_cmd_line_ptr(); kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); + sev_prep_identity_maps(top_level_pgt); + /* Load the new page-table. 
*/ - sev_verify_cbit(top_level_pgt); write_cr3(top_level_pgt); } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index aae2722c6e9a..75d284ec763f 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -127,6 +127,7 @@ void sev_es_shutdown_ghcb(void); extern bool sev_es_check_ghcb_fault(unsigned long address); void snp_set_page_private(unsigned long paddr); void snp_set_page_shared(unsigned long paddr); +void sev_prep_identity_maps(unsigned long top_level_pgt); #else static inline void sev_enable(struct boot_params *bp) { } static inline void sev_es_shutdown_ghcb(void) { } @@ -136,6 +137,7 @@ static inline bool sev_es_check_ghcb_fault(unsigned long address) } static inline void snp_set_page_private(unsigned long paddr) { } static inline void snp_set_page_shared(unsigned long paddr) { } +static inline void sev_prep_identity_maps(unsigned long top_level_pgt) { } #endif /* acpi.c */ diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index dd3cf55696d4..1e4930ce2c95 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -478,3 +478,24 @@ bool snp_init(struct boot_params *bp) return true; } + +void sev_prep_identity_maps(unsigned long top_level_pgt) +{ + /* + * The Confidential Computing blob is used very early in uncompressed + * kernel to find the in-memory CPUID table to handle CPUID + * instructions. Make sure an identity-mapping exists so it can be + * accessed after switchover. + */ + if (sev_snp_enabled()) { + unsigned long cc_info_pa = boot_params->cc_blob_address; + struct cc_blob_sev_info *cc_info; + + kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info)); + + cc_info = (struct cc_blob_sev_info *)cc_info_pa; + kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len); + } + + sev_verify_cbit(top_level_pgt); +} -- cgit From b190a043c49af4587f5e157053f909192820522a Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:18 -0600 Subject: x86/sev: Add SEV-SNP feature detection/setup Initial/preliminary detection of SEV-SNP is done via the Confidential Computing blob. Check for it prior to the normal SEV/SME feature initialization, and add some sanity checks to confirm it agrees with SEV-SNP CPUID/MSR bits. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-39-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 27 ---------------- arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/sev-shared.c | 27 ++++++++++++++++ arch/x86/kernel/sev.c | 64 ++++++++++++++++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_identity.c | 8 +++++ 5 files changed, 101 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 1e4930ce2c95..82079ce7be06 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -352,33 +352,6 @@ static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) EFI_CC_BLOB_GUID); } -struct cc_setup_data { - struct setup_data header; - u32 cc_blob_address; -}; - -/* - * Search for a Confidential Computing blob passed in as a setup_data entry - * via the Linux Boot Protocol. 
- */ -static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) -{ - struct cc_setup_data *sd = NULL; - struct setup_data *hdr; - - hdr = (struct setup_data *)bp->hdr.setup_data; - - while (hdr) { - if (hdr->type == SETUP_CC_BLOB) { - sd = (struct cc_setup_data *)hdr; - return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; - } - hdr = (struct setup_data *)hdr->next; - } - - return NULL; -} - /* * Initial set up of SNP relies on information provided by the * Confidential Computing blob, which can be passed to the boot kernel diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 31b3b10323b6..84d3d5121e9c 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -153,6 +153,7 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); +void snp_abort(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -171,6 +172,7 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npage static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } +static inline void snp_abort(void) { } #endif #endif diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 0f1375164ff0..a7a1c0fb298e 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -937,3 +937,30 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 0c2bf39c4a8f..692da7b29127 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1974,3 +1974,67 @@ fail: while (true) halt(); } + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. 
+ */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __init snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index b43bc24d2bb6..f415498d3175 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -509,8 +510,11 @@ void __init sme_enable(struct boot_params *bp) bool active_by_default; unsigned long me_mask; char buffer[16]; + bool snp; u64 msr; + snp = snp_init(bp); + /* Check for the SME/SEV support leaf */ eax = 0x80000000; ecx = 0; @@ -542,6 +546,10 @@ void __init sme_enable(struct boot_params *bp) sev_status = __rdmsr(MSR_AMD64_SEV); feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */ + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + snp_abort(); + /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { /* -- cgit From 30612045e69d088f1effd748048ebb0e282984ec Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 7 Mar 2022 15:33:49 -0600 Subject: x86/sev: Use firmware-validated CPUID for SEV-SNP guests SEV-SNP guests will be provided the location of special 'secrets' and 'CPUID' pages via the Confidential Computing blob. This blob is provided to the run-time kernel either through a boot_params field that was initialized by the boot/compressed kernel, or via a setup_data structure as defined by the Linux Boot Protocol. Locate the Confidential Computing blob from these sources and, if found, use the provided CPUID page/table address to create a copy that the run-time kernel will use when servicing CPUID instructions via a #VC handler. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-40-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 37 ------------------------------------- arch/x86/kernel/sev-shared.c | 37 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/sev.c | 24 ++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 82079ce7be06..52f989f6acc2 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -381,43 +381,6 @@ found_cc_info: return cc_info; } -/* - * Initialize the kernel's copy of the SNP CPUID table, and set up the - * pointer that will be used to access it. - * - * Maintaining a direct mapping of the SNP CPUID table used by firmware would - * be possible as an alternative, but the approach is brittle since the - * mapping needs to be updated in sync with all the changes to virtual memory - * layout and related mapping facilities throughout the boot process. 
- */ -static void setup_cpuid_table(const struct cc_blob_sev_info *cc_info) -{ - const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; - int i; - - if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); - - cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; - if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); - - cpuid_table = snp_cpuid_get_table(); - memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); - - /* Initialize CPUID ranges for range-checking. */ - for (i = 0; i < cpuid_table->count; i++) { - const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; - - if (fn->eax_in == 0x0) - cpuid_std_range_max = fn->eax; - else if (fn->eax_in == 0x40000000) - cpuid_hyp_range_max = fn->eax; - else if (fn->eax_in == 0x80000000) - cpuid_ext_range_max = fn->eax; - } -} - /* * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks * will verify the SNP CPUID/MSR bits. diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index a7a1c0fb298e..2b4270d5559e 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -964,3 +964,40 @@ static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) return NULL; } + +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. + * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. + */ +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 692da7b29127..c8733725d8bf 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -34,6 +34,7 @@ #include #include #include +#include #define DR7_RESET_VALUE 0x400 @@ -2025,6 +2026,8 @@ bool __init snp_init(struct boot_params *bp) if (!cc_info) return false; + setup_cpuid_table(cc_info); + /* * The CC blob will be used later to access the secrets page. Cache * it here like the boot kernel does. 
@@ -2038,3 +2041,24 @@ void __init snp_abort(void) { sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } + +/* + * It is useful from an auditing/testing perspective to provide an easy way + * for the guest owner to know that the CPUID table has been initialized as + * expected, but that initialization happens too early in boot to print any + * sort of indicator, and there's not really any other good place to do it, + * so do it here. + */ +static int __init report_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return 0; + + pr_info("Using SNP CPUID table, %d entries present.\n", + cpuid_table->count); + + return 0; +} +arch_initcall(report_cpuid_table); -- cgit From ba37a1438aeb540cc48722d629f4b2e7e4398466 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 7 Mar 2022 15:33:50 -0600 Subject: x86/sev: Add a sev= cmdline option For debugging purposes it is very useful to have a way to see the full contents of the SNP CPUID table provided to a guest. Add an sev=debug kernel command-line option to do so. Also introduce some infrastructure so that additional options can be specified via sev=option1[,option2] over time in a consistent manner. [ bp: Massage, simplify string parsing. ] Suggested-by: Borislav Petkov Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-41-brijesh.singh@amd.com --- arch/x86/kernel/sev.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index c8733725d8bf..70ecc6e2f251 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -112,6 +112,13 @@ DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); +struct sev_config { + __u64 debug : 1, + __reserved : 63; +}; + +static struct sev_config sev_cfg __read_mostly; + static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -2042,6 +2049,23 @@ void __init snp_abort(void) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } +static void dump_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i = 0; + + pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", + cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); + + for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", + i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, + fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); + } +} + /* * It is useful from an auditing/testing perspective to provide an easy way * for the guest owner to know that the CPUID table has been initialized as @@ -2059,6 +2083,26 @@ static int __init report_cpuid_table(void) pr_info("Using SNP CPUID table, %d entries present.\n", cpuid_table->count); + if (sev_cfg.debug) + dump_cpuid_table(); + return 0; } arch_initcall(report_cpuid_table); + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); -- cgit From d5af44dde5461d125d1602ac913ab5c6bdf09b8b Mon Sep 17 
00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:51 -0600 Subject: x86/sev: Provide support for SNP guest request NAEs Version 2 of GHCB specification provides SNP_GUEST_REQUEST and SNP_EXT_GUEST_REQUEST NAE that can be used by the SNP guest to communicate with the PSP. While at it, add a snp_issue_guest_request() helper that will be used by driver or other subsystem to issue the request to PSP. See SEV-SNP firmware and GHCB spec for more details. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-42-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 3 +++ arch/x86/include/asm/sev.h | 14 ++++++++++ arch/x86/include/uapi/asm/svm.h | 4 +++ arch/x86/kernel/sev.c | 57 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0759af9b1acf..b8357d6ecd47 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -128,6 +128,9 @@ struct snp_psc_desc { struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; } __packed; +/* Guest message request error code */ +#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32) + #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 84d3d5121e9c..c63a1d485c20 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -87,6 +87,14 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); #define RMPADJUST_VMSA_PAGE_BIT BIT(16) +/* SNP Guest message request */ +struct snp_req_data { + unsigned long req_gpa; + unsigned long resp_gpa; + unsigned long data_gpa; + unsigned int data_npages; +}; + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -154,6 +162,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void snp_abort(void); +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -173,6 +182,11 @@ static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npag static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } +static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, + unsigned long *fw_err) +{ + return -ENOTTY; +} #endif #endif diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index cfea5e875088..f69c168391aa 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -109,6 +109,8 @@ #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 #define SVM_VMGEXIT_PSC 0x80000010 +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011 +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 #define SVM_VMGEXIT_AP_CREATION 0x80000013 #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 @@ -225,6 +227,8 @@ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \ + { SVM_VMGEXIT_GUEST_REQUEST, 
"vmgexit_guest_request" }, \ + { SVM_VMGEXIT_EXT_GUEST_REQUEST, "vmgexit_ext_guest_request" }, \ { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 70ecc6e2f251..7237b4178935 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2106,3 +2106,60 @@ static int __init init_sev_config(char *str) return 1; } __setup("sev=", init_sev_config); + +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + if (!fw_err) + return -EINVAL; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = -EIO; + goto e_restore_irq; + } + + vc_ghcb_invalidate(ghcb); + + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + ghcb_set_rax(ghcb, input->data_gpa); + ghcb_set_rbx(ghcb, input->data_npages); + } + + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + if (ret) + goto e_put; + + if (ghcb->save.sw_exit_info_2) { + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && + ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) + input->data_npages = ghcb_get_rbx(ghcb); + + *fw_err = ghcb->save.sw_exit_info_2; + + ret = -EIO; + } + +e_put: + __sev_put_ghcb(&state); +e_restore_irq: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(snp_issue_guest_request); -- cgit From 3a45b3753849c4a12cca2dd176c0192cd2a63e62 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 24 Feb 2022 10:56:21 -0600 Subject: x86/sev: Register SEV-SNP guest request platform device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Version 2 of the GHCB specification provides a Non Automatic Exit (NAE) event type that can be used by the SEV-SNP guest to communicate with the PSP without risk from a malicious hypervisor who wishes to read, alter, drop or replay the messages sent. SNP_LAUNCH_UPDATE can insert two special pages into the guest’s memory: the secrets page and the CPUID page. The PSP firmware populates the contents of the secrets page. The secrets page contains encryption keys used by the guest to interact with the firmware. Because the secrets page is encrypted with the guest’s memory encryption key, the hypervisor cannot read the keys. See SEV-SNP firmware spec for further details on the secrets page format. Create a platform device that the SEV-SNP guest driver can bind to get the platform resources such as encryption key and message id to use to communicate with the PSP. The SEV-SNP guest driver provides a userspace interface to get the attestation report, key derivation, extended attestation report etc. 
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-43-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 4 ++++ arch/x86/kernel/sev.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index c63a1d485c20..9c2d33f1cfee 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -95,6 +95,10 @@ struct snp_req_data { unsigned int data_npages; }; +struct snp_guest_platform_data { + u64 secrets_gpa; +}; + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 7237b4178935..ace43e116b40 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -19,6 +19,9 @@ #include #include #include +#include +#include +#include #include #include @@ -2163,3 +2166,56 @@ e_restore_irq: return ret; } EXPORT_SYMBOL_GPL(snp_issue_guest_request); + +static struct platform_device guest_req_device = { + .name = "snp-guest", + .id = -1, +}; + +static u64 get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. + */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static int __init snp_init_platform_device(void) +{ + struct snp_guest_platform_data data; + u64 gpa; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + gpa = get_secrets_page(); + if (!gpa) + return -ENODEV; + + data.secrets_gpa = gpa; + if (platform_device_add_data(&guest_req_device, &data, sizeof(data))) + return -ENODEV; + + if (platform_device_register(&guest_req_device)) + return -ENODEV; + + pr_info("SNP guest platform device initialized.\n"); + return 0; +} +device_initcall(snp_init_platform_device); -- cgit From d812f7c475c6a4dcfff02a85fbfd7a9c87e6a094 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 6 Apr 2022 14:51:47 -0500 Subject: x86/platform/uv: Update NMI Handler for UV5 Update NMI handler for UV5 hardware. A platform register changed, and UV5 only uses one of the two NMI methods used on previous hardware. 
Signed-off-by: Mike Travis Signed-off-by: Steve Wahl Signed-off-by: Borislav Petkov Reviewed-by: Dimitri Sivanich Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220406195149.228164-2-steve.wahl@hpe.com --- arch/x86/platform/uv/uv_nmi.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 1e9ff28bc2e0..61ec3be77328 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -244,8 +244,10 @@ static inline bool uv_nmi_action_is(const char *action) /* Setup which NMI support is present in system */ static void uv_nmi_setup_mmrs(void) { + bool new_nmi_method_only = false; + /* First determine arch specific MMRs to handshake with BIOS */ - if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) { + if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) { /* UV2,3,4 setup */ uvh_nmi_mmrx = UVH_EVENT_OCCURRED0; uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED0_ALIAS; uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT; @@ -255,26 +257,25 @@ static void uv_nmi_setup_mmrs(void) uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2; uvh_nmi_mmrx_req_shift = 62; - } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) { + } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) { /* UV5+ setup */ uvh_nmi_mmrx = UVH_EVENT_OCCURRED1; uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED1_ALIAS; uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED1_EXTIO_INT0_SHFT; uvh_nmi_mmrx_type = "OCRD1-EXTIO_INT0"; - uvh_nmi_mmrx_supported = UVH_EXTIO_INT0_BROADCAST; - uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2; - uvh_nmi_mmrx_req_shift = 62; + new_nmi_method_only = true; /* Newer nmi always valid on UV5+ */ + uvh_nmi_mmrx_req = 0; /* no request bit to clear */ } else { - pr_err("UV:%s:cannot find EVENT_OCCURRED*_EXTIO_INT0\n", - __func__); + pr_err("UV:%s:NMI support not available on this system\n", __func__); return; } /* Then find out if new NMI is supported */ - if (likely(uv_read_local_mmr(uvh_nmi_mmrx_supported))) { - uv_write_local_mmr(uvh_nmi_mmrx_req, - 1UL << uvh_nmi_mmrx_req_shift); + if (new_nmi_method_only || uv_read_local_mmr(uvh_nmi_mmrx_supported)) { + if (uvh_nmi_mmrx_req) + uv_write_local_mmr(uvh_nmi_mmrx_req, + 1UL << uvh_nmi_mmrx_req_shift); nmi_mmr = uvh_nmi_mmrx; nmi_mmr_clear = uvh_nmi_mmrx_clear; nmi_mmr_pending = 1UL << uvh_nmi_mmrx_shift; -- cgit From bb3ab81bdbd53f88f26ffabc9fb15bd8466486ec Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 6 Apr 2022 14:51:48 -0500 Subject: x86/platform/uv: Update TSC sync state for UV5 The UV5 platform synchronizes the TSCs among all chassis, and will not proceed to OS boot without achieving synchronization. Previous UV platforms provided a register indicating successful synchronization. This is no longer available on UV5. On this platform TSC_ADJUST should not be reset by the kernel. 
Signed-off-by: Mike Travis Signed-off-by: Steve Wahl Signed-off-by: Borislav Petkov Reviewed-by: Dimitri Sivanich Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220406195149.228164-3-steve.wahl@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5a48e66e4f5..a6e9c2794ef5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -199,7 +199,13 @@ static void __init uv_tsc_check_sync(void) int mmr_shift; char *state; - /* Different returns from different UV BIOS versions */ + /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */ + if (!is_uv(UV2|UV3|UV4)) { + mark_tsc_async_resets("UV5+"); + return; + } + + /* UV2,3,4, UV BIOS TSC sync state available */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; -- cgit From 327c348988c6f0bacd7abd29c151f37bdf1e2a02 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 6 Apr 2022 14:51:49 -0500 Subject: x86/platform/uv: Log gap hole end size Show value of gap end in the kernel log which equates to number of physical address bits used by system. Signed-off-by: Mike Travis Signed-off-by: Steve Wahl Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220406195149.228164-4-steve.wahl@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index a6e9c2794ef5..482855227964 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -1346,7 +1346,7 @@ static void __init decode_gam_params(unsigned long ptr) static void __init decode_gam_rng_tbl(unsigned long ptr) { struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; - unsigned long lgre = 0; + unsigned long lgre = 0, gend = 0; int index = 0; int sock_min = 999999, pnode_min = 99999; int sock_max = -1, pnode_max = -1; @@ -1380,6 +1380,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) flag, size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT; + /* update to next range start */ lgre = gre->limit; if (sock_min > gre->sockid) @@ -1397,7 +1400,8 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) _max_pnode = pnode_max; _gr_table_len = index; - pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend)); } /* Walk through UVsystab decoding the fields */ -- cgit From 59bd54a84d15e9335de5b8abe7b3b9713a36b99b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:10 +0300 Subject: x86/tdx: Detect running as a TDX guest in early boot In preparation of extending cc_platform_has() API to support TDX guest, use CPUID instruction to detect support for TDX guests in the early boot code (via tdx_early_init()). Since copy_bootdata() is the first user of cc_platform_has() API, detect the TDX guest status before it. 
Define a synthetic feature flag (X86_FEATURE_TDX_GUEST) and set this bit in a valid TDX guest platform. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-2-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 12 ++++++++++++ arch/x86/coco/Makefile | 2 ++ arch/x86/coco/tdx/Makefile | 3 +++ arch/x86/coco/tdx/tdx.c | 22 ++++++++++++++++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/include/asm/tdx.h | 21 +++++++++++++++++++++ arch/x86/kernel/head64.c | 4 ++++ 8 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 arch/x86/coco/tdx/Makefile create mode 100644 arch/x86/coco/tdx/tdx.c create mode 100644 arch/x86/include/asm/tdx.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..4ae27322869d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -878,6 +878,18 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. +config INTEL_TDX_GUEST + bool "Intel TDX (Trust Domain Extensions) - Guest Support" + depends on X86_64 && CPU_SUP_INTEL + depends on X86_X2APIC + help + Support running as a guest under Intel TDX. Without this support, + the guest kernel can not boot or run under TDX. + TDX includes memory encryption and integrity capabilities + which protect the confidentiality and integrity of guest + memory contents and CPU state. TDX guests are protected from + some attacks from the VMM. + endif #HYPERVISOR_GUEST source "arch/x86/Kconfig.cpu" diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile index c1ead00017a7..c816acf78b6a 100644 --- a/arch/x86/coco/Makefile +++ b/arch/x86/coco/Makefile @@ -4,3 +4,5 @@ KASAN_SANITIZE_core.o := n CFLAGS_core.o += -fno-stack-protector obj-y += core.o + +obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile new file mode 100644 index 000000000000..c929d53ee059 --- /dev/null +++ b/arch/x86/coco/tdx/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += tdx.o diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c new file mode 100644 index 000000000000..97674471fd1e --- /dev/null +++ b/arch/x86/coco/tdx/tdx.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021-2022 Intel Corporation */ + +#undef pr_fmt +#define pr_fmt(fmt) "tdx: " fmt + +#include +#include + +void __init tdx_early_init(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; + + setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); + + pr_info("Guest detected\n"); +} diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..20df73b51025 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -238,6 +238,7 @@ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ #define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 
9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 1231d63f836d..b37de8268c9a 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -68,6 +68,12 @@ # define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) #endif +#ifdef CONFIG_INTEL_TDX_GUEST +# define DISABLE_TDX_GUEST 0 +#else +# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -79,7 +85,7 @@ #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 0 +#define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h new file mode 100644 index 000000000000..ba8042ce61c2 --- /dev/null +++ b/arch/x86/include/asm/tdx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2022 Intel Corporation */ +#ifndef _ASM_X86_TDX_H +#define _ASM_X86_TDX_H + +#include + +#define TDX_CPUID_LEAF_ID 0x21 +#define TDX_IDENT "IntelTDX " + +#ifdef CONFIG_INTEL_TDX_GUEST + +void __init tdx_early_init(void); + +#else + +static inline void tdx_early_init(void) { }; + +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4f5ecbbaae77..6dff50c3edd6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * Manage page tables very early on. @@ -514,6 +515,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) idt_setup_early_handler(); + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + copy_bootdata(__va(real_mode_data)); /* -- cgit From 527a534c732604931959e73e9c3a8952d8c1a994 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:11 +0300 Subject: x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers Secure Arbitration Mode (SEAM) is an extension of VMX architecture. It defines a new VMX root operation (SEAM VMX root) and a new VMX non-root operation (SEAM VMX non-root) which are both isolated from the legacy VMX operation where the host kernel runs. A CPU-attested software module (called 'TDX module') runs in SEAM VMX root to manage and protect VMs running in SEAM VMX non-root. SEAM VMX root is also used to host another CPU-attested software module (called 'P-SEAMLDR') to load and update the TDX module. Host kernel transits to either P-SEAMLDR or TDX module via the new SEAMCALL instruction, which is essentially a VMExit from VMX root mode to SEAM VMX root mode. SEAMCALLs are leaf functions defined by P-SEAMLDR and TDX module around the new SEAMCALL instruction. A guest kernel can also communicate with TDX module via TDCALL instruction. TDCALLs and SEAMCALLs use an ABI different from the x86-64 system-v ABI. RAX is used to carry both the SEAMCALL leaf function number (input) and the completion status (output). Additional GPRs (RCX, RDX, R8-R11) may be further used as both input and output operands in individual leaf. TDCALL and SEAMCALL share the same ABI and require the largely same code to pass down arguments and retrieve results. Define an assembly macro that can be used to implement C wrapper for both TDCALL and SEAMCALL. 
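To make that register mapping concrete, a hypothetical C-level view of one such call is sketched below. The names example_call_output and example_module_call are illustrative only; the actual C wrappers built on top of this assembly macro are introduced in follow-on patches.

	/* Illustrative sketch only -- not part of this patch. */
	#include <linux/types.h>

	struct example_call_output {
		u64 rcx, rdx, r8, r9, r10, r11;	/* leaf-specific output registers */
	};

	/*
	 * RAX carries the leaf function number on input and the completion
	 * status on output; RCX, RDX, R8 and R9 carry leaf-specific inputs.
	 */
	u64 example_module_call(u64 leaf, u64 rcx, u64 rdx, u64 r8, u64 r9,
				struct example_call_output *out);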
Suggested-by: Thomas Gleixner Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-3-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/tdx.h | 29 +++++++++++++ arch/x86/kernel/asm-offsets.c | 9 ++++ arch/x86/virt/vmx/tdx/tdxcall.S | 96 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 arch/x86/virt/vmx/tdx/tdxcall.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index ba8042ce61c2..cb4c4e607c43 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -4,10 +4,38 @@ #define _ASM_X86_TDX_H #include +#include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +/* + * SW-defined error codes. + * + * Bits 47:40 == 0xFF indicate Reserved status code class that never used by + * TDX module. + */ +#define TDX_ERROR _BITUL(63) +#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) +#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) + +#ifndef __ASSEMBLY__ + +/* + * Used to gather the output registers values of the TDCALL and SEAMCALL + * instructions when requesting services from the TDX module. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_module_output { + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 r10; + u64 r11; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); @@ -18,4 +46,5 @@ static inline void tdx_early_init(void) { }; #endif /* CONFIG_INTEL_TDX_GUEST */ +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9fb0a2f8b62a..7dca52f5cfc6 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_XEN #include @@ -65,6 +66,14 @@ static void __used common(void) OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); #endif + BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S new file mode 100644 index 000000000000..49a54356ae99 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdxcall.S @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include + +/* + * TDCALL and SEAMCALL are supported in Binutils >= 2.36. + */ +#define tdcall .byte 0x66,0x0f,0x01,0xcc +#define seamcall .byte 0x66,0x0f,0x01,0xcf + +/* + * TDX_MODULE_CALL - common helper macro for both + * TDCALL and SEAMCALL instructions. + * + * TDCALL - used by TDX guests to make requests to the + * TDX module and hypercalls to the VMM. + * SEAMCALL - used by TDX hosts to make requests to the + * TDX module. + */ +.macro TDX_MODULE_CALL host:req + /* + * R12 will be used as temporary storage for struct tdx_module_output + * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL + * services supported by this function, it can be reused. + */ + + /* Callee saved, so preserve it */ + push %r12 + + /* + * Push output pointer to stack. + * After the operation, it will be fetched into R12 register. 
+ */ + push %r9 + + /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */ + /* Move Leaf ID to RAX */ + mov %rdi, %rax + /* Move input 4 to R9 */ + mov %r8, %r9 + /* Move input 3 to R8 */ + mov %rcx, %r8 + /* Move input 1 to RCX */ + mov %rsi, %rcx + /* Leave input param 2 in RDX */ + + .if \host + seamcall + /* + * SEAMCALL instruction is essentially a VMExit from VMX root + * mode to SEAM VMX root mode. VMfailInvalid (CF=1) indicates + * that the targeted SEAM firmware is not loaded or disabled, + * or P-SEAMLDR is busy with another SEAMCALL. %rax is not + * changed in this case. + * + * Set %rax to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid. + * This value will never be used as actual SEAMCALL error code as + * it is from the Reserved status code class. + */ + jnc .Lno_vmfailinvalid + mov $TDX_SEAMCALL_VMFAILINVALID, %rax +.Lno_vmfailinvalid: + + .else + tdcall + .endif + + /* + * Fetch output pointer from stack to R12 (It is used + * as temporary storage) + */ + pop %r12 + + /* + * Since this macro can be invoked with NULL as an output pointer, + * check if caller provided an output struct before storing output + * registers. + * + * Update output registers, even if the call failed (RAX != 0). + * Other registers may contain details of the failure. + */ + test %r12, %r12 + jz .Lno_output_struct + + /* Copy result registers to output struct: */ + movq %rcx, TDX_MODULE_rcx(%r12) + movq %rdx, TDX_MODULE_rdx(%r12) + movq %r8, TDX_MODULE_r8(%r12) + movq %r9, TDX_MODULE_r9(%r12) + movq %r10, TDX_MODULE_r10(%r12) + movq %r11, TDX_MODULE_r11(%r12) + +.Lno_output_struct: + /* Restore the state of R12 register */ + pop %r12 +.endm -- cgit From eb94f1b6a70a683040d60d21bbb6ad65f082600a Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:12 +0300 Subject: x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions Guests communicate with VMMs with hypercalls. Historically, these are implemented using instructions that are known to cause VMEXITs like VMCALL, VMLAUNCH, etc. However, with TDX, VMEXITs no longer expose the guest state to the host. This prevents the old hypercall mechanisms from working. So, to communicate with VMM, TDX specification defines a new instruction called TDCALL. In a TDX based VM, since the VMM is an untrusted entity, an intermediary layer -- TDX module -- facilitates secure communication between the host and the guest. TDX module is loaded like a firmware into a special CPU mode called SEAM. TDX guests communicate with the TDX module using the TDCALL instruction. A guest uses TDCALL to communicate with both the TDX module and VMM. The value of the RAX register when executing the TDCALL instruction is used to determine the TDCALL type. A leaf of TDCALL used to communicate with the VMM is called TDVMCALL. Add generic interfaces to communicate with the TDX module and VMM (using the TDCALL instruction). __tdx_module_call() - Used to communicate with the TDX module (via TDCALL instruction). __tdx_hypercall() - Used by the guest to request services from the VMM (via TDVMCALL leaf of TDCALL). Also define an additional wrapper _tdx_hypercall(), which adds error handling support for the TDCALL failure. The __tdx_module_call() and __tdx_hypercall() helper functions are implemented in assembly in a .S file. The TDCALL ABI requires shuffling arguments in and out of registers, which proved to be awkward with inline assembly. Just like syscalls, not all TDVMCALL use cases need to use the same number of argument registers. 
The implementation here picks the current worst-case scenario for TDCALL (4 registers). For TDCALLs with fewer than 4 arguments, there will end up being a few superfluous (cheap) instructions. But, this approach maximizes code reuse. For registers used by the TDCALL instruction, please check TDX GHCI specification, the section titled "TDCALL instruction" and "TDG.VP.VMCALL Interface". Based on previous patch by Sean Christopherson. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20220405232939.73860-4-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/Makefile | 2 +- arch/x86/coco/tdx/tdcall.S | 191 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/coco/tdx/tdx.c | 24 ++++++ arch/x86/include/asm/tdx.h | 30 +++++++ arch/x86/kernel/asm-offsets.c | 8 ++ 5 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 arch/x86/coco/tdx/tdcall.S (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index c929d53ee059..46c55998557d 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o +obj-y += tdx.o tdcall.o diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S new file mode 100644 index 000000000000..662479ccf630 --- /dev/null +++ b/arch/x86/coco/tdx/tdcall.S @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#include +#include +#include + +#include "../../virt/vmx/tdx/tdxcall.S" + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \ + TDX_R12 | TDX_R13 | \ + TDX_R14 | TDX_R15 ) + +/* + * __tdx_module_call() - Used by TDX guests to request services from + * the TDX module (does not include VMM services) using TDCALL instruction. + * + * Transforms function call register arguments into the TDCALL register ABI. + * After TDCALL operation, TDX module output is saved in @out (if it is + * provided by the user). + * + *------------------------------------------------------------------------- + * TDCALL ABI: + *------------------------------------------------------------------------- + * Input Registers: + * + * RAX - TDCALL Leaf number. + * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers. + * + * Output Registers: + * + * RAX - TDCALL instruction error code. + * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers. + * + *------------------------------------------------------------------------- + * + * __tdx_module_call() function ABI: + * + * @fn (RDI) - TDCALL Leaf ID, moved to RAX + * @rcx (RSI) - Input parameter 1, moved to RCX + * @rdx (RDX) - Input parameter 2, moved to RDX + * @r8 (RCX) - Input parameter 3, moved to R8 + * @r9 (R8) - Input parameter 4, moved to R9 + * + * @out (R9) - struct tdx_module_output pointer + * stored temporarily in R12 (not + * shared with the TDX module). It + * can be NULL. 
+ * + * Return status of TDCALL via RAX. + */ +SYM_FUNC_START(__tdx_module_call) + FRAME_BEGIN + TDX_MODULE_CALL host=0 + FRAME_END + ret +SYM_FUNC_END(__tdx_module_call) + +/* + * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf + * of TDCALL instruction + * + * Transforms values in function call argument struct tdx_hypercall_args @args + * into the TDCALL register ABI. After TDCALL operation, VMM output is saved + * back in @args. + * + *------------------------------------------------------------------------- + * TD VMCALL ABI: + *------------------------------------------------------------------------- + * + * Input Registers: + * + * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL) + * RCX - BITMAP which controls which part of TD Guest GPR + * is passed as-is to the VMM and back. + * R10 - Set 0 to indicate TDCALL follows standard TDX ABI + * specification. Non zero value indicates vendor + * specific ABI. + * R11 - VMCALL sub function number + * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments. + * R8-R9, R12-R15 - Same as above. + * + * Output Registers: + * + * RAX - TDCALL instruction status (Not related to hypercall + * output). + * R10 - Hypercall output error code. + * R11-R15 - Hypercall sub function specific output values. + * + *------------------------------------------------------------------------- + * + * __tdx_hypercall() function ABI: + * + * @args (RDI) - struct tdx_hypercall_args for input and output + * @flags (RSI) - TDX_HCALL_* flags + * + * On successful completion, return the hypercall error code. + */ +SYM_FUNC_START(__tdx_hypercall) + FRAME_BEGIN + + /* Save callee-saved GPRs as mandated by the x86_64 ABI */ + push %r15 + push %r14 + push %r13 + push %r12 + + /* Mangle function call ABI into TDCALL ABI: */ + /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */ + xor %eax, %eax + + /* Copy hypercall registers from arg struct: */ + movq TDX_HYPERCALL_r10(%rdi), %r10 + movq TDX_HYPERCALL_r11(%rdi), %r11 + movq TDX_HYPERCALL_r12(%rdi), %r12 + movq TDX_HYPERCALL_r13(%rdi), %r13 + movq TDX_HYPERCALL_r14(%rdi), %r14 + movq TDX_HYPERCALL_r15(%rdi), %r15 + + movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx + + tdcall + + /* + * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that + * something has gone horribly wrong with the TDX module. + * + * The return status of the hypercall operation is in a separate + * register (in R10). Hypercall errors are a part of normal operation + * and are handled by callers. + */ + testq %rax, %rax + jne .Lpanic + + /* TDVMCALL leaf return code is in R10 */ + movq %r10, %rax + + /* Copy hypercall result registers to arg struct if needed */ + testq $TDX_HCALL_HAS_OUTPUT, %rsi + jz .Lout + + movq %r10, TDX_HYPERCALL_r10(%rdi) + movq %r11, TDX_HYPERCALL_r11(%rdi) + movq %r12, TDX_HYPERCALL_r12(%rdi) + movq %r13, TDX_HYPERCALL_r13(%rdi) + movq %r14, TDX_HYPERCALL_r14(%rdi) + movq %r15, TDX_HYPERCALL_r15(%rdi) +.Lout: + /* + * Zero out registers exposed to the VMM to avoid speculative execution + * with VMM-controlled values. This needs to include all registers + * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15 + * context will be restored. 
+ */ + xor %r10d, %r10d + xor %r11d, %r11d + + /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + FRAME_END + + retq +.Lpanic: + call __tdx_hypercall_failed + /* __tdx_hypercall_failed never returns */ + jmp .Lpanic +SYM_FUNC_END(__tdx_hypercall) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 97674471fd1e..4b57880e45b0 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -7,6 +7,30 @@ #include #include +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. + */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = fn, + .r12 = r12, + .r13 = r13, + .r14 = r14, + .r15 = r15, + }; + + return __tdx_hypercall(&args, 0); +} + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + panic("TDVMCALL failed. TDX module bug?"); +} + void __init tdx_early_init(void) { u32 eax, sig[3]; diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index cb4c4e607c43..a33d47abe67d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -3,12 +3,17 @@ #ifndef _ASM_X86_TDX_H #define _ASM_X86_TDX_H +#include #include #include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +#define TDX_HYPERCALL_STANDARD 0 + +#define TDX_HCALL_HAS_OUTPUT BIT(0) + /* * SW-defined error codes. * @@ -36,10 +41,35 @@ struct tdx_module_output { u64 r11; }; +/* + * Used in __tdx_hypercall() to pass down and get back registers' values of + * the TDCALL instruction when requesting services from the VMM. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_hypercall_args { + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); +/* Used to communicate with the TDX module */ +u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out); + +/* Used to request services from the VMM */ +u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 7dca52f5cfc6..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -74,6 +74,14 @@ static void __used common(void) OFFSET(TDX_MODULE_r10, tdx_module_output, r10); OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + BLANK(); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); -- cgit From 41394e33f3a0ce791caf0e086e1fca850832ddec Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 6 Apr 2022 02:29:13 +0300 Subject: x86/tdx: Extend the confidential computing API to support TDX guests Confidential Computing (CC) features (like string I/O unroll support, memory encryption/decryption support, etc) are conditionally enabled in the kernel using cc_platform_has() API. Since TDX guests also need to use these CC features, extend cc_platform_has() API and add TDX guest-specific CC attributes support. CC API also provides an interface to deal with encryption mask. Extend it to cover TDX. Details about which bit in the page table entry to be used to indicate shared/private state is determined by using the TDINFO TDCALL. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20220405232939.73860-5-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 1 + arch/x86/coco/core.c | 12 ++++++++++++ arch/x86/coco/tdx/tdx.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4ae27322869d..984315ca0275 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -882,6 +882,7 @@ config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC + select ARCH_HAS_CC_PLATFORM help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index fc1365dd927e..3f3008783e05 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -87,9 +87,18 @@ EXPORT_SYMBOL_GPL(cc_platform_has); u64 cc_mkenc(u64 val) { + /* + * Both AMD and Intel use a bit in the page table to indicate + * encryption status of the page. + * + * - for AMD, bit *set* means the page is encrypted + * - for Intel *clear* means encrypted. + */ switch (vendor) { case CC_VENDOR_AMD: return val | cc_mask; + case CC_VENDOR_INTEL: + return val & ~cc_mask; default: return val; } @@ -97,9 +106,12 @@ u64 cc_mkenc(u64 val) u64 cc_mkdec(u64 val) { + /* See comment in cc_mkenc() */ switch (vendor) { case CC_VENDOR_AMD: return val & ~cc_mask; + case CC_VENDOR_INTEL: + return val | cc_mask; default: return val; } diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 4b57880e45b0..96b2611baac5 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -5,8 +5,12 @@ #define pr_fmt(fmt) "tdx: " fmt #include +#include #include +/* TDX module Call Leaf IDs */ +#define TDX_GET_INFO 1 + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. @@ -31,8 +35,47 @@ void __tdx_hypercall_failed(void) panic("TDVMCALL failed. TDX module bug?"); } +/* + * Used for TDX guests to make calls directly to the TD module. This + * should only be used for calls that have no legitimate reason to fail + * or where the kernel can not survive the call failing. + */ +static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out) +{ + if (__tdx_module_call(fn, rcx, rdx, r8, r9, out)) + panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); +} + +static u64 get_cc_mask(void) +{ + struct tdx_module_output out; + unsigned int gpa_width; + + /* + * TDINFO TDX module call is used to get the TD execution environment + * information like GPA width, number of available vcpus, debug mode + * information, etc. 
More details about the ABI can be found in TDX + * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL + * [TDG.VP.INFO]. + * + * The GPA width that comes out of this call is critical. TDX guests + * can not meaningfully run without it. + */ + tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out); + + gpa_width = out.rcx & GENMASK(5, 0); + + /* + * The highest bit of a guest physical address is the "sharing" bit. + * Set it for shared pages and clear it for private pages. + */ + return BIT_ULL(gpa_width - 1); +} + void __init tdx_early_init(void) { + u64 cc_mask; u32 eax, sig[3]; cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); @@ -42,5 +85,9 @@ void __init tdx_early_init(void) setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); + cc_set_vendor(CC_VENDOR_INTEL); + cc_mask = get_cc_mask(); + cc_set_mask(cc_mask); + pr_info("Guest detected\n"); } -- cgit From 65fab5bc033aad1a9faf976caec46558c2f88319 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:14 +0300 Subject: x86/tdx: Exclude shared bit from __PHYSICAL_MASK In TDX guests, by default memory is protected from host access. If a guest needs to communicate with the VMM (like the I/O use case), it uses a single bit in the physical address to communicate the protected/shared attribute of the given page. In the x86 ARCH code, __PHYSICAL_MASK macro represents the width of the physical address in the given architecture. It is used in creating physical PAGE_MASK for address bits in the kernel. Since in TDX guest, a single bit is used as metadata, it needs to be excluded from valid physical address bits to avoid using incorrect addresses bits in the kernel. Enable DYNAMIC_PHYSICAL_MASK to support updating the __PHYSICAL_MASK. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-6-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 1 + arch/x86/coco/tdx/tdx.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 984315ca0275..aea4cc404c31 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -883,6 +883,7 @@ config INTEL_TDX_GUEST depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC select ARCH_HAS_CC_PLATFORM + select DYNAMIC_PHYSICAL_MASK help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 96b2611baac5..e84f6dd3ed2a 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -89,5 +89,13 @@ void __init tdx_early_init(void) cc_mask = get_cc_mask(); cc_set_mask(cc_mask); + /* + * All bits above GPA width are reserved and kernel treats shared bit + * as flag, not as part of physical address. + * + * Adjust physical mask to only cover valid GPA bits. + */ + physical_mask &= cc_mask - 1; + pr_info("Guest detected\n"); } -- cgit From 775acc82a88fd36f5e89a08d39874fdeeaa04247 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:15 +0300 Subject: x86/traps: Refactor exc_general_protection() TDX brings a new exception -- Virtualization Exception (#VE). Handling of #VE structurally very similar to handling #GP. Extract two helpers from exc_general_protection() that can be reused for handling #VE. No functional changes. 
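For illustration, a rough sketch of how the two extracted helpers are intended to be reused by another fault handler, written as it would appear in arch/x86/kernel/traps.c next to those helpers. The function name is hypothetical; this mirrors what the follow-on #VE patch does.

	/* Illustrative sketch only -- reuse pattern for the extracted helpers. */
	static void example_raise_fault(struct pt_regs *regs, long error_code,
					const char *str)
	{
		if (user_mode(regs)) {
			gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, str);
			return;
		}

		if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, str))
			return;

		die_addr(str, regs, error_code, 0);
	}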
Suggested-by: Thomas Gleixner Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-7-kirill.shutemov@linux.intel.com --- arch/x86/kernel/traps.c | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1563fb995005..db8d22a0d003 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -686,13 +686,40 @@ static bool try_fixup_enqcmd_gp(void) #endif } +static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + if (fixup_exception(regs, trapnr, error_code, 0)) + return true; + + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, trapnr)) + return true; + + return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; +} + +static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + show_signal(current, SIGSEGV, "", str, regs, error_code); + force_sig(SIGSEGV); +} + DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; - struct task_struct *tsk; unsigned long gp_addr; - int ret; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; @@ -711,40 +738,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) return; } - tsk = current; - if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; - show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV); + gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } - if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - goto exit; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - - /* - * To be potentially processing a kprobe fault and to trust the result - * from kprobe_running(), we have to be non-preemptible. - */ - if (!preemptible() && - kprobe_running() && - kprobe_fault_handler(regs, X86_TRAP_GP)) - goto exit; - - ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); - if (ret == NOTIFY_STOP) + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) goto exit; if (error_code) -- cgit From 9a22bf6debbf5169f750af53c7f86eb4e3cd6712 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:16 +0300 Subject: x86/traps: Add #VE support for TDX guest Virtualization Exceptions (#VE) are delivered to TDX guests due to specific guest actions which may happen in either user space or the kernel: * Specific instructions (WBINVD, for example) * Specific MSR accesses * Specific CPUID leaf accesses * Access to specific guest physical addresses Syscall entry code has a critical window where the kernel stack is not yet set up. Any exception in this window leads to hard to debug issues and can be exploited for privilege escalation. Exceptions in the NMI entry code also cause issues. 
Returning from the exception handler with IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. For these reasons, the kernel avoids #VEs during the syscall gap and the NMI entry code. Entry code paths do not access TD-shared memory, MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves that might generate #VE. VMM can remove memory from TD at any point, but access to unaccepted (or missing) private memory leads to VM termination, not to #VE. Similarly to page faults and breakpoints, #VEs are allowed in NMI handlers once the kernel is ready to deal with nested NMIs. During #VE delivery, all interrupts, including NMIs, are blocked until TDGETVEINFO is called. It prevents #VE nesting until the kernel reads the VE info. TDGETVEINFO retrieves the #VE info from the TDX module, which also clears the "#VE valid" flag. This must be done before anything else as any #VE that occurs while the valid flag is set escalates to #DF by TDX module. It will result in an oops. Virtual NMIs are inhibited if the #VE valid flag is set. NMI will not be delivered until TDGETVEINFO is called. For now, convert unhandled #VE's (everything, until later in this series) so that they appear just like a #GP by calling the ve_raise_fault() directly. The ve_raise_fault() function is similar to #GP handler and is responsible for sending SIGSEGV to userspace and CPU die and notifying debuggers and other die chain users. Co-developed-by: Sean Christopherson Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-8-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 38 ++++++++++++++++++ arch/x86/include/asm/idtentry.h | 4 ++ arch/x86/include/asm/tdx.h | 21 ++++++++++ arch/x86/kernel/idt.c | 3 ++ arch/x86/kernel/traps.c | 86 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 152 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index e84f6dd3ed2a..60a3f2ff5b95 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -10,6 +10,7 @@ /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 +#define TDX_GET_VEINFO 3 /* * Wrapper for standard use of __tdx_hypercall with no output aside from @@ -73,6 +74,43 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +void tdx_get_ve_info(struct ve_info *ve) +{ + struct tdx_module_output out; + + /* + * Called during #VE handling to retrieve the #VE info from the + * TDX module. + * + * This has to be called early in #VE handling. A "nested" #VE which + * occurs before this will raise a #DF and is not recoverable. + * + * The call retrieves the #VE info from the TDX module, which also + * clears the "#VE valid" flag. This must be done before anything else + * because any #VE that occurs while the valid flag is set will lead to + * #DF. + * + * Note, the TDX module treats virtual NMIs as inhibited if the #VE + * valid flag is set. It means that NMI=>#VE will not result in a #DF. 
+ */ + tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out); + + /* Transfer the output parameters */ + ve->exit_reason = out.rcx; + ve->exit_qual = out.rdx; + ve->gla = out.r8; + ve->gpa = out.r9; + ve->instr_len = lower_32_bits(out.r10); + ve->instr_info = upper_32_bits(out.r10); +} + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) +{ + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + + return false; +} + void __init tdx_early_init(void) { u64 cc_mask; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 7924f27f5c8b..72184b0b2219 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -632,6 +632,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); #endif +#ifdef CONFIG_INTEL_TDX_GUEST +DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception); +#endif + /* Device interrupts common/spurious */ DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index a33d47abe67d..c4142e7b004c 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -6,6 +6,7 @@ #include #include #include +#include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " @@ -56,6 +57,22 @@ struct tdx_hypercall_args { u64 r15; }; +/* + * Used by the #VE exception handler to gather the #VE exception + * info from the TDX module. This is a software only structure + * and not part of the TDX module/VMM ABI. + */ +struct ve_info { + u64 exit_reason; + u64 exit_qual; + /* Guest Linear (virtual) Address */ + u64 gla; + /* Guest Physical Address */ + u64 gpa; + u32 instr_len; + u32 instr_info; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); @@ -70,6 +87,10 @@ u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); /* Called from __tdx_hypercall() for unrecoverable failure */ void __tdx_hypercall_failed(void); +void tdx_get_ve_info(struct ve_info *ve); + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 608eb63bf044..a58c6bc1cd68 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -69,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = { */ INTG(X86_TRAP_PF, asm_exc_page_fault), #endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), +#endif }; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index db8d22a0d003..f9fb6530338f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,6 +62,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -1348,6 +1349,91 @@ DEFINE_IDTENTRY(exc_device_not_available) } } +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + return; + + die_addr(VE_FAULT_STR, regs, error_code, 0); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * 
Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted. + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. + * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. + * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0); + + cond_local_irq_disable(regs); +} + +#endif + #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { -- cgit From bfe6ed0c672782ac2a8edffac93b1ba84b0ff984 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:17 +0300 Subject: x86/tdx: Add HLT support for TDX guests The HLT instruction is a privileged instruction, executing it stops instruction execution and places the processor in a HALT state. It is used in kernel for cases like reboot, idle loop and exception fixup handlers. For the idle case, interrupts will be enabled (using STI) before the HLT instruction (this is also called safe_halt()). To support the HLT instruction in TDX guests, it needs to be emulated using TDVMCALL (hypercall to VMM). More details about it can be found in Intel Trust Domain Extensions (Intel TDX) Guest-Host-Communication Interface (GHCI) specification, section TDVMCALL[Instruction.HLT]. In TDX guests, executing HLT instruction will generate a #VE, which is used to emulate the HLT instruction. But #VE based emulation will not work for the safe_halt() flavor, because it requires STI instruction to be executed just before the TDCALL. 
Since idle loop is the only user of safe_halt() variant, handle it as a special case. To avoid *safe_halt() call in the idle function, define the tdx_guest_idle() and use it to override the "x86_idle" function pointer for a valid TDX guest. Alternative choices like PV ops have been considered for adding safe_halt() support. But it was rejected because HLT paravirt calls only exist under PARAVIRT_XXL, and enabling it in TDX guest just for safe_halt() use case is not worth the cost. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-9-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdcall.S | 13 +++++++ arch/x86/coco/tdx/tdx.c | 93 +++++++++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/tdx.h | 4 ++ arch/x86/kernel/process.c | 4 ++ 4 files changed, 112 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index 662479ccf630..245888290bb6 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -139,6 +139,19 @@ SYM_FUNC_START(__tdx_hypercall) movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx + /* + * For the idle loop STI needs to be called directly before the TDCALL + * that enters idle (EXIT_REASON_HLT case). STI instruction enables + * interrupts only one instruction later. If there is a window between + * STI and the instruction that emulates the HALT state, there is a + * chance for interrupts to happen in this window, which can delay the + * HLT operation indefinitely. Since this is the not the desired + * result, conditionally call STI before TDCALL. + */ + testq $TDX_HCALL_ISSUE_STI, %rsi + jz .Lskip_sti + sti +.Lskip_sti: tdcall /* diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 60a3f2ff5b95..ed7302581cc7 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -7,6 +7,7 @@ #include #include #include +#include /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 @@ -36,6 +37,17 @@ void __tdx_hypercall_failed(void) panic("TDVMCALL failed. TDX module bug?"); } +/* + * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined + * independently from but are currently matched 1:1 with VMX EXIT_REASONs. + * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and + * guest sides of these calls. + */ +static u64 hcall_func(u64 exit_reason) +{ + return exit_reason; +} + /* * Used for TDX guests to make calls directly to the TD module. This * should only be used for calls that have no legitimate reason to fail @@ -74,6 +86,62 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_HLT), + .r12 = irq_disabled, + }; + + /* + * Emulate HLT operation via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI), section 3.8 TDG.VP.VMCALL. + * + * The VMM uses the "IRQ disabled" param to understand IRQ + * enabled status (RFLAGS.IF) of the TD guest and to determine + * whether or not it should schedule the halted vCPU if an + * IRQ becomes pending. E.g. if IRQs are disabled, the VMM + * can keep the vCPU in virtual HLT, even if an IRQ is + * pending, without hanging/breaking the guest. 
+ */ + return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0); +} + +static bool handle_halt(void) +{ + /* + * Since non safe halt is mainly used in CPU offlining + * and the guest will always stay in the halt state, don't + * call the STI instruction (set do_sti as false). + */ + const bool irq_disabled = irqs_disabled(); + const bool do_sti = false; + + if (__halt(irq_disabled, do_sti)) + return false; + + return true; +} + +void __cpuidle tdx_safe_halt(void) +{ + /* + * For do_sti=true case, __tdx_hypercall() function enables + * interrupts using the STI instruction before the TDCALL. So + * set irq_disabled as false. + */ + const bool irq_disabled = false; + const bool do_sti = true; + + /* + * Use WARN_ONCE() to report the failure. + */ + if (__halt(irq_disabled, do_sti)) + WARN_ONCE(1, "HLT instruction emulation failed\n"); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -104,11 +172,32 @@ void tdx_get_ve_info(struct ve_info *ve) ve->instr_info = upper_32_bits(out.r10); } +/* Handle the kernel #VE */ +static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_HLT: + return handle_halt(); + default: + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + return false; + } +} + bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) { - pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + bool ret; + + if (user_mode(regs)) + ret = false; + else + ret = virt_exception_kernel(regs, ve); + + /* After successful #VE handling, move the IP */ + if (ret) + regs->ip += ve->instr_len; - return false; + return ret; } void __init tdx_early_init(void) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index c4142e7b004c..cbd61e142f4e 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -14,6 +14,7 @@ #define TDX_HYPERCALL_STANDARD 0 #define TDX_HCALL_HAS_OUTPUT BIT(0) +#define TDX_HCALL_ISSUE_STI BIT(1) /* * SW-defined error codes. @@ -91,9 +92,12 @@ void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); +void tdx_safe_halt(void); + #else static inline void tdx_early_init(void) { }; +static inline void tdx_safe_halt(void) { }; #endif /* CONFIG_INTEL_TDX_GUEST */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..dbaf12c43fe1 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "process.h" @@ -873,6 +874,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } -- cgit From ae87f609cd52825fa7fa36f02b29e4357fd29eaa Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:18 +0300 Subject: x86/tdx: Add MSR support for TDX guests Use hypercall to emulate MSR read/write for the TDX platform. There are two viable approaches for doing MSRs in a TD guest: 1. Execute the RDMSR/WRMSR instructions like most VMs and bare metal do. Some will succeed, others will cause a #VE. All of those that cause a #VE will be handled with a TDCALL. 2. Use paravirt infrastructure. The paravirt hook has to keep a list of which MSRs would cause a #VE and use a TDCALL. 
All other MSRs execute RDMSR/WRMSR instructions directly. The second option can be ruled out because the list of MSRs was challenging to maintain. That leaves option #1 as the only viable solution for the minimal TDX support. Kernel relies on the exception fixup machinery to handle MSR access errors. #VE handler uses the same exception fixup code as #GP. It covers MSR accesses along with other types of fixups. For performance-critical MSR writes (like TSC_DEADLINE), future patches will replace the WRMSR/#VE sequence with the direct TDCALL. RDMSR and WRMSR specification details can be found in Guest-Host-Communication Interface (GHCI) for Intel Trust Domain Extensions (Intel TDX) specification, sec titled "TDG.VP. VMCALL" and "TDG.VP.VMCALL". Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-10-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index ed7302581cc7..00ff0a830970 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -142,6 +142,44 @@ void __cpuidle tdx_safe_halt(void) WARN_ONCE(1, "HLT instruction emulation failed\n"); } +static bool read_msr(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_MSR_READ), + .r12 = regs->cx, + }; + + /* + * Emulate the MSR read via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI), section titled "TDG.VP.VMCALL". + */ + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + + regs->ax = lower_32_bits(args.r11); + regs->dx = upper_32_bits(args.r11); + return true; +} + +static bool write_msr(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_MSR_WRITE), + .r12 = regs->cx, + .r13 = (u64)regs->dx << 32 | regs->ax, + }; + + /* + * Emulate the MSR write via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI) section titled "TDG.VP.VMCALL". + */ + return !__tdx_hypercall(&args, 0); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -178,6 +216,10 @@ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) switch (ve->exit_reason) { case EXIT_REASON_HLT: return handle_halt(); + case EXIT_REASON_MSR_READ: + return read_msr(regs); + case EXIT_REASON_MSR_WRITE: + return write_msr(regs); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); return false; -- cgit From c141fa2c2bbaff0d1f2dc51160626dd22bc16ae2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:19 +0300 Subject: x86/tdx: Handle CPUID via #VE In TDX guests, most CPUID leaf/sub-leaf combinations are virtualized by the TDX module while some trigger #VE. Implement the #VE handling for EXIT_REASON_CPUID by handing it through the hypercall, which in turn lets the TDX module handle it by invoking the host VMM. More details on CPUID Virtualization can be found in the TDX module specification, the section titled "CPUID Virtualization". Note that VMM that handles the hypercall is not trusted. 
It can return data that may steer the guest kernel in wrong direct. Only allow VMM to control range reserved for hypervisor communication. Return all-zeros for any CPUID outside the hypervisor range. It matches CPU behaviour for non-supported leaf. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-11-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 00ff0a830970..50c3b97d6db7 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -180,6 +180,48 @@ static bool write_msr(struct pt_regs *regs) return !__tdx_hypercall(&args, 0); } +static bool handle_cpuid(struct pt_regs *regs) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_CPUID), + .r12 = regs->ax, + .r13 = regs->cx, + }; + + /* + * Only allow VMM to control range reserved for hypervisor + * communication. + * + * Return all-zeros for any CPUID outside the range. It matches CPU + * behaviour for non-supported leaf. + */ + if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) { + regs->ax = regs->bx = regs->cx = regs->dx = 0; + return true; + } + + /* + * Emulate the CPUID instruction via a hypercall. More info about + * ABI can be found in TDX Guest-Host-Communication Interface + * (GHCI), section titled "VP.VMCALL". + */ + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + + /* + * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of + * EAX, EBX, ECX, EDX registers after the CPUID instruction execution. + * So copy the register contents back to pt_regs. + */ + regs->ax = args.r12; + regs->bx = args.r13; + regs->cx = args.r14; + regs->dx = args.r15; + + return true; +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -210,6 +252,18 @@ void tdx_get_ve_info(struct ve_info *ve) ve->instr_info = upper_32_bits(out.r10); } +/* Handle the user initiated #VE */ +static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_CPUID: + return handle_cpuid(regs); + default: + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + return false; + } +} + /* Handle the kernel #VE */ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) { @@ -220,6 +274,8 @@ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) return read_msr(regs); case EXIT_REASON_MSR_WRITE: return write_msr(regs); + case EXIT_REASON_CPUID: + return handle_cpuid(regs); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); return false; @@ -231,7 +287,7 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) bool ret; if (user_mode(regs)) - ret = false; + ret = virt_exception_user(regs, ve); else ret = virt_exception_kernel(regs, ve); -- cgit From 31d58c4e557d46fa7f8557714250fb6f89c941ae Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:20 +0300 Subject: x86/tdx: Handle in-kernel MMIO In non-TDX VMs, MMIO is implemented by providing the guest a mapping which will cause a VMEXIT on access and then the VMM emulating the instruction that caused the VMEXIT. 
That's not possible for TDX VM. To emulate an instruction an emulator needs two things: - R/W access to the register file to read/modify instruction arguments and see RIP of the faulted instruction. - Read access to memory where instruction is placed to see what to emulate. In this case it is guest kernel text. Both of them are not available to VMM in TDX environment: - Register file is never exposed to VMM. When a TD exits to the module, it saves registers into the state-save area allocated for that TD. The module then scrubs these registers before returning execution control to the VMM, to help prevent leakage of TD state. - TDX does not allow guests to execute from shared memory. All executed instructions are in TD-private memory. Being private to the TD, VMMs have no way to access TD-private memory and no way to read the instruction to decode and emulate it. In TDX the MMIO regions are instead configured by VMM to trigger a #VE exception in the guest. Add #VE handling that emulates the MMIO instruction inside the guest and converts it into a controlled hypercall to the host. This approach is bad for performance. But, it has (virtually) no impact on the size of the kernel image and will work for a wide variety of drivers. This allows TDX deployments to use arbitrary devices and device drivers, including virtio. TDX customers have asked for the capability to use random devices in their deployments. In other words, even if all of the work was done to paravirtualize all x86 MMIO users and virtio, this approach would still be needed. There is essentially no way to get rid of this code. This approach is functional for all in-kernel MMIO users current and future and does so with a minimal amount of code and kernel image bloat. MMIO addresses can be used with any CPU instruction that accesses memory. Address only MMIO accesses done via io.h helpers, such as 'readl()' or 'writeq()'. Any CPU instruction that accesses memory can also be used to access MMIO. However, by convention, MMIO access are typically performed via io.h helpers such as 'readl()' or 'writeq()'. The io.h helpers intentionally use a limited set of instructions when accessing MMIO. This known, limited set of instructions makes MMIO instruction decoding and emulation feasible in KVM hosts and SEV guests today. MMIO accesses performed without the io.h helpers are at the mercy of the compiler. Compilers can and will generate a much more broad set of instructions which can not practically be decoded and emulated. TDX guests will oops if they encounter one of these decoding failures. This means that TDX guests *must* use the io.h helpers to access MMIO. This requirement is not new. Both KVM hosts and AMD SEV guests have the same limitations on MMIO access. === Potential alternative approaches === == Paravirtualizing all MMIO == An alternative to letting MMIO induce a #VE exception is to avoid the #VE in the first place. Similar to the port I/O case, it is theoretically possible to paravirtualize MMIO accesses. Like the exception-based approach offered here, a fully paravirtualized approach would be limited to MMIO users that leverage common infrastructure like the io.h macros. However, any paravirtual approach would be patching approximately 120k call sites. Any paravirtual approach would need to replace a bare memory access instruction with (at least) a function call. With a conservative overhead estimation of 5 bytes per call site (CALL instruction), it leads to bloating code by 600k. 
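As an aside, a minimal sketch of the io.h-helper requirement described above (illustrative only, not part of the patch; the register offset and the bare-pointer counterexample are hypothetical):

#include <linux/io.h>

/* 'regs' stands for a hypothetical ioremap()ed MMIO window. */
static void mmio_access_sketch(void __iomem *regs)
{
	u32 v;

	/* Fine in a TDX guest: readl()/writel() emit a small, known set
	 * of instructions that the #VE handler can decode and emulate. */
	v = readl(regs + 0x10);
	writel(v | 0x1, regs + 0x10);

	/* Risky: a bare dereference leaves instruction selection to the
	 * compiler; an undecodable access will oops the guest. */
	v = *(volatile u32 *)(regs + 0x10);
}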
Many drivers will never be used in the TDX environment and the bloat cannot be justified. == Patching TDX drivers == Rather than touching the entire kernel, it might also be possible to just go after drivers that use MMIO in TDX guests *and* are performance critical to justify the effrort. Right now, that's limited only to virtio. All virtio MMIO appears to be done through a single function, which makes virtio eminently easy to patch. This approach will be adopted in the future, removing the bulk of MMIO #VEs. The #VE-based MMIO will remain serving non-virtio use cases. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-12-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 50c3b97d6db7..ab10bc73a7c5 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -8,11 +8,17 @@ #include #include #include +#include +#include /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 #define TDX_GET_VEINFO 3 +/* MMIO direction */ +#define EPT_READ 0 +#define EPT_WRITE 1 + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. @@ -222,6 +228,119 @@ static bool handle_cpuid(struct pt_regs *regs) return true; } +static bool mmio_read(int size, unsigned long addr, unsigned long *val) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION), + .r12 = size, + .r13 = EPT_READ, + .r14 = addr, + .r15 = *val, + }; + + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return false; + *val = args.r11; + return true; +} + +static bool mmio_write(int size, unsigned long addr, unsigned long val) +{ + return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size, + EPT_WRITE, addr, val); +} + +static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) +{ + char buffer[MAX_INSN_SIZE]; + unsigned long *reg, val; + struct insn insn = {}; + enum mmio_type mmio; + int size, extend_size; + u8 extend_val = 0; + + /* Only in-kernel MMIO is supported */ + if (WARN_ON_ONCE(user_mode(regs))) + return false; + + if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) + return false; + + mmio = insn_decode_mmio(&insn, &size); + if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) + return false; + + if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + reg = insn_get_modrm_reg_ptr(&insn, regs); + if (!reg) + return false; + } + + ve->instr_len = insn.length; + + /* Handle writes first */ + switch (mmio) { + case MMIO_WRITE: + memcpy(&val, reg, size); + return mmio_write(size, ve->gpa, val); + case MMIO_WRITE_IMM: + val = insn.immediate.value; + return mmio_write(size, ve->gpa, val); + case MMIO_READ: + case MMIO_READ_ZERO_EXTEND: + case MMIO_READ_SIGN_EXTEND: + /* Reads are handled below */ + break; + case MMIO_MOVS: + case MMIO_DECODE_FAILED: + /* + * MMIO was accessed with an instruction that could not be + * decoded or handled properly. It was likely not using io.h + * helpers or accessed MMIO accidentally. 
+ */ + return false; + default: + WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); + return false; + } + + /* Handle reads */ + if (!mmio_read(size, ve->gpa, &val)) + return false; + + switch (mmio) { + case MMIO_READ: + /* Zero-extend for 32-bit operation */ + extend_size = size == 4 ? sizeof(*reg) : 0; + break; + case MMIO_READ_ZERO_EXTEND: + /* Zero extend based on operand size */ + extend_size = insn.opnd_bytes; + break; + case MMIO_READ_SIGN_EXTEND: + /* Sign extend based on operand size */ + extend_size = insn.opnd_bytes; + if (size == 1 && val & BIT(7)) + extend_val = 0xFF; + else if (size > 1 && val & BIT(15)) + extend_val = 0xFF; + break; + default: + /* All other cases has to be covered with the first switch() */ + WARN_ON_ONCE(1); + return false; + } + + if (extend_size) + memset(reg, extend_val, extend_size); + memcpy(reg, &val, size); + return true; +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -276,6 +395,8 @@ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) return write_msr(regs); case EXIT_REASON_CPUID: return handle_cpuid(regs); + case EXIT_REASON_EPT_VIOLATION: + return handle_mmio(regs, ve); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); return false; -- cgit From 4b05f81504bfcaab51083d3a271fada75e6b8904 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:21 +0300 Subject: x86/tdx: Detect TDX at early kernel decompression time The early decompression code does port I/O for its console output. But, handling the decompression-time port I/O demands a different approach from normal runtime because the IDT required to support #VE based port I/O emulation is not yet set up. Paravirtualizing I/O calls during the decompression step is acceptable because the decompression code doesn't have a lot of call sites to IO instruction. To support port I/O in decompression code, TDX must be detected before the decompression code might do port I/O. Detect whether the kernel runs in a TDX guest. Add an early_is_tdx_guest() interface to query the cached TDX guest status in the decompression code. TDX is detected with CPUID. Make cpuid_count() accessible outside boot/cpuflags.c. TDX detection in the main kernel is very similar. Move common bits into . The actual port I/O paravirtualization will come later in the series. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-13-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/misc.c | 8 ++++++++ arch/x86/boot/compressed/misc.h | 2 ++ arch/x86/boot/compressed/tdx.c | 16 ++++++++++++++++ arch/x86/boot/compressed/tdx.h | 13 +++++++++++++ arch/x86/boot/cpuflags.c | 3 +-- arch/x86/boot/cpuflags.h | 1 + arch/x86/include/asm/shared/tdx.h | 8 ++++++++ arch/x86/include/asm/tdx.h | 4 +--- 9 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 arch/x86/boot/compressed/tdx.c create mode 100644 arch/x86/boot/compressed/tdx.h create mode 100644 arch/x86/include/asm/shared/tdx.h (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6115274fe10f..732f6b21ecbd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,6 +101,7 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1cdcaf34ee36..e8142e977ddb 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -371,6 +371,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; + /* + * Detect TDX guest environment. + * + * It has to be done before console_init() in order to use + * paravirtualized port I/O operations if needed. 
+ */ + early_tdx_detect(); + console_init(); /* diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 16ed360b6692..0d8e275a9d96 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -28,6 +28,8 @@ #include #include +#include "tdx.h" + #define BOOT_CTYPE_H #include diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c new file mode 100644 index 000000000000..5f6d01a2f1f4 --- /dev/null +++ b/arch/x86/boot/compressed/tdx.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../cpuflags.h" +#include "../string.h" + +#include + +void early_tdx_detect(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; +} diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h new file mode 100644 index 000000000000..9055482cd35c --- /dev/null +++ b/arch/x86/boot/compressed/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_TDX_H +#define BOOT_COMPRESSED_TDX_H + +#include + +#ifdef CONFIG_INTEL_TDX_GUEST +void early_tdx_detect(void); +#else +static inline void early_tdx_detect(void) { }; +#endif + +#endif /* BOOT_COMPRESSED_TDX_H */ diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index a0b75f73dc63..a83d67ec627d 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -71,8 +71,7 @@ int has_eflag(unsigned long mask) # define EBX_REG "=b" #endif -static inline void cpuid_count(u32 id, u32 count, - u32 *a, u32 *b, u32 *c, u32 *d) +void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d) { asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" "cpuid \n\t" diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 2e20814d3ce3..475b8fde90f7 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -17,5 +17,6 @@ extern u32 cpu_vendor[3]; int has_eflag(unsigned long mask); void get_cpuflags(void); +void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d); #endif diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h new file mode 100644 index 000000000000..8209ba9ffe1a --- /dev/null +++ b/arch/x86/include/asm/shared/tdx.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_TDX_H +#define _ASM_X86_SHARED_TDX_H + +#define TDX_CPUID_LEAF_ID 0x21 +#define TDX_IDENT "IntelTDX " + +#endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index cbd61e142f4e..81a1ec14e476 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -7,9 +7,7 @@ #include #include #include - -#define TDX_CPUID_LEAF_ID 0x21 -#define TDX_IDENT "IntelTDX " +#include #define TDX_HYPERCALL_STANDARD 0 -- cgit From 15104de122a4f0258981b06ed94cf616a6eb03ef Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:22 +0300 Subject: x86: Adjust types used in port I/O helpers Change port I/O helpers to use u8/u16/u32 instead of unsigned char/short/int for values. Use u16 instead of int for port number. It aligns the helpers with implementation in boot stub in preparation for consolidation. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-14-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/io.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index f6d91ecb8026..638c1a2a82e0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -258,37 +258,37 @@ static inline void slow_down_io(void) #endif #define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ +static inline void out##bwl(type value, u16 port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ -static inline unsigned type in##bwl(int port) \ +static inline type in##bwl(u16 port) \ { \ - unsigned type value; \ + type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } \ \ -static inline void out##bwl##_p(unsigned type value, int port) \ +static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ slow_down_io(); \ } \ \ -static inline unsigned type in##bwl##_p(int port) \ +static inline type in##bwl##_p(u16 port) \ { \ - unsigned type value = in##bwl(port); \ + type value = in##bwl(port); \ slow_down_io(); \ return value; \ } \ \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ - unsigned type *value = (unsigned type *)addr; \ + type *value = (type *)addr; \ while (count) { \ out##bwl(*value, port); \ value++; \ @@ -301,10 +301,10 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ } \ } \ \ -static inline void ins##bwl(int port, void *addr, unsigned long count) \ +static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ - unsigned type *value = (unsigned type *)addr; \ + type *value = (type *)addr; \ while (count) { \ *value = in##bwl(port); \ value++; \ @@ -317,9 +317,9 @@ static inline void ins##bwl(int port, void *addr, unsigned long count) \ } \ } -BUILDIO(b, b, char) -BUILDIO(w, w, short) -BUILDIO(l, , int) +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) #define inb inb #define inw inw -- cgit From 1e8f93e18379d05da9fd130eb7d50988a20f8b9a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:23 +0300 Subject: x86: Consolidate port I/O helpers There are two implementations of port I/O helpers: one in the kernel and one in the boot stub. Move the helpers required for both to and use the one implementation everywhere. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-15-kirill.shutemov@linux.intel.com --- arch/x86/boot/boot.h | 35 +---------------------------------- arch/x86/boot/compressed/misc.h | 2 +- arch/x86/include/asm/io.h | 22 ++-------------------- arch/x86/include/asm/shared/io.h | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 55 deletions(-) create mode 100644 arch/x86/include/asm/shared/io.h (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 34c9dbb6a47d..22a474c5b3e8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "bitops.h" #include "ctype.h" #include "cpuflags.h" @@ -35,40 +36,6 @@ extern struct boot_params boot_params; #define cpu_relax() asm volatile("rep; nop") -/* Basic port I/O */ -static inline void outb(u8 v, u16 port) -{ - asm volatile("outb %0,%1" : : "a" (v), "dN" (port)); -} -static inline u8 inb(u16 port) -{ - u8 v; - asm volatile("inb %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outw(u16 v, u16 port) -{ - asm volatile("outw %0,%1" : : "a" (v), "dN" (port)); -} -static inline u16 inw(u16 port) -{ - u16 v; - asm volatile("inw %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outl(u32 v, u16 port) -{ - asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); -} -static inline u32 inl(u16 port) -{ - u32 v; - asm volatile("inl %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - static inline void io_delay(void) { const u16 DELAY_PORT = 0x80; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0d8e275a9d96..8a253e85f990 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -22,11 +22,11 @@ #include #include #include -#include #include #include #include #include +#include #include "tdx.h" diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 638c1a2a82e0..a1eb218a49f8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -44,6 +44,7 @@ #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -258,20 +259,6 @@ static inline void slow_down_io(void) #endif #define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(type value, u16 port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline type in##bwl(u16 port) \ -{ \ - type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ @@ -320,10 +307,8 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ BUILDIO(b, b, u8) BUILDIO(w, w, u16) BUILDIO(l, , u32) +#undef BUILDIO -#define inb inb -#define inw inw -#define inl inl #define inb_p inb_p #define inw_p inw_p #define inl_p inl_p @@ -331,9 +316,6 @@ BUILDIO(l, , u32) #define insw insw #define insl insl -#define outb outb -#define outw outw -#define outl outl #define outb_p outb_p #define outw_p outw_p #define outl_p outl_p diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h new file mode 100644 index 000000000000..c0ef921c0586 --- /dev/null +++ b/arch/x86/include/asm/shared/io.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_IO_H +#define 
_ASM_X86_SHARED_IO_H + +#include + +#define BUILDIO(bwl, bw, type) \ +static inline void __out##bwl(type value, u16 port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline type __in##bwl(u16 port) \ +{ \ + type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} + +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) +#undef BUILDIO + +#define inb __inb +#define inw __inw +#define inl __inl +#define outb __outb +#define outw __outw +#define outl __outl + +#endif -- cgit From eb4ea1ae8f45e3249e7586f30be8977478202a37 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:24 +0300 Subject: x86/boot: Port I/O: Allow to hook up alternative helpers Port I/O instructions trigger #VE in the TDX environment. In response to the exception, kernel emulates these instructions using hypercalls. But during early boot, on the decompression stage, it is cumbersome to deal with #VE. It is cleaner to go to hypercalls directly, bypassing #VE handling. Add a way to hook up alternative port I/O helpers in the boot stub with a new pio_ops structure. For now, set the ops structure to just call the normal I/O operation functions. out*()/in*() macros redefined to use pio_ops callbacks. It eliminates need in changing call sites. io_delay() changed to use port I/O helper instead of inline assembly. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-16-kirill.shutemov@linux.intel.com --- arch/x86/boot/boot.h | 4 ++-- arch/x86/boot/compressed/misc.c | 4 ++++ arch/x86/boot/compressed/misc.h | 2 +- arch/x86/boot/io.h | 41 +++++++++++++++++++++++++++++++++++++++++ arch/x86/boot/main.c | 4 ++++ arch/x86/realmode/rm/wakemain.c | 4 ++++ 6 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 arch/x86/boot/io.h (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 22a474c5b3e8..b42b91606ca8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,10 +23,10 @@ #include #include #include -#include #include "bitops.h" #include "ctype.h" #include "cpuflags.h" +#include "io.h" /* Useful macros */ #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) @@ -39,7 +39,7 @@ extern struct boot_params boot_params; static inline void io_delay(void) { const u16 DELAY_PORT = 0x80; - asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT)); + outb(0, DELAY_PORT); } /* These functions are used to reference data in other segments. */ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e8142e977ddb..fa8969fad011 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -48,6 +48,8 @@ void *memmove(void *dest, const void *src, size_t n); */ struct boot_params *boot_params; +struct port_io_ops pio_ops; + memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -371,6 +373,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; + init_default_io_ops(); + /* * Detect TDX guest environment. 
* diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 8a253e85f990..ea71cf3d64e1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -26,7 +26,6 @@ #include #include #include -#include #include "tdx.h" @@ -35,6 +34,7 @@ #define BOOT_BOOT_H #include "../ctype.h" +#include "../io.h" #ifdef CONFIG_X86_64 #define memptr long diff --git a/arch/x86/boot/io.h b/arch/x86/boot/io.h new file mode 100644 index 000000000000..110880907f87 --- /dev/null +++ b/arch/x86/boot/io.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_IO_H +#define BOOT_IO_H + +#include + +#undef inb +#undef inw +#undef inl +#undef outb +#undef outw +#undef outl + +struct port_io_ops { + u8 (*f_inb)(u16 port); + void (*f_outb)(u8 v, u16 port); + void (*f_outw)(u16 v, u16 port); +}; + +extern struct port_io_ops pio_ops; + +/* + * Use the normal I/O instructions by default. + * TDX guests override these to use hypercalls. + */ +static inline void init_default_io_ops(void) +{ + pio_ops.f_inb = __inb; + pio_ops.f_outb = __outb; + pio_ops.f_outw = __outw; +} + +/* + * Redirect port I/O operations via pio_ops callbacks. + * TDX guests override these callbacks with TDX-specific helpers. + */ +#define inb pio_ops.f_inb +#define outb pio_ops.f_outb +#define outw pio_ops.f_outw + +#endif diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index e3add857c2c9..1202d4f8a390 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -17,6 +17,8 @@ struct boot_params boot_params __attribute__((aligned(16))); +struct port_io_ops pio_ops; + char *HEAP = _end; char *heap_end = _end; /* Default end of heap = no heap */ @@ -133,6 +135,8 @@ static void init_heap(void) void main(void) { + init_default_io_ops(); + /* First, copy the boot header into the "zeropage" */ copy_boot_params(); diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c index 1d6437e6d2ba..a6f4d8388ad8 100644 --- a/arch/x86/realmode/rm/wakemain.c +++ b/arch/x86/realmode/rm/wakemain.c @@ -62,8 +62,12 @@ static void send_morse(const char *pattern) } } +struct port_io_ops pio_ops; + void main(void) { + init_default_io_ops(); + /* Kill machine if structures are wrong */ if (wakeup_header.real_magic != 0x12345678) while (1) -- cgit From 4c5b9aac6cade51aef64cc6ed67f2ad5acda9aed Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:25 +0300 Subject: x86/boot: Port I/O: Add decompression-time support for TDX Port I/O instructions trigger #VE in the TDX environment. In response to the exception, kernel emulates these instructions using hypercalls. But during early boot, on the decompression stage, it is cumbersome to deal with #VE. It is cleaner to go to hypercalls directly, bypassing #VE handling. Hook up TDX-specific port I/O helpers if booting in TDX environment. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-17-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/tdcall.S | 3 ++ arch/x86/boot/compressed/tdx.c | 61 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/shared/tdx.h | 32 ++++++++++++++++++++ arch/x86/include/asm/tdx.h | 27 ----------------- 5 files changed, 97 insertions(+), 28 deletions(-) create mode 100644 arch/x86/boot/compressed/tdcall.S (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 732f6b21ecbd..8fd0e6ae2e1f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,7 +101,7 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S new file mode 100644 index 000000000000..46d0495e0d3a --- /dev/null +++ b/arch/x86/boot/compressed/tdcall.S @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "../../coco/tdx/tdcall.S" diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 5f6d01a2f1f4..918a7606f53c 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -2,9 +2,65 @@ #include "../cpuflags.h" #include "../string.h" +#include "../io.h" +#include "error.h" + +#include +#include #include +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + error("TDVMCALL failed. 
TDX module bug?"); +} + +static inline unsigned int tdx_io_in(int size, u16 port) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 0, + .r14 = port, + }; + + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return UINT_MAX; + + return args.r11; +} + +static inline void tdx_io_out(int size, u16 port, u32 value) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 1, + .r14 = port, + .r15 = value, + }; + + __tdx_hypercall(&args, 0); +} + +static inline u8 tdx_inb(u16 port) +{ + return tdx_io_in(1, port); +} + +static inline void tdx_outb(u8 value, u16 port) +{ + tdx_io_out(1, port, value); +} + +static inline void tdx_outw(u16 value, u16 port) +{ + tdx_io_out(2, port, value); +} + void early_tdx_detect(void) { u32 eax, sig[3]; @@ -13,4 +69,9 @@ void early_tdx_detect(void) if (memcmp(TDX_IDENT, sig, sizeof(sig))) return; + + /* Use hypercalls instead of I/O instructions */ + pio_ops.f_inb = tdx_inb; + pio_ops.f_outb = tdx_outb; + pio_ops.f_outw = tdx_outw; } diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 8209ba9ffe1a..e53f26228fbb 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -2,7 +2,39 @@ #ifndef _ASM_X86_SHARED_TDX_H #define _ASM_X86_SHARED_TDX_H +#include +#include + +#define TDX_HYPERCALL_STANDARD 0 + +#define TDX_HCALL_HAS_OUTPUT BIT(0) +#define TDX_HCALL_ISSUE_STI BIT(1) + #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +#ifndef __ASSEMBLY__ + +/* + * Used in __tdx_hypercall() to pass down and get back registers' values of + * the TDCALL instruction when requesting services from the VMM. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_hypercall_args { + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + +/* Used to request services from the VMM */ +u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void); + +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 81a1ec14e476..7944fd1ae07d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -3,17 +3,11 @@ #ifndef _ASM_X86_TDX_H #define _ASM_X86_TDX_H -#include #include #include #include #include -#define TDX_HYPERCALL_STANDARD 0 - -#define TDX_HCALL_HAS_OUTPUT BIT(0) -#define TDX_HCALL_ISSUE_STI BIT(1) - /* * SW-defined error codes. * @@ -41,21 +35,6 @@ struct tdx_module_output { u64 r11; }; -/* - * Used in __tdx_hypercall() to pass down and get back registers' values of - * the TDCALL instruction when requesting services from the VMM. - * - * This is a software only structure and not part of the TDX module/VMM ABI. - */ -struct tdx_hypercall_args { - u64 r10; - u64 r11; - u64 r12; - u64 r13; - u64 r14; - u64 r15; -}; - /* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. 
This is a software only structure @@ -80,12 +59,6 @@ void __init tdx_early_init(void); u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, struct tdx_module_output *out); -/* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); - -/* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void); - void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); -- cgit From 03149948832a078f759022ed5b92e722d8d23c26 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:26 +0300 Subject: x86/tdx: Port I/O: Add runtime hypercalls TDX hypervisors cannot emulate instructions directly. This includes port I/O which is normally emulated in the hypervisor. All port I/O instructions inside TDX trigger the #VE exception in the guest and would be normally emulated there. Use a hypercall to emulate port I/O. Extend the tdx_handle_virt_exception() and add support to handle the #VE due to port I/O instructions. String I/O operations are not supported in TDX. Unroll them by declaring CC_ATTR_GUEST_UNROLL_STRING_IO confidential computing attribute. == Userspace Implications == The ioperm() facility allows userspace access to I/O instructions like inb/outb. Among other things, this allows writing userspace device drivers. This series has no special handling for ioperm(). Users will be able to successfully request I/O permissions but will induce a #VE on their first I/O instruction which leads SIGSEGV. If this is undesirable users can enable kernel lockdown feature with 'lockdown=integrity' kernel command line option. It makes ioperm() fail. More robust handling of this situation (denying ioperm() in all TDX guests) will be addressed in follow-on work. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Dan Williams Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-18-kirill.shutemov@linux.intel.com --- arch/x86/coco/core.c | 7 ++++- arch/x86/coco/tdx/tdx.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 3f3008783e05..df08edc94f9b 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -18,7 +18,12 @@ static u64 cc_mask __ro_after_init; static bool intel_cc_platform_has(enum cc_attr attr) { - return false; + switch (attr) { + case CC_ATTR_GUEST_UNROLL_STRING_IO: + return true; + default: + return false; + } } /* diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index ab10bc73a7c5..e47e2ed6b03e 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -19,6 +19,16 @@ #define EPT_READ 0 #define EPT_WRITE 1 +/* Port I/O direction */ +#define PORT_READ 0 +#define PORT_WRITE 1 + +/* See Exit Qualification for I/O Instructions in VMX documentation */ +#define VE_IS_IO_IN(e) ((e) & BIT(3)) +#define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1) +#define VE_GET_PORT_NUM(e) ((e) >> 16) +#define VE_IS_IO_STRING(e) ((e) & BIT(4)) + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. 
@@ -341,6 +351,73 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) return true; } +static bool handle_in(struct pt_regs *regs, int size, int port) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), + .r12 = size, + .r13 = PORT_READ, + .r14 = port, + }; + u64 mask = GENMASK(BITS_PER_BYTE * size, 0); + bool success; + + /* + * Emulate the I/O read via hypercall. More info about ABI can be found + * in TDX Guest-Host-Communication Interface (GHCI) section titled + * "TDG.VP.VMCALL". + */ + success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT); + + /* Update part of the register affected by the emulated instruction */ + regs->ax &= ~mask; + if (success) + regs->ax |= args.r11 & mask; + + return success; +} + +static bool handle_out(struct pt_regs *regs, int size, int port) +{ + u64 mask = GENMASK(BITS_PER_BYTE * size, 0); + + /* + * Emulate the I/O write via hypercall. More info about ABI can be found + * in TDX Guest-Host-Communication Interface (GHCI) section titled + * "TDG.VP.VMCALL". + */ + return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size, + PORT_WRITE, port, regs->ax & mask); +} + +/* + * Emulate I/O using hypercall. + * + * Assumes the IO instruction was using ax, which is enforced + * by the standard io.h macros. + * + * Return True on success or False on failure. + */ +static bool handle_io(struct pt_regs *regs, u32 exit_qual) +{ + int size, port; + bool in; + + if (VE_IS_IO_STRING(exit_qual)) + return false; + + in = VE_IS_IO_IN(exit_qual); + size = VE_GET_IO_SIZE(exit_qual); + port = VE_GET_PORT_NUM(exit_qual); + + + if (in) + return handle_in(regs, size, port); + else + return handle_out(regs, size, port); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -397,6 +474,8 @@ static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) return handle_cpuid(regs); case EXIT_REASON_EPT_VIOLATION: return handle_mmio(regs, ve); + case EXIT_REASON_IO_INSTRUCTION: + return handle_io(regs, ve->exit_qual); default: pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); return false; -- cgit From 32e72854fa5fef6bc72e27c54f31897db9092acb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 6 Apr 2022 02:29:27 +0300 Subject: x86/tdx: Port I/O: Add early boot support TDX guests cannot do port I/O directly. The TDX module triggers a #VE exception to let the guest kernel emulate port I/O by converting them into TDCALLs to call the host. But before IDT handlers are set up, port I/O cannot be emulated using normal kernel #VE handlers. To support the #VE-based emulation during this boot window, add a minimal early #VE handler support in early exception handlers. This is similar to what AMD SEV does. This is mainly to support earlyprintk's serial driver, as well as potentially the VGA driver. The early handler only supports I/O-related #VE exceptions. Unhandled or failed exceptions will be handled via early_fixup_exceptions() (like normal exception failures). At runtime I/O-related #VE exceptions (along with other types) handled by virt_exception_kernel(). Signed-off-by: Andi Kleen Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-19-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 16 ++++++++++++++++ arch/x86/include/asm/tdx.h | 4 ++++ arch/x86/kernel/head64.c | 3 +++ 3 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index e47e2ed6b03e..cc14b7c0c157 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -418,6 +418,22 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual) return handle_out(regs, size, port); } +/* + * Early #VE exception handler. Only handles a subset of port I/O. + * Intended only for earlyprintk. If failed, return false. + */ +__init bool tdx_early_handle_ve(struct pt_regs *regs) +{ + struct ve_info ve; + + tdx_get_ve_info(&ve); + + if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) + return false; + + return handle_io(regs, ve.exit_qual); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 7944fd1ae07d..9ffd0d2e6e0f 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -65,11 +65,15 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); void tdx_safe_halt(void); +bool tdx_early_handle_ve(struct pt_regs *regs); + #else static inline void tdx_early_init(void) { }; static inline void tdx_safe_halt(void) { }; +static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } + #endif /* CONFIG_INTEL_TDX_GUEST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6dff50c3edd6..ecbf50e5b8e0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -417,6 +417,9 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr) trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) return; + if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs)) + return; + early_fixup_exception(regs, trapnr); } -- cgit From cfb8ec7a31f234b4519c104f1cc9accbc8b393a9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:28 +0300 Subject: x86/tdx: Wire up KVM hypercalls KVM hypercalls use the VMCALL or VMMCALL instructions. Although the ABI is similar, those instructions no longer function for TDX guests. Make vendor-specific TDVMCALLs instead of VMCALL. This enables TDX guests to run with KVM acting as the hypervisor. Among other things, KVM hypercall is used to send IPIs. Since the KVM driver can be built as a kernel module, export tdx_kvm_hypercall() to make the symbols visible to kvm.ko. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-20-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 17 +++++++++++++++++ arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++ arch/x86/include/asm/tdx.h | 11 +++++++++++ 3 files changed, 50 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index cc14b7c0c157..f50f530aff5f 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -64,6 +64,23 @@ static u64 hcall_func(u64 exit_reason) return exit_reason; } +#ifdef CONFIG_KVM_GUEST +long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, + unsigned long p3, unsigned long p4) +{ + struct tdx_hypercall_args args = { + .r10 = nr, + .r11 = p1, + .r12 = p2, + .r13 = p3, + .r14 = p4, + }; + + return __tdx_hypercall(&args, 0); +} +EXPORT_SYMBOL_GPL(tdx_kvm_hypercall); +#endif + /* * Used for TDX guests to make calls directly to the TD module. This * should only be used for calls that have no legitimate reason to fail diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 56935ebb1dfe..57bc74e112f2 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,6 +7,8 @@ #include #include +#include + #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); #else @@ -32,6 +34,10 @@ static inline bool kvm_check_and_clear_guest_paused(void) static inline long kvm_hypercall0(unsigned int nr) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, 0, 0, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr) @@ -42,6 +48,10 @@ static inline long kvm_hypercall0(unsigned int nr) static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, 0, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1) @@ -53,6 +63,10 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2) @@ -64,6 +78,10 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, p3, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2), "d"(p3) @@ -76,6 +94,10 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p4) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, p3, p4); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 9ffd0d2e6e0f..020c81a7c729 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -76,5 +76,16 @@ static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } #endif /* CONFIG_INTEL_TDX_GUEST */ +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_INTEL_TDX_GUEST) +long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, + unsigned long p3, unsigned long p4); +#else +static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3, + unsigned long p4) +{ + return -ENODEV; 
+} +#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ -- cgit From ff2e64684f153a61f6ae553e860af4b6ef2f4ca5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Apr 2022 02:29:29 +0300 Subject: x86/boot: Add a trampoline for booting APs via firmware handoff Historically, x86 platforms have booted secondary processors (APs) using INIT followed by the start up IPI (SIPI) messages. In regular VMs, this boot sequence is supported by the VMM emulation. But such a wakeup model is fatal for secure VMs like TDX in which VMM is an untrusted entity. To address this issue, a new wakeup model was added in ACPI v6.4, in which firmware (like TDX virtual BIOS) will help boot the APs. More details about this wakeup model can be found in ACPI specification v6.4, the section titled "Multiprocessor Wakeup Structure". Since the existing trampoline code requires processors to boot in real mode with 16-bit addressing, it will not work for this wakeup model (because it boots the AP in 64-bit mode). To handle it, extend the trampoline code to support 64-bit mode firmware handoff. Also, extend IDT and GDT pointers to support 64-bit mode hand off. There is no TDX-specific detection for this new boot method. The kernel will rely on it as the sole boot method whenever the new ACPI structure is present. The ACPI table parser for the MADT multiprocessor wake up structure and the wakeup method that uses this structure will be added by the following patch in this series. Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-21-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/include/asm/realmode.h | 1 + arch/x86/kernel/smpboot.c | 12 ++++++++-- arch/x86/realmode/rm/header.S | 1 + arch/x86/realmode/rm/trampoline_64.S | 38 ++++++++++++++++++++++++++++++++ arch/x86/realmode/rm/trampoline_common.S | 12 +++++++++- 6 files changed, 63 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 48067af94678..35006e151774 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -328,6 +328,8 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + /* wakeup secondary CPU using 64-bit wakeup point */ + int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); void (*inquire_remote_apic)(int apicid); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 331474b150f1..fd6f6e5b755a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -25,6 +25,7 @@ struct real_mode_header { u32 sev_es_trampoline_start; #endif #ifdef CONFIG_X86_64 + u32 trampoline_start64; u32 trampoline_pgd; #endif /* ACPI S3 wakeup */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2ef14772dc04..870cc5d203b1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1082,6 +1082,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, unsigned long boot_error = 0; unsigned long timeout; +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif 
idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; @@ -1123,11 +1128,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* * Wake up a CPU in difference cases: - * - Use the method in the APIC driver if it's defined + * - Use a method from the APIC driver if one defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, * - Use an INIT boot APIC message for APs or NMI for BSP. */ - if (apic->wakeup_secondary_cpu) + if (apic->wakeup_secondary_cpu_64) + boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + else if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 8c1db5bf5d78..2eb62be6d256 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -24,6 +24,7 @@ SYM_DATA_START(real_mode_header) .long pa_sev_es_trampoline_start #endif #ifdef CONFIG_X86_64 + .long pa_trampoline_start64 .long pa_trampoline_pgd; #endif /* ACPI S3 wakeup */ diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index cc8391f86cdb..ae112a91592f 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -161,6 +161,19 @@ SYM_CODE_START(startup_32) ljmpl $__KERNEL_CS, $pa_startup_64 SYM_CODE_END(startup_32) +SYM_CODE_START(pa_trampoline_compat) + /* + * In compatibility mode. Prep ESP and DX for startup_32, then disable + * paging and complete the switch to legacy 32-bit mode. + */ + movl $rm_stack_end, %esp + movw $__KERNEL_DS, %dx + + movl $X86_CR0_PE, %eax + movl %eax, %cr0 + ljmpl $__KERNEL32_CS, $pa_startup_32 +SYM_CODE_END(pa_trampoline_compat) + .section ".text64","ax" .code64 .balign 4 @@ -169,6 +182,20 @@ SYM_CODE_START(startup_64) jmpq *tr_start(%rip) SYM_CODE_END(startup_64) +SYM_CODE_START(trampoline_start64) + /* + * APs start here on a direct transfer from 64-bit BIOS with identity + * mapped page tables. Load the kernel's GDT in order to gear down to + * 32-bit mode (to handle 4-level vs. 5-level paging), and to (re)load + * segment registers. Load the zero IDT so any fault triggers a + * shutdown instead of jumping back into BIOS. + */ + lidt tr_idt(%rip) + lgdt tr_gdt64(%rip) + + ljmpl *tr_compat(%rip) +SYM_CODE_END(trampoline_start64) + .section ".rodata","a" # Duplicate the global descriptor table # so the kernel can live anywhere @@ -182,6 +209,17 @@ SYM_DATA_START(tr_gdt) .quad 0x00cf93000000ffff # __KERNEL_DS SYM_DATA_END_LABEL(tr_gdt, SYM_L_LOCAL, tr_gdt_end) +SYM_DATA_START(tr_gdt64) + .short tr_gdt_end - tr_gdt - 1 # gdt limit + .long pa_tr_gdt + .long 0 +SYM_DATA_END(tr_gdt64) + +SYM_DATA_START(tr_compat) + .long pa_trampoline_compat + .short __KERNEL32_CS +SYM_DATA_END(tr_compat) + .bss .balign PAGE_SIZE SYM_DATA(trampoline_pgd, .space PAGE_SIZE) diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index 5033e640f957..4331c32c47f8 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,4 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ .section ".rodata","a" .balign 16 -SYM_DATA_LOCAL(tr_idt, .fill 1, 6, 0) + +/* + * When a bootloader hands off to the kernel in 32-bit mode an + * IDT with a 2-byte limit and 4-byte base is needed. 
When a boot + * loader hands off to a kernel 64-bit mode the base address + * extends to 8-bytes. Reserve enough space for either scenario. + */ +SYM_DATA_START_LOCAL(tr_idt) + .short 0 + .quad 0 +SYM_DATA_END(tr_idt) -- cgit From f39642d0dbacded8b4a816a9197a73efb74e5702 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:30 +0300 Subject: x86/acpi/x86/boot: Add multiprocessor wake-up support Secondary CPU startup is currently performed with something called the "INIT/SIPI protocol". This protocol requires assistance from VMMs to boot guests. As should be a familiar story by now, that support can not be provded to TDX guests because TDX VMMs are not trusted by guests. To remedy this situation a new[1] "Multiprocessor Wakeup Structure" has been added to to an existing ACPI table (MADT). This structure provides the physical address of a "mailbox". A write to the mailbox then steers the secondary CPU to the boot code. Add ACPI MADT wake structure parsing support and wake support. Use this support to wake CPUs whenever it is present instead of INIT/SIPI. While this structure can theoretically be used on 32-bit kernels, there are no 32-bit TDX guest kernels. It has not been tested and can not practically *be* tested on 32-bit. Make it 64-bit only. 1. Details about the new structure can be found in ACPI v6.4, in the "Multiprocessor Wakeup Structure" section. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Rafael J. Wysocki Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-22-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/apic.h | 5 +++ arch/x86/kernel/acpi/boot.c | 93 ++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/apic/apic.c | 10 +++++ 3 files changed, 107 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 35006e151774..bd8ae0a7010a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -490,6 +490,11 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } +#ifdef CONFIG_X86_64 +typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip); +extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler); +#endif + extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0d01e7f5078c..6d2c50819501 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -65,6 +65,13 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; static bool acpi_support_online_capable; #endif +#ifdef CONFIG_X86_64 +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr; +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; +#endif + #ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug @@ -336,7 +343,60 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e return 0; } -#endif /*CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_X86_64 +static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +{ + /* + * Remap mailbox memory only for the first call to 
acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgement. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 @@ -1083,6 +1143,29 @@ static int __init acpi_parse_madt_lapic_entries(void) } return 0; } + +#ifdef CONFIG_X86_64 +static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + if (!IS_ENABLED(CONFIG_SMP)) + return -ENODEV; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + if (BAD_MADT_ENTRY(mp_wake, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->base_address; + + acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + + return 0; +} +#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1278,6 +1361,14 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } + +#ifdef CONFIG_X86_64 + /* + * Parse MADT MP Wake entry. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_parse_mp_wake, 1); +#endif } if (error == -EINVAL) { /* diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..3c8f2c797a98 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2551,6 +2551,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); +#ifdef CONFIG_X86_64 +void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) + (*drv)->wakeup_secondary_cpu_64 = handler; +} +#endif + /* * Override the generic EOI implementation with an optimized version. 
* Only called during early boot when only one CPU is active and with -- cgit From 9cf30606405f37b68ee1c0f6846253313c077088 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:31 +0300 Subject: x86/boot: Set CR0.NE early and keep it set during the boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TDX guest requires CR0.NE to be set. Clearing the bit triggers #GP(0). If CR0.NE is 0, the MS-DOS compatibility mode for handling floating-point exceptions is selected. In this mode, the software exception handler for floating-point exceptions is invoked externally using the processor’s FERR#, INTR, and IGNNE# pins. Using FERR# and IGNNE# to handle floating-point exception is deprecated. CR0.NE=0 also limits newer processors to operate with one logical processor active. Kernel uses CR0_STATE constant to initialize CR0. It has NE bit set. But during early boot kernel has more ad-hoc approach to setting bit in the register. During some of this ad-hoc manipulation, CR0.NE is cleared. This causes a #GP in TDX guests and makes it die in early boot. Make CR0 initialization consistent, deriving the initial value of CR0 from CR0_STATE. Since CR0_STATE always has CR0.NE=1, this ensures that CR0.NE is never 0 and avoids the #GP. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-23-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/head_64.S | 7 ++++--- arch/x86/realmode/rm/trampoline_64.S | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index dea95301196b..7b5d36214352 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -289,7 +289,7 @@ SYM_FUNC_START(startup_32) pushl %eax /* Enter paged protected Mode, activating Long Mode */ - movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */ + movl $CR0_STATE, %eax movl %eax, %cr0 /* Jump from 32bit compatibility mode into 64bit mode. */ @@ -661,8 +661,9 @@ SYM_CODE_START(trampoline_32bit_src) pushl $__KERNEL_CS pushl %eax - /* Enable paging again */ - movl $(X86_CR0_PG | X86_CR0_PE), %eax + /* Enable paging again. */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 lret diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index ae112a91592f..d380f2d1fd23 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -70,7 +70,7 @@ SYM_CODE_START(trampoline_start) movw $__KERNEL_DS, %dx # Data segment descriptor # Enable protected mode - movl $X86_CR0_PE, %eax # protected mode (PE) bit + movl $(CR0_STATE & ~X86_CR0_PG), %eax movl %eax, %cr0 # into protected mode # flush prefetch and jump to startup_32 @@ -148,8 +148,8 @@ SYM_CODE_START(startup_32) movl $MSR_EFER, %ecx wrmsr - # Enable paging and in turn activate Long Mode - movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax + # Enable paging and in turn activate Long Mode. 
+ movl $CR0_STATE, %eax movl %eax, %cr0 /* @@ -169,7 +169,7 @@ SYM_CODE_START(pa_trampoline_compat) movl $rm_stack_end, %esp movw $__KERNEL_DS, %dx - movl $X86_CR0_PE, %eax + movl $(CR0_STATE & ~X86_CR0_PG), %eax movl %eax, %cr0 ljmpl $__KERNEL32_CS, $pa_startup_32 SYM_CODE_END(pa_trampoline_compat) -- cgit From 77a512e35db7609a8c909e2006b2ea82f2b1616f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Apr 2022 02:29:32 +0300 Subject: x86/boot: Avoid #VE during boot for TDX platforms There are a few MSRs and control register bits that the kernel normally needs to modify during boot. But, TDX disallows modification of these registers to help provide consistent security guarantees. Fortunately, TDX ensures that these are all in the correct state before the kernel loads, which means the kernel does not need to modify them. The conditions to avoid are: * Any writes to the EFER MSR * Clearing CR4.MCE This theoretically makes the guest boot more fragile. If, for instance, EFER was set up incorrectly and a WRMSR was performed, it will trigger early exception panic or a triple fault, if it's before early exceptions are set up. However, this is likely to trip up the guest BIOS long before control reaches the kernel. In any case, these kinds of problems are unlikely to occur in production environments, and developers have good debug tools to fix them quickly. Change the common boot code to work on TDX and non-TDX systems. This should have no functional effect on non-TDX systems. Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-24-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/head_64.S | 20 ++++++++++++++++++-- arch/x86/boot/compressed/pgtable.h | 2 +- arch/x86/kernel/head_64.S | 28 ++++++++++++++++++++++++++-- arch/x86/realmode/rm/trampoline_64.S | 13 ++++++++++++- 5 files changed, 58 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aea4cc404c31..a18185525708 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -884,6 +884,7 @@ config INTEL_TDX_GUEST depends on X86_X2APIC select ARCH_HAS_CC_PLATFORM select DYNAMIC_PHYSICAL_MASK + select X86_MCE help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 7b5d36214352..f63ec254e457 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -642,12 +642,28 @@ SYM_CODE_START(trampoline_32bit_src) movl $MSR_EFER, %ecx rdmsr btsl $_EFER_LME, %eax + /* Avoid writing EFER if no change was made (for TDX guest) */ + jc 1f wrmsr - popl %edx +1: popl %edx popl %ecx +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. 
+ */ + movl %cr4, %eax + andl $X86_CR4_MCE, %eax +#else + movl $0, %eax +#endif + /* Enable PAE and LA57 (if required) paging modes */ - movl $X86_CR4_PAE, %eax + orl $X86_CR4_PAE, %eax testl %edx, %edx jz 1f orl $X86_CR4_LA57, %eax diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index 6ff7e81b5628..cc9b2529a086 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -6,7 +6,7 @@ #define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0x70 +#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 #define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b8e3019547a5..39a6ff7c0060 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -142,8 +142,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) addq $(init_top_pgt - __START_KERNEL_map), %rax 1: +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movq %cr4, %rcx + andl $X86_CR4_MCE, %ecx +#else + movl $0, %ecx +#endif + /* Enable PAE mode, PGE and LA57 */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f @@ -249,13 +263,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr + /* + * Preserve current value of EFER for comparison and to skip + * EFER writes if no change was made (for TDX guest) + */ + movl %eax, %edx btsl $_EFER_SCE, %eax /* Enable System Call */ btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ + /* Avoid writing EFER if no change was made (for TDX guest) */ +1: cmpl %edx, %eax + je 1f + xor %edx, %edx + wrmsr /* Make changes effective */ +1: /* Setup cr0 */ movl $CR0_STATE, %eax /* Make changes effective */ diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index d380f2d1fd23..e38d61d6562e 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -143,11 +143,22 @@ SYM_CODE_START(startup_32) movl %eax, %cr3 # Set up EFER + movl $MSR_EFER, %ecx + rdmsr + /* + * Skip writing to EFER if the register already has desired + * value (to avoid #VE for the TDX guest). + */ + cmp pa_tr_efer, %eax + jne .Lwrite_efer + cmp pa_tr_efer + 4, %edx + je .Ldone_efer +.Lwrite_efer: movl pa_tr_efer, %eax movl pa_tr_efer + 4, %edx - movl $MSR_EFER, %ecx wrmsr +.Ldone_efer: # Enable paging and in turn activate Long Mode. movl $CR0_STATE, %eax movl %eax, %cr0 -- cgit From bae1a962ac2c5e6be08319ff3f7d6df542584fce Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:33 +0300 Subject: x86/topology: Disable CPU online/offline control for TDX guests Unlike regular VMs, TDX guests use the firmware hand-off wakeup method to wake up the APs during the boot process. This wakeup model uses a mailbox to communicate with firmware to bring up the APs. 
As per the design, this mailbox can only be used once for the given AP, which means after the APs are booted, the same mailbox cannot be used to offline/online the given AP. More details about this requirement can be found in Intel TDX Virtual Firmware Design Guide, sec titled "AP initialization in OS" and in sec titled "Hotplug Device". Since the architecture does not support any method of offlining the CPUs, disable CPU hotplug support in the kernel. Since this hotplug disable feature can be re-used by other VM guests, add a new CC attribute CC_ATTR_HOTPLUG_DISABLED and use it to disable the hotplug support. Attempt to offline CPU will fail with -EOPNOTSUPP. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-25-kirill.shutemov@linux.intel.com --- arch/x86/coco/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index df08edc94f9b..70956f9d7c7e 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -20,6 +20,7 @@ static bool intel_cc_platform_has(enum cc_attr attr) { switch (attr) { case CC_ATTR_GUEST_UNROLL_STRING_IO: + case CC_ATTR_HOTPLUG_DISABLED: return true; default: return false; -- cgit From 9aa6ea69852c46e551f4180dce4208bd53df418c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:34 +0300 Subject: x86/tdx: Make pages shared in ioremap() In TDX guests, guest memory is protected from host access. If a guest performs I/O, it needs to explicitly share the I/O memory with the host. Make all ioremap()ed pages that are not backed by normal memory (IORES_DESC_NONE or IORES_DESC_RESERVED) mapped as shared. The permissions in PAGE_KERNEL_IO already work for "decrypted" memory on AMD SEV/SME systems. That means that they have no need to make a pgprot_decrypted() call. TDX guests, on the other hand, _need_ change to PAGE_KERNEL_IO for "decrypted" mappings. Add a pgprot_decrypted() for TDX. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-26-kirill.shutemov@linux.intel.com --- arch/x86/mm/ioremap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 17a492c27306..1ad0228f8ceb 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -242,10 +242,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size, * If the page being mapped is in memory and SEV is active then * make sure the memory encryption attribute is enabled in the * resulting mapping. + * In TDX guests, memory is marked private by default. If encryption + * is not requested (using encrypted), explicitly set decrypt + * attribute in all IOREMAPPED memory. */ prot = PAGE_KERNEL_IO; if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted) prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); switch (pcm) { case _PAGE_CACHE_MODE_UC: -- cgit From 7dbde7631629896b478bc5b1f4c3e52e6d518d12 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:35 +0300 Subject: x86/mm/cpa: Add support for TDX shared memory Intel TDX protects guest memory from VMM access. 
Any memory that is required for communication with the VMM must be explicitly shared. It is a two-step process: the guest sets the shared bit in the page table entry and notifies VMM about the change. The notification happens using MapGPA hypercall. Conversion back to private memory requires clearing the shared bit, notifying VMM with MapGPA hypercall following with accepting the memory with AcceptPage hypercall. Provide a TDX version of x86_platform.guest.* callbacks. It makes __set_memory_enc_pgtable() work right in TDX guest. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-27-kirill.shutemov@linux.intel.com --- arch/x86/coco/core.c | 1 + arch/x86/coco/tdx/tdx.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 2 +- 3 files changed, 135 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 70956f9d7c7e..9f74125c582d 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -21,6 +21,7 @@ static bool intel_cc_platform_has(enum cc_attr attr) switch (attr) { case CC_ATTR_GUEST_UNROLL_STRING_IO: case CC_ATTR_HOTPLUG_DISABLED: + case CC_ATTR_GUEST_MEM_ENCRYPT: return true; default: return false; diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index f50f530aff5f..03deb4d6920d 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -10,10 +10,15 @@ #include #include #include +#include /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 #define TDX_GET_VEINFO 3 +#define TDX_ACCEPT_PAGE 6 + +/* TDX hypercall Leaf IDs */ +#define TDVMCALL_MAP_GPA 0x10001 /* MMIO direction */ #define EPT_READ 0 @@ -531,6 +536,130 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) return ret; } +static bool tdx_tlb_flush_required(bool private) +{ + /* + * TDX guest is responsible for flushing TLB on private->shared + * transition. VMM is responsible for flushing on shared->private. + * + * The VMM _can't_ flush private addresses as it can't generate PAs + * with the guest's HKID. Shared memory isn't subject to integrity + * checking, i.e. the VMM doesn't need to flush for its own protection. + * + * There's no need to flush when converting from shared to private, + * as flushing is the VMM's responsibility in this case, e.g. it must + * flush to avoid integrity failures in the face of a buggy or + * malicious guest. + */ + return !private; +} + +static bool tdx_cache_flush_required(void) +{ + /* + * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence. + * TDX doesn't have such capability. + * + * Flush cache unconditionally. + */ + return true; +} + +static bool try_accept_one(phys_addr_t *start, unsigned long len, + enum pg_level pg_level) +{ + unsigned long accept_size = page_level_size(pg_level); + u64 tdcall_rcx; + u8 page_size; + + if (!IS_ALIGNED(*start, accept_size)) + return false; + + if (len < accept_size) + return false; + + /* + * Pass the page physical address to the TDX module to accept the + * pending, private page. + * + * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. 
+ */ + switch (pg_level) { + case PG_LEVEL_4K: + page_size = 0; + break; + case PG_LEVEL_2M: + page_size = 1; + break; + case PG_LEVEL_1G: + page_size = 2; + break; + default: + return false; + } + + tdcall_rcx = *start | page_size; + if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) + return false; + + *start += accept_size; + return true; +} + +/* + * Inform the VMM of the guest's intent for this physical page: shared with + * the VMM or private to the guest. The VMM is expected to change its mapping + * of the page in response. + */ +static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) +{ + phys_addr_t start = __pa(vaddr); + phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE); + + if (!enc) { + /* Set the shared (decrypted) bits: */ + start |= cc_mkdec(0); + end |= cc_mkdec(0); + } + + /* + * Notify the VMM about page mapping conversion. More info about ABI + * can be found in TDX Guest-Host-Communication Interface (GHCI), + * section "TDG.VP.VMCALL" + */ + if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0)) + return false; + + /* private->shared conversion requires only MapGPA call */ + if (!enc) + return true; + + /* + * For shared->private conversion, accept the page using + * TDX_ACCEPT_PAGE TDX module call. + */ + while (start < end) { + unsigned long len = end - start; + + /* + * Try larger accepts first. It gives chance to VMM to keep + * 1G/2M SEPT entries where possible and speeds up process by + * cutting number of hypercalls (if successful). + */ + + if (try_accept_one(&start, len, PG_LEVEL_1G)) + continue; + + if (try_accept_one(&start, len, PG_LEVEL_2M)) + continue; + + if (!try_accept_one(&start, len, PG_LEVEL_4K)) + return false; + } + + return true; +} + void __init tdx_early_init(void) { u64 cc_mask; @@ -555,5 +684,9 @@ void __init tdx_early_init(void) */ physical_mask &= cc_mask - 1; + x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; + x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed; + pr_info("Guest detected\n"); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f9fb6530338f..a4e2efde5d1f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1378,7 +1378,7 @@ static void ve_raise_fault(struct pt_regs *regs, long error_code) * * In the settings that Linux will run in, virtualization exceptions are * never generated on accesses to normal, TD-private memory that has been - * accepted. + * accepted (by BIOS or with tdx_enc_status_changed()). * * Syscall entry code has a critical window where the kernel stack is not * yet set up. Any exception in this window leads to hard to debug issues -- cgit From 968b493173ac5205fe75f6330ee767f96bf88e57 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:36 +0300 Subject: x86/mm: Make DMA memory shared for TD guest Intel TDX doesn't allow VMM to directly access guest private memory. Any memory that is required for communication with the VMM must be shared explicitly. The same rule applies for any DMA to and from the TDX guest. All DMA pages have to be marked as shared pages. A generic way to achieve this without any changes to device drivers is to use the SWIOTLB framework. The previous patch ("Add support for TDX shared memory") gave TDX guests the _ability_ to make some pages shared, but did not make any pages shared. This actually marks SWIOTLB buffers *as* shared. 
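Roughly, the boot-time effect can be sketched as below (illustrative only: example_mem_encrypt_init() is a placeholder name, while cc_platform_has() and swiotlb_update_mem_attributes() are the existing helpers assumed here; the real work is done by mem_encrypt_init()).

#include <linux/init.h>
#include <linux/cc_platform.h>
#include <linux/swiotlb.h>

/* Sketch: mark the SWIOTLB bounce pool shared for memory-encrypted guests. */
void __init example_mem_encrypt_init(void)
{
	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
		return;

	/* Flip the bounce-buffer pages to shared ("decrypted") mappings. */
	swiotlb_update_mem_attributes();
}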
Start returning true for cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) in TDX guests. This has several implications: - Allows the existing mem_encrypt_init() to be used for TDX which sets SWIOTLB buffers shared (aka. "decrypted"). - Ensures that all DMA is routed via the SWIOTLB mechanism (see pci_swiotlb_detect()) Stop selecting DYNAMIC_PHYSICAL_MASK directly. It will get set indirectly by selecting X86_MEM_ENCRYPT. mem_encrypt_init() is currently under an AMD-specific #ifdef. Move it to a generic area of the header. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-28-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 2 +- arch/x86/coco/core.c | 1 + arch/x86/include/asm/mem_encrypt.h | 6 +++--- arch/x86/mm/mem_encrypt.c | 9 ++++++++- 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a18185525708..7021ec725dd3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -883,7 +883,7 @@ config INTEL_TDX_GUEST depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC select ARCH_HAS_CC_PLATFORM - select DYNAMIC_PHYSICAL_MASK + select X86_MEM_ENCRYPT select X86_MCE help Support running as a guest under Intel TDX. Without this support, diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 9f74125c582d..4320fadae716 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -22,6 +22,7 @@ static bool intel_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_UNROLL_STRING_IO: case CC_ATTR_HOTPLUG_DISABLED: case CC_ATTR_GUEST_MEM_ENCRYPT: + case CC_ATTR_MEM_ENCRYPT: return true; default: return false; diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index e2c6f433ed10..88ceaf3648b3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -49,9 +49,6 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, void __init mem_encrypt_free_decrypted_mem(void); -/* Architecture __weak replacement functions */ -void __init mem_encrypt_init(void); - void __init sev_es_init_vc_handling(void); #define __bss_decrypted __section(".bss..decrypted") @@ -89,6 +86,9 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + /* * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when * writing to or comparing values from the cr3 register. 
Having the diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 50d209939c66..10ee40b5204b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -42,7 +42,14 @@ bool force_dma_unencrypted(struct device *dev) static void print_mem_encrypt_feature_info(void) { - pr_info("AMD Memory Encryption Features active:"); + pr_info("Memory Encryption Features active:"); + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_cont(" Intel TDX\n"); + return; + } + + pr_cont("AMD "); /* Secure Memory Encryption */ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { -- cgit From f4c9361f97c40d365c34f9cb8b8bc3eae0ee7778 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 6 Apr 2022 02:29:37 +0300 Subject: x86/tdx/ioapic: Add shared bit for IOAPIC base address The kernel interacts with each bare-metal IOAPIC with a special MMIO page. When running under KVM, the guest's IOAPICs are emulated by KVM. When running as a TDX guest, the guest needs to mark each IOAPIC mapping as "shared" with the host. This ensures that TDX private protections are not applied to the page, which allows the TDX host emulation to work. ioremap()-created mappings such as virtio will be marked as shared by default. However, the IOAPIC code does not use ioremap() and instead uses the fixmap mechanism. Introduce a special fixmap helper just for the IOAPIC code. Ensure that it marks IOAPIC pages as "shared". This replaces set_fixmap_nocache() with __set_fixmap() since __set_fixmap() allows custom 'prot' values. AMD SEV gets IOAPIC pages shared because FIXMAP_PAGE_NOCACHE has _ENC bit clear. TDX has to set bit to share the page with the host. Signed-off-by: Isaku Yamahata Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-29-kirill.shutemov@linux.intel.com --- arch/x86/kernel/apic/io_apic.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c1bb384935b0..a868b76cd3d4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -65,6 +65,7 @@ #include #include #include +#include #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) @@ -2677,6 +2678,19 @@ static struct resource * __init ioapic_setup_resources(void) return res; } +static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +{ + pgprot_t flags = FIXMAP_PAGE_NOCACHE; + + /* + * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot + * bits, just like normal ioremap(): + */ + flags = pgprot_decrypted(flags); + + __set_fixmap(idx, phys, flags); +} + void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; @@ -2709,7 +2723,7 @@ fake_ioapic_page: __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } - set_fixmap_nocache(idx, ioapic_phys); + io_apic_set_fixmap(idx, ioapic_phys); apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); @@ -2838,7 +2852,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address); if 
(bad_ioapic_register(idx)) { clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENODEV; -- cgit From e2efb6359e620521d1e13f69b2257de8ceaa9475 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:38 +0300 Subject: ACPICA: Avoid cache flush inside virtual machines While running inside virtual machine, the kernel can bypass cache flushing. Changing sleep state in a virtual machine doesn't affect the host system sleep state and cannot lead to data loss. Before entering sleep states, the ACPI code flushes caches to prevent data loss using the WBINVD instruction. This mechanism is required on bare metal. But, any use WBINVD inside of a guest is worthless. Changing sleep state in a virtual machine doesn't affect the host system sleep state and cannot lead to data loss, so most hypervisors simply ignore it. Despite this, the ACPI code calls WBINVD unconditionally anyway. It's useless, but also normally harmless. In TDX guests, though, WBINVD stops being harmless; it triggers a virtualization exception (#VE). If the ACPI cache-flushing WBINVD were left in place, TDX guests would need handling to recover from the exception. Avoid using WBINVD whenever running under a hypervisor. This both removes the useless WBINVDs and saves TDX from implementing WBINVD handling. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-30-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/acenv.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 9aff97f0de7f..d937c55e717e 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -13,7 +13,19 @@ /* Asm macros */ -#define ACPI_FLUSH_CPU_CACHE() wbinvd() +/* + * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states. + * It is required to prevent data loss. + * + * While running inside virtual machine, the kernel can bypass cache flushing. + * Changing sleep state in a virtual machine doesn't affect the host system + * sleep state and cannot lead to data loss. + */ +#define ACPI_FLUSH_CPU_CACHE() \ +do { \ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \ + wbinvd(); \ +} while (0) int __acpi_acquire_global_lock(unsigned int *lock); int __acpi_release_global_lock(unsigned int *lock); -- cgit From f16a005cde3b1f31cad5ecba0cc810652c3874ec Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 16 Mar 2022 12:20:09 -0700 Subject: crypto: x86 - eliminate anonymous module_init & module_exit Eliminate anonymous module_init() and module_exit(), which can lead to confusion or ambiguity when reading System.map, crashes/oops/bugs, or an initcall_debug log. Give each of these init and exit functions unique driver-specific names to eliminate the anonymous names. 
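For contrast with the anonymous examples below, the renamed pattern looks roughly like this (placeholder driver name, purely illustrative and not taken from any of the patched files):

#include <linux/module.h>

static int __init exampledrv_mod_init(void)
{
	return 0;
}

static void __exit exampledrv_mod_exit(void)
{
}

module_init(exampledrv_mod_init);
module_exit(exampledrv_mod_exit);

MODULE_LICENSE("GPL");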
Example 1: (System.map) ffffffff832fc78c t init ffffffff832fc79e t init ffffffff832fc8f8 t init Example 2: (initcall_debug log) calling init+0x0/0x12 @ 1 initcall init+0x0/0x12 returned 0 after 15 usecs calling init+0x0/0x60 @ 1 initcall init+0x0/0x60 returned 0 after 2 usecs calling init+0x0/0x9a @ 1 initcall init+0x0/0x9a returned 0 after 74 usecs Fixes: 64b94ceae8c1 ("crypto: blowfish - add x86_64 assembly implementation") Fixes: 676a38046f4f ("crypto: camellia-x86_64 - module init/exit functions should be static") Fixes: 0b95ec56ae19 ("crypto: camellia - add assembler implementation for x86_64") Fixes: 56d76c96a9f3 ("crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher") Fixes: b9f535ffe38f ("[CRYPTO] twofish: i586 assembly version") Fixes: ff0a70fe0536 ("crypto: twofish-x86_64-3way - module init/exit functions should be static") Fixes: 8280daad436e ("crypto: twofish - add 3-way parallel x86_64 assembler implemention") Signed-off-by: Randy Dunlap Cc: Jussi Kivilinna Cc: Joachim Fritschi Cc: Herbert Xu Cc: "David S. Miller" Cc: linux-crypto@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish_glue.c | 8 ++++---- arch/x86/crypto/camellia_glue.c | 8 ++++---- arch/x86/crypto/serpent_avx2_glue.c | 8 ++++---- arch/x86/crypto/twofish_glue.c | 8 ++++---- arch/x86/crypto/twofish_glue_3way.c | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index fda6066437aa..ba06322c1e39 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -303,7 +303,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -static int __init init(void) +static int __init blowfish_init(void) { int err; @@ -327,15 +327,15 @@ static int __init init(void) return err; } -static void __exit fini(void) +static void __exit blowfish_fini(void) { crypto_unregister_alg(&bf_cipher_alg); crypto_unregister_skciphers(bf_skcipher_algs, ARRAY_SIZE(bf_skcipher_algs)); } -module_init(init); -module_exit(fini); +module_init(blowfish_init); +module_exit(blowfish_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized"); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 66c435ba9d3d..d45e9c0c42ac 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1377,7 +1377,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -static int __init init(void) +static int __init camellia_init(void) { int err; @@ -1401,15 +1401,15 @@ static int __init init(void) return err; } -static void __exit fini(void) +static void __exit camellia_fini(void) { crypto_unregister_alg(&camellia_cipher_alg); crypto_unregister_skciphers(camellia_skcipher_algs, ARRAY_SIZE(camellia_skcipher_algs)); } -module_init(init); -module_exit(fini); +module_init(camellia_init); +module_exit(camellia_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized"); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index ccf0b5fa4933..347e97f4b713 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -96,7 +96,7 @@ static struct skcipher_alg serpent_algs[] = { static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; -static int __init 
init(void) +static int __init serpent_avx2_init(void) { const char *feature_name; @@ -115,14 +115,14 @@ static int __init init(void) serpent_simd_algs); } -static void __exit fini(void) +static void __exit serpent_avx2_fini(void) { simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } -module_init(init); -module_exit(fini); +module_init(serpent_avx2_init); +module_exit(serpent_avx2_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 77e06c2da83d..f9c4adc27404 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -81,18 +81,18 @@ static struct crypto_alg alg = { } }; -static int __init init(void) +static int __init twofish_glue_init(void) { return crypto_register_alg(&alg); } -static void __exit fini(void) +static void __exit twofish_glue_fini(void) { crypto_unregister_alg(&alg); } -module_init(init); -module_exit(fini); +module_init(twofish_glue_init); +module_exit(twofish_glue_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 3507cf2064f1..90454cf18e0d 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -140,7 +140,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -static int __init init(void) +static int __init twofish_3way_init(void) { if (!force && is_blacklisted_cpu()) { printk(KERN_INFO @@ -154,13 +154,13 @@ static int __init init(void) ARRAY_SIZE(tf_skciphers)); } -static void __exit fini(void) +static void __exit twofish_3way_fini(void) { crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers)); } -module_init(init); -module_exit(fini); +module_init(twofish_3way_init); +module_exit(twofish_3way_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); -- cgit From e720ea52e85c9d00cf8c5769795d0a3e585524f6 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Thu, 17 Mar 2022 14:19:13 -0700 Subject: x86/sev-es: Replace open-coded hlt-loop with sev_es_terminate() Replace the halt loop in handle_vc_boot_ghcb() with an sev_es_terminate(). The HLT gives the system no indication the guest is unhappy. The termination request will signal there was an error during VC handling during boot. [ bp: Update it to pass the reason set too. ] Signed-off-by: Peter Gonda Signed-off-by: Borislav Petkov Reviewed-by: Joerg Roedel Link: https://lore.kernel.org/r/20220317211913.1397427-1-pgonda@google.com --- arch/x86/kernel/sev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index ace43e116b40..f01f4550e2c6 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1982,8 +1982,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) fail: show_regs(regs); - while (true) - halt(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } /* -- cgit From 31bf0f4333254469ebf34d7f17d64a57bce516d4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 7 Apr 2022 17:42:02 -0500 Subject: x86: Log resource clipping for E820 regions When remove_e820_regions() clips a resource because an E820 region overlaps it, log a note in dmesg to add in debugging. 
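The clipping helper being instrumented does not appear in the diff; a simplified stand-in (hand-written sketch, not the exact arch/x86/kernel/resource.c code) behaves like this:

#include <linux/ioport.h>

/* Sketch: trim 'res' so it no longer overlaps [start, end]. */
static void clip_sketch(struct resource *res, resource_size_t start,
			resource_size_t end)
{
	if (res->end < start || res->start > end)
		return;				/* no overlap */

	if (res->start < start && res->end > end) {
		/* The E820 entry splits the resource: keep the bigger half. */
		if (start - res->start > res->end - end)
			res->end = start - 1;
		else
			res->start = end + 1;
	} else if (res->start < start) {
		res->end = start - 1;		/* clip the tail */
	} else {
		res->start = end + 1;		/* clip the head */
	}
}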
Signed-off-by: Bjorn Helgaas --- arch/x86/kernel/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 8ffe68437744..30d524adb012 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include static void resource_clip(struct resource *res, resource_size_t start, @@ -28,6 +29,7 @@ static void remove_e820_regions(struct resource *avail) int i; struct e820_entry *entry; u64 e820_start, e820_end; + struct resource orig = *avail; for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; @@ -35,6 +37,11 @@ static void remove_e820_regions(struct resource *avail) e820_end = entry->addr + entry->size - 1; resource_clip(avail, e820_start, e820_end); + if (orig.start != avail->start || orig.end != avail->end) { + pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", + &orig, avail, e820_start, e820_end); + orig = *avail; + } } } -- cgit From 4c5e242d3e937bb9f9c226d06888d9189826879d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 3 Mar 2022 16:00:39 -0600 Subject: x86/PCI: Clip only host bridge windows for E820 regions ACPI firmware advertises PCI host bridge resources via PNP0A03 _CRS methods. Some BIOSes include non-window address space in _CRS, and if we allocate that non-window space for PCI devices, they don't work. 4dc2287c1805 ("x86: avoid E820 regions when allocating address space") works around this issue by clipping out any regions mentioned in the E820 table in the allocate_resource() path, but the implementation has a couple issues: - The clipping is done for *all* allocations, not just those for PCI address space, and - The clipping is done at each allocation instead of being done once when setting up the host bridge windows. Rework the implementation so we only clip PCI host bridge windows, and we do it once when setting them up. Example output changes: BIOS-e820: [mem 0x00000000b0000000-0x00000000c00fffff] reserved + acpi PNP0A08:00: clipped [mem 0xc0000000-0xfebfffff window] to [mem 0xc0100000-0xfebfffff window] for e820 entry [mem 0xb0000000-0xc00fffff] - pci_bus 0000:00: root bus resource [mem 0xc0000000-0xfebfffff window] + pci_bus 0000:00: root bus resource [mem 0xc0100000-0xfebfffff window] Link: https://lore.kernel.org/r/20220304035110.988712-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Hans de Goede Reviewed-by: Mika Westerberg Acked-by: Rafael J. Wysocki --- arch/x86/include/asm/e820/api.h | 5 +++++ arch/x86/kernel/resource.c | 14 +++++++------- arch/x86/pci/acpi.c | 5 +++++ 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index e8f58ddd06d9..5a39ed59b6db 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -4,6 +4,9 @@ #include +struct device; +struct resource; + extern struct e820_table *e820_table; extern struct e820_table *e820_table_kexec; extern struct e820_table *e820_table_firmware; @@ -43,6 +46,8 @@ extern void e820__register_nosave_regions(unsigned long limit_pfn); extern int e820__get_entry_type(u64 start, u64 end); +extern void remove_e820_regions(struct device *dev, struct resource *avail); + /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. 
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 30d524adb012..db2b350a37b7 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include -#include #include static void resource_clip(struct resource *res, resource_size_t start, @@ -24,13 +24,16 @@ static void resource_clip(struct resource *res, resource_size_t start, res->start = end + 1; } -static void remove_e820_regions(struct resource *avail) +void remove_e820_regions(struct device *dev, struct resource *avail) { int i; struct e820_entry *entry; u64 e820_start, e820_end; struct resource orig = *avail; + if (!(avail->flags & IORESOURCE_MEM)) + return; + for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; e820_start = entry->addr; @@ -38,7 +41,7 @@ static void remove_e820_regions(struct resource *avail) resource_clip(avail, e820_start, e820_end); if (orig.start != avail->start || orig.end != avail->end) { - pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", + dev_info(dev, "clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", &orig, avail, e820_start, e820_end); orig = *avail; } @@ -52,9 +55,6 @@ void arch_remove_reservations(struct resource *avail) * the low 1MB unconditionally, as this area is needed for some ISA * cards requiring a memory range, e.g. the i82365 PCMCIA controller. */ - if (avail->flags & IORESOURCE_MEM) { + if (avail->flags & IORESOURCE_MEM) resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); - - remove_e820_regions(avail); - } } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 052f1d78a562..562c81a51ea0 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -8,6 +8,7 @@ #include #include #include +#include struct pci_root_info { struct acpi_pci_root_info common; @@ -299,6 +300,10 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) int status; status = acpi_pci_probe_root_resources(ci); + + resource_list_for_each_entry(entry, &ci->resources) + remove_e820_regions(&device->dev, entry->res); + if (pci_use_crs) { resource_list_for_each_entry_safe(entry, tmp, &ci->resources) if (resource_is_pcicfg_ioport(entry->res)) -- cgit From 613fa6e217e1f216109da784d6f127cc708026c0 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:10:12 +0100 Subject: x86/PCI: Show the physical address of the $PIR table It makes no sense to hide the address of the $PIR table in a debug dump: PCI: Interrupt Routing Table found at 0x(ptrval) let alone print its virtual address, given that this is a BIOS entity at a fixed location in the system's memory map. Show the physical address instead then, e.g.: PCI: Interrupt Routing Table found at 0xfde10 Signed-off-by: Maciej W. 
Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301532330.22465@angie.orcam.me.uk --- arch/x86/pci/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 97b63e35e152..a33fe9c811c7 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -84,8 +84,8 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) for (i = 0; i < rt->size; i++) sum += addr[i]; if (!sum) { - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", - rt); + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%lx\n", + __pa(rt)); return rt; } return NULL; -- cgit From dc0e64087213768a6232af980076a517aaaa4adb Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:10:17 +0100 Subject: x86/PCI: Include function number in $PIR table dump Contrary to the PCI BIOS specification[1] some systems include the PCI function number for motherboard devices in their $PIR table, e.g. this is what the Tyan Tomcat IV S1564D board reports: 00:14 slot=01 0:60/deb8 1:61/deb8 2:62/deb8 3:63/deb8 00:13 slot=02 0:61/deb8 1:62/deb8 2:63/deb8 3:60/deb8 00:12 slot=03 0:62/deb8 1:63/deb8 2:60/deb8 3:61/deb8 00:11 slot=04 0:63/deb8 1:60/deb8 2:61/deb8 3:62/deb8 00:07 slot=00 0:00/deb8 1:00/deb8 2:00/deb8 3:00/deb8 00:07 slot=00 0:00/deb8 1:00/deb8 2:00/deb8 3:63/deb8 Print the function number then in the debug $PIR table dump: 00:14.0 slot=01 0:60/deb8 1:61/deb8 2:62/deb8 3:63/deb8 00:13.0 slot=02 0:61/deb8 1:62/deb8 2:63/deb8 3:60/deb8 00:12.0 slot=03 0:62/deb8 1:63/deb8 2:60/deb8 3:61/deb8 00:11.0 slot=04 0:63/deb8 1:60/deb8 2:61/deb8 3:62/deb8 00:07.1 slot=00 0:00/deb8 1:00/deb8 2:00/deb8 3:00/deb8 00:07.2 slot=00 0:00/deb8 1:00/deb8 2:00/deb8 3:63/deb8 References: [1] "PCI BIOS Specification", Revision 2.1, PCI Special Interest Group, August 26, 1994, Table 4-1 "Layout of IRQ routing table entry.", p. 12 Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301534440.22465@angie.orcam.me.uk --- arch/x86/pci/irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index a33fe9c811c7..b6b985338d4e 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -135,7 +135,8 @@ static void __init pirq_peer_trick(void) #ifdef DEBUG { int j; - DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); + DBG(KERN_DEBUG "%02x:%02x.%x slot=%02x", + e->bus, e->devfn / 8, e->devfn % 8, e->slot); for (j = 0; j < 4; j++) DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); DBG("\n"); -- cgit From 3132450254f28428cb0a4368b0115a26cd85d170 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:10:21 +0100 Subject: x86/PCI: Also match function number in $PIR table Contrary to the PCI BIOS specification[1] some systems include the PCI function number for onboard devices in their $PIR table. Consequently the wrong entry can be matched leading to interrupt routing failures. For example the Tyan Tomcat IV S1564D board has: 00:07.1 slot=00 0:00/deb8 1:00/deb8 2:00/deb8 3:00/deb8 00:07.2 slot=00 0:00/deb8 1:00/deb8 2:00/deb8 3:63/deb8 for its IDE interface and USB controller functions of the 82371SB PIIX3 southbridge. 
Consequently the first entry matches causing the inability to route the USB interrupt in the `noapic' mode, in which case we need to rely on the interrupt line set by the BIOS: uhci_hcd 0000:00:07.2: runtime IRQ mapping not provided by arch uhci_hcd 0000:00:07.2: PCI INT D not routed uhci_hcd 0000:00:07.2: enabling bus mastering uhci_hcd 0000:00:07.2: UHCI Host Controller uhci_hcd 0000:00:07.2: new USB bus registered, assigned bus number 1 uhci_hcd 0000:00:07.2: irq 11, io base 0x00006000 Try to match the PCI device and function combined then and if that fails move on to PCI device matching only. Compliant systems will only have a single $PIR table entry per PCI device, so this update does not change the semantics with them, while systems that have several entries for individual functions of a single PCI device each will match the correct entry: uhci_hcd 0000:00:07.2: runtime IRQ mapping not provided by arch uhci_hcd 0000:00:07.2: PCI INT D -> PIRQ 63, mask deb8, excl 0c20 uhci_hcd 0000:00:07.2: PCI INT D -> newirq 11 uhci_hcd 0000:00:07.2: found PCI INT D -> IRQ 11 uhci_hcd 0000:00:07.2: sharing IRQ 11 with 0000:00:11.0 uhci_hcd 0000:00:07.2: enabling bus mastering uhci_hcd 0000:00:07.2: UHCI Host Controller uhci_hcd 0000:00:07.2: new USB bus registered, assigned bus number 1 uhci_hcd 0000:00:07.2: irq 11, io base 0x00006000 [1] "PCI BIOS Specification", Revision 2.1, PCI Special Interest Group, August 26, 1994, Table 4-1 "Layout of IRQ routing table entry.", p. 12 Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301536020.22465@angie.orcam.me.uk --- arch/x86/pci/irq.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index b6b985338d4e..dcb9c21c714c 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1132,18 +1132,29 @@ static void __init pirq_find_router(struct irq_router *r) /* The device remains referenced for the kernel lifetime */ } +/* + * We're supposed to match on the PCI device only and not the function, + * but some BIOSes build their tables with the PCI function included + * for motherboard devices, so if a complete match is found, then give + * it precedence over a slot match. + */ static struct irq_info *pirq_get_info(struct pci_dev *dev) { struct irq_routing_table *rt = pirq_table; int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); + struct irq_info *slotinfo = NULL; struct irq_info *info; for (info = rt->slots; entries--; info++) - if (info->bus == dev->bus->number && - PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) - return info; - return NULL; + if (info->bus == dev->bus->number) { + if (info->devfn == dev->devfn) + return info; + if (!slotinfo && + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) + slotinfo = info; + } + return slotinfo; } static int pcibios_lookup_irq(struct pci_dev *dev, int assign) -- cgit From d88a8b1cf472a245e146f2edfc65f37db860836a Mon Sep 17 00:00:00 2001 From: "Maciej W. 
Rozycki" Date: Thu, 31 Mar 2022 08:10:25 +0100 Subject: x86/PCI: Handle IRQ swizzling with PIRQ routers Similarly to MP-tables PIRQ routing tables may not list devices behind PCI-to-PCI bridges, leading to interrupt routing failures, e.g.: pci 0000:00:07.0: PIIX/ICH IRQ router [8086:7000] pci 0000:02:00.0: ignoring bogus IRQ 255 pci 0000:02:01.0: ignoring bogus IRQ 255 pci 0000:02:02.0: ignoring bogus IRQ 255 pci 0000:04:00.0: ignoring bogus IRQ 255 pci 0000:04:00.3: ignoring bogus IRQ 255 pci 0000:00:11.0: PCI INT A -> PIRQ 63, mask deb8, excl 0c20 pci 0000:00:11.0: PCI INT A -> newirq 0 PCI: setting IRQ 11 as level-triggered pci 0000:00:11.0: found PCI INT A -> IRQ 11 pci 0000:00:11.0: sharing IRQ 11 with 0000:00:07.2 pci 0000:02:00.0: PCI INT A not found in routing table pci 0000:02:01.0: PCI INT A not found in routing table pci 0000:02:02.0: PCI INT A not found in routing table pci 0000:04:00.0: PCI INT A not found in routing table pci 0000:04:00.3: PCI INT D not found in routing table pci 0000:06:05.0: PCI INT A not found in routing table pci 0000:06:08.0: PCI INT A not found in routing table pci 0000:06:08.1: PCI INT B not found in routing table pci 0000:06:08.2: PCI INT C not found in routing table and consequently non-working devices. Since PCI-to-PCI bridges have a standardised way of routing interrupts by the means of swizzling do it for configurations that use a PIRQ router as well, like with APIC-based setups, and use the determined corresponding topmost bridge's interrupt pin assignment to route a given device's interrupt: pci 0000:00:07.0: PIIX/ICH IRQ router [8086:7000] pci 0000:02:00.0: ignoring bogus IRQ 255 pci 0000:02:01.0: ignoring bogus IRQ 255 pci 0000:02:02.0: ignoring bogus IRQ 255 pci 0000:04:00.0: ignoring bogus IRQ 255 pci 0000:04:00.3: ignoring bogus IRQ 255 pci 0000:00:11.0: PCI INT A -> PIRQ 63, mask deb8, excl 0c20 pci 0000:00:11.0: PCI INT A -> newirq 0 PCI: setting IRQ 11 as level-triggered pci 0000:00:11.0: found PCI INT A -> IRQ 11 pci 0000:00:11.0: sharing IRQ 11 with 0000:00:07.2 pci 0000:02:00.0: using bridge 0000:00:11.0 INT A to get INT A pci 0000:00:11.0: sharing IRQ 11 with 0000:02:00.0 pci 0000:02:01.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:02:02.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:04:00.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:04:00.3: using bridge 0000:00:11.0 INT A to get INT D pci 0000:00:11.0: sharing IRQ 11 with 0000:04:00.3 pci 0000:06:05.0: using bridge 0000:00:11.0 INT D to get INT A pci 0000:06:08.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:06:08.1: using bridge 0000:00:11.0 INT D to get INT B pci 0000:06:08.2: using bridge 0000:00:11.0 INT A to get INT C pci 0000:00:11.0: sharing IRQ 11 with 0000:06:08.2 pci 0000:02:01.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:02:01.0: PCI INT A -> PIRQ 60, mask deb8, excl 0c20 pci 0000:02:01.0: PCI INT A -> newirq 0 PCI: setting IRQ 10 as level-triggered pci 0000:02:01.0: found PCI INT A -> IRQ 10 pci 0000:02:01.0: sharing IRQ 10 with 0000:00:14.0 pci 0000:02:00.0: using bridge 0000:00:11.0 INT A to get INT A pci 0000:02:01.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:02:02.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:04:00.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:02:01.0: sharing IRQ 10 with 0000:04:00.0 pci 0000:04:00.3: using bridge 0000:00:11.0 INT A to get INT D pci 0000:06:05.0: using bridge 0000:00:11.0 INT D to get INT A pci 0000:06:08.0: using bridge 0000:00:11.0 INT C to 
get INT A pci 0000:06:08.1: using bridge 0000:00:11.0 INT D to get INT B pci 0000:06:08.2: using bridge 0000:00:11.0 INT A to get INT C pci 0000:02:02.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:02:02.0: PCI INT A -> PIRQ 61, mask deb8, excl 0c20 pci 0000:02:02.0: PCI INT A -> newirq 0 PCI: setting IRQ 5 as level-triggered pci 0000:02:02.0: found PCI INT A -> IRQ 5 pci 0000:02:02.0: sharing IRQ 5 with 0000:00:13.0 pci 0000:02:00.0: using bridge 0000:00:11.0 INT A to get INT A pci 0000:02:01.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:02:02.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:04:00.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:04:00.3: using bridge 0000:00:11.0 INT A to get INT D pci 0000:06:05.0: using bridge 0000:00:11.0 INT D to get INT A pci 0000:06:08.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:02:02.0: sharing IRQ 5 with 0000:06:08.0 pci 0000:06:08.1: using bridge 0000:00:11.0 INT D to get INT B pci 0000:06:08.2: using bridge 0000:00:11.0 INT A to get INT C pci 0000:06:05.0: using bridge 0000:00:11.0 INT D to get INT A pci 0000:06:05.0: PCI INT A -> PIRQ 62, mask deb8, excl 0c20 pci 0000:06:05.0: PCI INT A -> newirq 0 pci 0000:06:05.0: found PCI INT A -> IRQ 5 pci 0000:06:05.0: sharing IRQ 5 with 0000:00:12.0 pci 0000:02:00.0: using bridge 0000:00:11.0 INT A to get INT A pci 0000:02:01.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:02:02.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:04:00.0: using bridge 0000:00:11.0 INT B to get INT A pci 0000:04:00.3: using bridge 0000:00:11.0 INT A to get INT D pci 0000:06:05.0: using bridge 0000:00:11.0 INT D to get INT A pci 0000:06:08.0: using bridge 0000:00:11.0 INT C to get INT A pci 0000:06:08.1: using bridge 0000:00:11.0 INT D to get INT B pci 0000:06:05.0: sharing IRQ 5 with 0000:06:08.1 pci 0000:06:08.2: using bridge 0000:00:11.0 INT A to get INT C Adjust log messages accordingly. Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301538440.22465@angie.orcam.me.uk --- arch/x86/pci/irq.c | 60 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index dcb9c21c714c..bd32e4b0579d 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1138,7 +1138,7 @@ static void __init pirq_find_router(struct irq_router *r) * for motherboard devices, so if a complete match is found, then give * it precedence over a slot match. */ -static struct irq_info *pirq_get_info(struct pci_dev *dev) +static struct irq_info *pirq_get_dev_info(struct pci_dev *dev) { struct irq_routing_table *rt = pirq_table; int entries = (rt->size - sizeof(struct irq_routing_table)) / @@ -1157,11 +1157,42 @@ static struct irq_info *pirq_get_info(struct pci_dev *dev) return slotinfo; } +/* + * Buses behind bridges are typically not listed in the PIRQ routing table. + * Do the usual dance then and walk the tree of bridges up adjusting the + * pin number accordingly on the way until the originating root bus device + * has been reached and then use its routing information. 
+ */ +static struct irq_info *pirq_get_info(struct pci_dev *dev, u8 *pin) +{ + struct pci_dev *temp_dev = dev; + struct irq_info *info; + u8 temp_pin = *pin; + u8 dpin = temp_pin; + + info = pirq_get_dev_info(dev); + while (!info && temp_dev->bus->parent) { + struct pci_dev *bridge = temp_dev->bus->self; + + temp_pin = pci_swizzle_interrupt_pin(temp_dev, temp_pin); + info = pirq_get_dev_info(bridge); + if (info) + dev_warn(&dev->dev, + "using bridge %s INT %c to get INT %c\n", + pci_name(bridge), + 'A' + temp_pin - 1, 'A' + dpin - 1); + + temp_dev = bridge; + } + *pin = temp_pin; + return info; +} + static int pcibios_lookup_irq(struct pci_dev *dev, int assign) { - u8 pin; struct irq_info *info; int i, pirq, newirq; + u8 dpin, pin; int irq = 0; u32 mask; struct irq_router *r = &pirq_router; @@ -1169,8 +1200,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) char *msg = NULL; /* Find IRQ pin */ - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (!pin) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &dpin); + if (!dpin) { dev_dbg(&dev->dev, "no interrupt pin\n"); return 0; } @@ -1183,20 +1214,21 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!pirq_table) return 0; - info = pirq_get_info(dev); + pin = dpin; + info = pirq_get_info(dev, &pin); if (!info) { dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n", - 'A' + pin - 1); + 'A' + dpin - 1); return 0; } pirq = info->irq[pin - 1].link; mask = info->irq[pin - 1].bitmap; if (!pirq) { - dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1); + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + dpin - 1); return 0; } dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x", - 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs); + 'A' + dpin - 1, pirq, mask, pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -1238,7 +1270,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) newirq = i; } } - dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq); + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + dpin - 1, newirq); /* Check if it is hardcoded */ if ((pirq & 0xf0) == 0xf0) { @@ -1272,15 +1304,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 0; } } - dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", + msg, 'A' + dpin - 1, irq); /* Update IRQ for all devices with the same pirq value */ for_each_pci_dev(dev2) { - pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); - if (!pin) + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &dpin); + if (!dpin) continue; - info = pirq_get_info(dev2); + pin = dpin; + info = pirq_get_info(dev2, &pin); if (!info) continue; if (info->irq[pin - 1].link == pirq) { -- cgit From 5a0e5fa957db79177baa851d687b6f6aa5a0be96 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:10:39 +0100 Subject: x86/PCI: Disambiguate SiS85C503 PIRQ router code entities In preparation to adding support for the SiS85C497 PIRQ router add `503' to the names of SiS85C503 PIRQ router code entities so that they clearly indicate which device they refer to. Also restructure `sis_router_probe' such that new device IDs will be just new switch cases. No functional change. Signed-off-by: Maciej W. 
Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301610000.22465@angie.orcam.me.uk --- arch/x86/pci/irq.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index bd32e4b0579d..e5bc9f7a17c3 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -641,11 +641,12 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, * bit 6-4 are probably unused, not like 5595 */ -#define PIRQ_SIS_IRQ_MASK 0x0f -#define PIRQ_SIS_IRQ_DISABLE 0x80 -#define PIRQ_SIS_USB_ENABLE 0x40 +#define PIRQ_SIS503_IRQ_MASK 0x0f +#define PIRQ_SIS503_IRQ_DISABLE 0x80 +#define PIRQ_SIS503_USB_ENABLE 0x40 -static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +static int pirq_sis503_get(struct pci_dev *router, struct pci_dev *dev, + int pirq) { u8 x; int reg; @@ -654,10 +655,11 @@ static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) if (reg >= 0x01 && reg <= 0x04) reg += 0x40; pci_read_config_byte(router, reg, &x); - return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK); + return (x & PIRQ_SIS503_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS503_IRQ_MASK); } -static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int pirq_sis503_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) { u8 x; int reg; @@ -666,8 +668,8 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i if (reg >= 0x01 && reg <= 0x04) reg += 0x40; pci_read_config_byte(router, reg, &x); - x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE); - x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE; + x &= ~(PIRQ_SIS503_IRQ_MASK | PIRQ_SIS503_IRQ_DISABLE); + x |= irq ? irq : PIRQ_SIS503_IRQ_DISABLE; pci_write_config_byte(router, reg, x); return 1; } @@ -959,13 +961,14 @@ static __init int serverworks_router_probe(struct irq_router *r, static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - if (device != PCI_DEVICE_ID_SI_503) - return 0; - - r->name = "SIS"; - r->get = pirq_sis_get; - r->set = pirq_sis_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_SI_503: + r->name = "SiS85C503"; + r->get = pirq_sis503_get; + r->set = pirq_sis503_set; + return 1; + } + return 0; } static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -- cgit From fe62bc23620fa027162e05594a610ff5e556496a Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:10:46 +0100 Subject: x86/PCI: Add support for the SiS85C497 PIRQ router The SiS 85C496/497 486 Green PC VESA/ISA/PCI Chipset has support for PCI steering and the ELCR register implemented. These features are handled by the SiS85C497 AT Bus Controller & Megacell (ATM) ISA bridge, however the device is wired as a peer bridge directly to the host bus and has its PCI configuration registers decoded at addresses 0x80-0xff by the accompanying SiS85C496 PCI & CPU Memory Controller (PCM) host bridge[1]. Therefore we need to match on the host bridge's vendor and device ID. Like with the SiS85C503 PIRQ router handle link value ranges of 1-4 and 0xc0-0xc3, corresponding respectively to PIRQ line numbers counted from 1 and link register PCI configuration space addresses. References: [1] "486 Green PC VESA/ISA/PCI Chipset, SiS 85C496/497", Rev 3.0, Silicon Integrated Systems Corp., July 1995, Part IV, Section 3. 
"PCI Configuration Space Registers (00h ~ FFh)", p. 114 Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Tested-by: Nikolai Zhubr Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301610490.22465@angie.orcam.me.uk --- arch/x86/pci/irq.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index e5bc9f7a17c3..4b0e008aeb30 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -580,6 +580,81 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, return 1; } + +/* + * PIRQ routing for the SiS85C497 AT Bus Controller & Megacell (ATM) + * ISA bridge used with the SiS 85C496/497 486 Green PC VESA/ISA/PCI + * Chipset. + * + * There are four PCI INTx#-to-IRQ Link registers provided in the + * SiS85C497 part of the peculiar combined 85C496/497 configuration + * space decoded by the SiS85C496 PCI & CPU Memory Controller (PCM) + * host bridge, at 0xc0/0xc1/0xc2/0xc3 respectively for the PCI INT + * A/B/C/D lines. Bit 7 enables the respective link if set and bits + * 3:0 select the 8259A IRQ line as follows: + * + * 0000 : Reserved + * 0001 : Reserved + * 0010 : Reserved + * 0011 : IRQ3 + * 0100 : IRQ4 + * 0101 : IRQ5 + * 0110 : IRQ6 + * 0111 : IRQ7 + * 1000 : Reserved + * 1001 : IRQ9 + * 1010 : IRQ10 + * 1011 : IRQ11 + * 1100 : IRQ12 + * 1101 : Reserved + * 1110 : IRQ14 + * 1111 : IRQ15 + * + * We avoid using a reserved value for disabled links, hence the + * choice of IRQ15 for that case. + * + * References: + * + * "486 Green PC VESA/ISA/PCI Chipset, SiS 85C496/497", Rev 3.0, + * Silicon Integrated Systems Corp., July 1995 + */ + +#define PCI_SIS497_INTA_TO_IRQ_LINK 0xc0u + +#define PIRQ_SIS497_IRQ_MASK 0x0fu +#define PIRQ_SIS497_IRQ_ENABLE 0x80u + +static int pirq_sis497_get(struct pci_dev *router, struct pci_dev *dev, + int pirq) +{ + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1; + + pci_read_config_byte(router, reg, &x); + return (x & PIRQ_SIS497_IRQ_ENABLE) ? (x & PIRQ_SIS497_IRQ_MASK) : 0; +} + +static int pirq_sis497_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) +{ + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1; + + pci_read_config_byte(router, reg, &x); + x &= ~(PIRQ_SIS497_IRQ_MASK | PIRQ_SIS497_IRQ_ENABLE); + x |= irq ? (PIRQ_SIS497_IRQ_ENABLE | irq) : PIRQ_SIS497_IRQ_MASK; + pci_write_config_byte(router, reg, x); + return 1; +} + /* * PIRQ routing for SiS 85C503 router used in several SiS chipsets. * We have to deal with the following issues here: @@ -962,6 +1037,11 @@ static __init int serverworks_router_probe(struct irq_router *r, static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { switch (device) { + case PCI_DEVICE_ID_SI_496: + r->name = "SiS85C497"; + r->get = pirq_sis497_get; + r->set = pirq_sis497_set; + return 1; case PCI_DEVICE_ID_SI_503: r->name = "SiS85C503"; r->get = pirq_sis503_get; -- cgit From 5d64089aa4a5bd3d7e00e3d6ddf4943dd34627b3 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:10:55 +0100 Subject: x86/PCI: Add PIRQ routing table range checks Verify that the PCI IRQ Routing Table header as well as individual slot entries are all wholly contained within the BIOS memory area. Do not even call the checksum calculator if the header would overrun the area and then bail out early if any slot would. 
Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203301735510.22465@angie.orcam.me.uk --- arch/x86/pci/irq.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 4b0e008aeb30..ef97b260a556 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -68,7 +68,8 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq; * and perform checksum verification. */ -static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr, + u8 *limit) { struct irq_routing_table *rt; int i; @@ -78,7 +79,8 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) if (rt->signature != PIRQ_SIGNATURE || rt->version != PIRQ_VERSION || rt->size % 16 || - rt->size < sizeof(struct irq_routing_table)) + rt->size < sizeof(struct irq_routing_table) || + (limit && rt->size > limit - addr)) return NULL; sum = 0; for (i = 0; i < rt->size; i++) @@ -99,17 +101,22 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) static struct irq_routing_table * __init pirq_find_routing_table(void) { + u8 * const bios_start = (u8 *)__va(0xf0000); + u8 * const bios_end = (u8 *)__va(0x100000); u8 *addr; struct irq_routing_table *rt; if (pirq_table_addr) { - rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr)); + rt = pirq_check_routing_table((u8 *)__va(pirq_table_addr), + NULL); if (rt) return rt; printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); } - for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { - rt = pirq_check_routing_table(addr); + for (addr = bios_start; + addr < bios_end - sizeof(struct irq_routing_table); + addr += 16) { + rt = pirq_check_routing_table(addr, bios_end); if (rt) return rt; } -- cgit From ac7cd5e16df8696c39e29b03dfedf069a025b822 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:11:01 +0100 Subject: x86/PCI: Handle PIRQ routing tables with no router device given PIRQ routing tables provided by the PCI BIOS usually specify the PCI vendor:device ID as well as the bus address of the device implementing the PIRQ router, e.g.: PCI: Interrupt Routing Table found at 0xc00fde10 [...] PCI: Attempting to find IRQ router for [8086:7000] pci 0000:00:07.0: PIIX/ICH IRQ router [8086:7000] however in some cases they do not, in which case we fail to match the router handler, e.g.: PCI: Interrupt Routing Table found at 0xc00fdae0 [...] PCI: Attempting to find IRQ router for [0000:0000] PCI: Interrupt router not found at 00:00 This is because we always match the vendor:device ID and the bus address literally, even if they are all zeros. Handle this case then and iterate over all PCI devices until we find a matching router handler if the vendor ID given by the routing table is the invalid value of zero: PCI: Attempting to find IRQ router for [0000:0000] PCI: Trying IRQ router for [1039:0496] pci 0000:00:05.0: SiS85C497 IRQ router [1039:0496] Signed-off-by: Maciej W. 
Rozycki Signed-off-by: Thomas Gleixner Tested-by: Nikolai Zhubr Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203302018570.9038@angie.orcam.me.uk --- arch/x86/pci/irq.c | 64 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index ef97b260a556..d4ecf881c0c8 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1175,10 +1175,32 @@ static struct pci_dev *pirq_router_dev; * chipset" ? */ +static bool __init pirq_try_router(struct irq_router *r, + struct irq_routing_table *rt, + struct pci_dev *dev) +{ + struct irq_router_handler *h; + + DBG(KERN_DEBUG "PCI: Trying IRQ router for [%04x:%04x]\n", + dev->vendor, dev->device); + + for (h = pirq_routers; h->vendor; h++) { + /* First look for a router match */ + if (rt->rtr_vendor == h->vendor && + h->probe(r, dev, rt->rtr_device)) + return true; + /* Fall back to a device match */ + if (dev->vendor == h->vendor && + h->probe(r, dev, dev->device)) + return true; + } + return false; +} + static void __init pirq_find_router(struct irq_router *r) { struct irq_routing_table *rt = pirq_table; - struct irq_router_handler *h; + struct pci_dev *dev; #ifdef CONFIG_PCI_BIOS if (!rt->signature) { @@ -1197,27 +1219,29 @@ static void __init pirq_find_router(struct irq_router *r) DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n", rt->rtr_vendor, rt->rtr_device); - pirq_router_dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus, - rt->rtr_devfn); - if (!pirq_router_dev) { - DBG(KERN_DEBUG "PCI: Interrupt router not found at " - "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); - return; + /* Use any vendor:device provided by the routing table or try all. */ + if (rt->rtr_vendor) { + dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus, + rt->rtr_devfn); + if (dev && pirq_try_router(r, rt, dev)) + pirq_router_dev = dev; + } else { + dev = NULL; + for_each_pci_dev(dev) { + if (pirq_try_router(r, rt, dev)) { + pirq_router_dev = dev; + break; + } + } } - for (h = pirq_routers; h->vendor; h++) { - /* First look for a router match */ - if (rt->rtr_vendor == h->vendor && - h->probe(r, pirq_router_dev, rt->rtr_device)) - break; - /* Fall back to a device match */ - if (pirq_router_dev->vendor == h->vendor && - h->probe(r, pirq_router_dev, pirq_router_dev->device)) - break; - } - dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", - pirq_router.name, - pirq_router_dev->vendor, pirq_router_dev->device); + if (pirq_router_dev) + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", + pirq_router.name, + pirq_router_dev->vendor, pirq_router_dev->device); + else + DBG(KERN_DEBUG "PCI: Interrupt router not found at " + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); /* The device remains referenced for the kernel lifetime */ } -- cgit From b584db0c84db5ed9230356d5fa6610de55d297e6 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:11:05 +0100 Subject: x86/PCI: Add $IRT PIRQ routing table support Handle the $IRT PCI IRQ Routing Table format used by AMI for its BCP (BIOS Configuration Program) external tool meant for tweaking BIOS structures without the need to rebuild it from sources[1]. 
The $IRT format has been invented by AMI before Microsoft has come up with its $PIR format and a $IRT table is therefore there in some systems that lack a $PIR table, such as the DataExpert EXP8449 mainboard based on the ALi FinALi 486 chipset (M1489/M1487), which predates DMI 2.0 and cannot therefore be easily identified at run time. Unlike with the $PIR format there is no alignment guarantee as to the placement of the $IRT table, so scan the whole BIOS area bytewise. Credit to Michal Necasek for helping me chase documentation for the format. References: [1] "What is BCP? - AMI", Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Tested-by: Dmitry Osipenko # crosvm Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203302228410.9038@angie.orcam.me.uk --- arch/x86/include/asm/pci_x86.h | 9 +++++ arch/x86/pci/irq.c | 76 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index a0627dfae541..1307cd689d2a 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,15 @@ struct irq_routing_table { struct irq_info slots[]; } __attribute__((packed)); +struct irt_routing_table { + u32 signature; /* IRT_SIGNATURE should be here */ + u8 size; /* Number of entries provided */ + u8 used; /* Number of entries actually used */ + u16 exclusive_irqs; /* IRQs devoted exclusively to + PCI usage */ + struct irq_info slots[]; +} __attribute__((packed)); + extern unsigned int pcibios_irq_mask; extern raw_spinlock_t pci_config_lock; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index d4ecf881c0c8..4a5e80f8fb01 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -25,6 +25,8 @@ #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) #define PIRQ_VERSION 0x0100 +#define IRT_SIGNATURE (('$' << 0) + ('I' << 8) + ('R' << 16) + ('T' << 24)) + static int broken_hp_bios_irq9; static int acer_tm360_irqrouting; @@ -93,7 +95,74 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr, return NULL; } +/* + * Handle the $IRT PCI IRQ Routing Table format used by AMI for its BCP + * (BIOS Configuration Program) external tool meant for tweaking BIOS + * structures without the need to rebuild it from sources. The $IRT + * format has been invented by AMI before Microsoft has come up with its + * $PIR format and a $IRT table is therefore there in some systems that + * lack a $PIR table. + * + * It uses the same PCI BIOS 2.1 format for interrupt routing entries + * themselves but has a different simpler header prepended instead, + * occupying 8 bytes, where a `$IRT' signature is followed by one byte + * specifying the total number of interrupt routing entries allocated in + * the table, then one byte specifying the actual number of entries used + * (which the BCP tool can take advantage of when modifying the table), + * and finally a 16-bit word giving the IRQs devoted exclusively to PCI. + * Unlike with the $PIR table there is no alignment guarantee. + * + * Given the similarity of the two formats the $IRT one is trivial to + * convert to the $PIR one, which we do here, except that obviously we + * have no information as to the router device to use, but we can handle + * it by matching PCI device IDs actually seen on the bus against ones + * that our individual routers recognise. 
+ * + * Reportedly there is another $IRT table format where a 16-bit word + * follows the header instead that points to interrupt routing entries + * in a $PIR table provided elsewhere. In that case this code will not + * be reached though as the $PIR table will have been chosen instead. + */ +static inline struct irq_routing_table *pirq_convert_irt_table(u8 *addr, + u8 *limit) +{ + struct irt_routing_table *ir; + struct irq_routing_table *rt; + u16 size; + u8 sum; + int i; + + ir = (struct irt_routing_table *)addr; + if (ir->signature != IRT_SIGNATURE || !ir->used || ir->size < ir->used) + return NULL; + + size = sizeof(*ir) + ir->used * sizeof(ir->slots[0]); + if (size > limit - addr) + return NULL; + + DBG(KERN_DEBUG "PCI: $IRT Interrupt Routing Table found at 0x%lx\n", + __pa(ir)); + + size = sizeof(*rt) + ir->used * sizeof(rt->slots[0]); + rt = kzalloc(size, GFP_KERNEL); + if (!rt) + return NULL; + rt->signature = PIRQ_SIGNATURE; + rt->version = PIRQ_VERSION; + rt->size = size; + rt->exclusive_irqs = ir->exclusive_irqs; + for (i = 0; i < ir->used; i++) + rt->slots[i] = ir->slots[i]; + + addr = (u8 *)rt; + sum = 0; + for (i = 0; i < size; i++) + sum += addr[i]; + rt->checksum = -sum; + + return rt; +} /* * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. @@ -120,6 +189,13 @@ static struct irq_routing_table * __init pirq_find_routing_table(void) if (rt) return rt; } + for (addr = bios_start; + addr < bios_end - sizeof(struct irt_routing_table); + addr++) { + rt = pirq_convert_irt_table(addr, bios_end); + if (rt) + return rt; + } return NULL; } -- cgit From 4969e223b109754c2340a26bba9b1cf44f0cba9b Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:11:10 +0100 Subject: x86/PCI: Fix ALi M1487 (IBC) PIRQ router link value interpretation Fix an issue with commit 1ce849c75534 ("x86/PCI: Add support for the ALi M1487 (IBC) PIRQ router") and correct ALi M1487 (IBC) PIRQ router link value (`pirq' cookie) interpretation according to findings in the BIOS. Credit to Nikolai Zhubr for the detective work as to the bit layout. Fixes: 1ce849c75534 ("x86/PCI: Add support for the ALi M1487 (IBC) PIRQ router") Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203310013270.44113@angie.orcam.me.uk --- arch/x86/pci/irq.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 4a5e80f8fb01..ceac715bbdc4 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -337,6 +337,15 @@ static void write_pc_conf_nybble(u8 base, u8 index, u8 val) pc_conf_set(reg, x); } +/* + * FinALi pirq rules are as follows: + * + * - bit 0 selects between INTx Routing Table Mapping Registers, + * + * - bit 3 selects the nibble within the INTx Routing Table Mapping Register, + * + * - bits 7:4 map to bits 3:0 of the PCI INTx Sensitivity Register. 
+ */ static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { @@ -344,11 +353,13 @@ static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev, 0, 9, 3, 10, 4, 5, 7, 6, 0, 11, 0, 12, 0, 14, 0, 15 }; unsigned long flags; + u8 index; u8 x; + index = (pirq & 1) << 1 | (pirq & 8) >> 3; raw_spin_lock_irqsave(&pc_conf_lock, flags); pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); - x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1)]; + x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index)]; pc_conf_set(PC_CONF_FINALI_LOCK, 0); raw_spin_unlock_irqrestore(&pc_conf_lock, flags); return x; @@ -362,13 +373,15 @@ static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev, }; u8 val = irqmap[irq]; unsigned long flags; + u8 index; if (!val) return 0; + index = (pirq & 1) << 1 | (pirq & 8) >> 3; raw_spin_lock_irqsave(&pc_conf_lock, flags); pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); - write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1, val); + write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index, val); pc_conf_set(PC_CONF_FINALI_LOCK, 0); raw_spin_unlock_irqrestore(&pc_conf_lock, flags); return 1; @@ -377,7 +390,7 @@ static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev, static int pirq_finali_lvl(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - u8 mask = ~(1u << (pirq - 1)); + u8 mask = ~((pirq & 0xf0u) >> 4); unsigned long flags; u8 trig; -- cgit From c25f23459c117d950e657458b0d3dcaaf9039ec9 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:11:14 +0100 Subject: x86/PCI: Fix coding style in PIRQ table verification Remove an extraneous space with a cast in `pirq_check_routing_table'. Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203310017260.44113@angie.orcam.me.uk --- arch/x86/pci/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index ceac715bbdc4..a498b847d740 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -77,7 +77,7 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr, int i; u8 sum; - rt = (struct irq_routing_table *) addr; + rt = (struct irq_routing_table *)addr; if (rt->signature != PIRQ_SIGNATURE || rt->version != PIRQ_VERSION || rt->size % 16 || -- cgit From 3a5ff1f6dd50f5e1c2aa87491910dd6d275af24b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 10 Feb 2022 14:49:00 -0800 Subject: x86: Replace cpumask_weight() with cpumask_empty() where appropriate In some cases, x86 code calls cpumask_weight() to check if any bit of a given cpumask is set. This can be done more efficiently with cpumask_empty() because cpumask_empty() stops traversing the cpumask as soon as it finds first set bit, while cpumask_weight() counts all bits unconditionally. 
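For illustration, a generic example of the pattern being replaced (not code from the patch below); both forms ask the same question, but only the second may stop early:

	if (cpumask_weight(tmpmask))	/* counts every set bit, result only compared with 0 */
		do_something();

	if (!cpumask_empty(tmpmask))	/* equivalent test, can stop at the first set bit */
		do_something();

Here do_something() and tmpmask are placeholders; the real call sites are in the resctrl, mmiotrace and UV NMI code touched below.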
Signed-off-by: Yury Norov Signed-off-by: Thomas Gleixner Reviewed-by: Steve Wahl Link: https://lore.kernel.org/r/20220210224933.379149-17-yury.norov@gmail.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 14 +++++++------- arch/x86/mm/mmio-mod.c | 2 +- arch/x86/platform/uv/uv_nmi.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 83f901e2c2df..f276aff521e8 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Give any dropped cpus to parent rdtgroup */ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); update_closid_rmid(tmpmask, prgrp); @@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu rmid */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { if (crgrp == rdtgrp) @@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Can't drop from default group */ if (rdtgrp == &rdtgroup_default) { rdt_last_cmd_puts("Can't drop CPUs from default group\n"); @@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu closid/rmid. 
*/ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { if (r == rdtgrp) continue; cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (cpumask_weight(tmpmask1)) + if (!cpumask_empty(tmpmask1)) cpumask_rdtgrp_clear(r, tmpmask1); } update_closid_rmid(tmpmask, rdtgrp); @@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { ret = -EINVAL; rdt_last_cmd_puts("Can only assign online CPUs\n"); goto unlock; diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 933a2ebad471..c3317f0650d8 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -400,7 +400,7 @@ static void leave_uniprocessor(void) int cpu; int err; - if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) + if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus)) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 1e9ff28bc2e0..ea277fc08357 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -985,7 +985,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) /* Clear global flags */ if (master) { - if (cpumask_weight(uv_nmi_cpu_mask)) + if (!cpumask_empty(uv_nmi_cpu_mask)) uv_nmi_cleanup_mask(); atomic_set(&uv_nmi_cpus_in_nmi, -1); atomic_set(&uv_nmi_cpu, -1); -- cgit From c2a911d302b0d014a4d0d732a2bfc319e643eb62 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 10 Feb 2022 14:49:09 -0800 Subject: x86/mm: Replace nodes_weight() with nodes_empty() where appropriate Various mm code calls nodes_weight() to check if any bit of a given nodemask is set. This can be done more efficiently with nodes_empty() because nodes_empty() stops traversing the nodemask as soon as it finds first set bit, while nodes_weight() counts all bits unconditionally. Signed-off-by: Yury Norov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220210224933.379149-26-yury.norov@gmail.com --- arch/x86/mm/amdtopology.c | 2 +- arch/x86/mm/numa_emulation.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 058b2f36b3a6..b3ca7d23e4b0 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -154,7 +154,7 @@ int __init amd_numa_init(void) node_set(nodeid, numa_nodes_parsed); } - if (!nodes_weight(numa_nodes_parsed)) + if (nodes_empty(numa_nodes_parsed)) return -ENOENT; /* diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 1a02b791d273..9a9305367fdd 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. */ - while (nodes_weight(physnode_mask)) { + while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; @@ -270,7 +270,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. 
*/ - while (nodes_weight(physnode_mask)) { + while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; -- cgit From adb5680b8dfdd09756f13450bfc1ed874d895935 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sun, 10 Apr 2022 23:00:25 +0300 Subject: x86/kaslr: Fix build warning in KASLR code in boot stub lib/kaslr.c is used by both the main kernel and the boot stub. It includes asm/io.h which is supposed to be used in the main kernel. It leads to build warnings like this with clang 13: warning: implicit declaration of function 'outl' is invalid in C99 [-Wimplicit-function-declaration] Replace with which is suitable for both cases. Fixes: 1e8f93e18379 ("x86: Consolidate port I/O helpers") Reported-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220410200025.3stf4jjvwfe5oxew@box.shutemov.name --- arch/x86/lib/kaslr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index 2b3eb8c948a3..a58f451a7dd3 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include /* * When built for the regular kernel, several functions need to be stubbed out -- cgit From c7bda0dca98cca351a9bc852b3df8b9b99ffd400 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 13 Jan 2022 14:26:10 +0100 Subject: x86: Remove a.out support Commit eac616557050 ("x86: Deprecate a.out support") deprecated a.out support with the promise to remove it a couple of releases later. That commit landed in v5.1. Now it is more than a couple of releases later, no one has complained so remove it. Fold in a hunk removing the reference to arch/x86/ia32/ia32_aout.c in MAINTAINERS: https://lore.kernel.org/r/20220316050828.17255-1-lukas.bulwahn@gmail.com Signed-off-by: Borislav Petkov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220113160115.5375-1-bp@alien8.de --- arch/x86/Kconfig | 7 - arch/x86/ia32/Makefile | 2 - arch/x86/ia32/ia32_aout.c | 325 ---------------------------------------------- 3 files changed, 334 deletions(-) delete mode 100644 arch/x86/ia32/ia32_aout.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..69231f66a2b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2838,13 +2838,6 @@ config IA32_EMULATION 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any 32-bit programs left. -config IA32_AOUT - tristate "IA32 a.out support" - depends on IA32_EMULATION - depends on BROKEN - help - Support old a.out binaries in the 32bit emulation. 
- config X86_X32_ABI bool "x32 ABI for 64-bit mode" depends on X86_64 diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index 8e4d0391ff6c..e481056698de 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -5,7 +5,5 @@ obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o -obj-$(CONFIG_IA32_AOUT) += ia32_aout.o - audit-class-$(CONFIG_AUDIT) := audit.o obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c deleted file mode 100644 index 9bd15241fadb..000000000000 --- a/arch/x86/ia32/ia32_aout.c +++ /dev/null @@ -1,325 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * a.out loader for x86-64 - * - * Copyright (C) 1991, 1992, 1996 Linus Torvalds - * Hacked together by Andi Kleen - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#undef WARN_OLD - -static int load_aout_binary(struct linux_binprm *); -static int load_aout_library(struct file *); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, -}; - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end <= start) - return 0; - return vm_brk(start, end - start); -} - - -/* - * create_aout_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. - */ -static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) -{ - u32 __user *argv, *envp, *sp; - int argc = bprm->argc, envc = bprm->envc; - - sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); - sp -= envc+1; - envp = sp; - sp -= argc+1; - argv = sp; - put_user((unsigned long) envp, --sp); - put_user((unsigned long) argv, --sp); - put_user(argc, --sp); - current->mm->arg_start = (unsigned long) p; - while (argc-- > 0) { - char c; - - put_user((u32)(unsigned long)p, argv++); - do { - get_user(c, p++); - } while (c); - } - put_user(0, argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-- > 0) { - char c; - - put_user((u32)(unsigned long)p, envp++); - do { - get_user(c, p++); - } while (c); - } - put_user(0, envp); - current->mm->env_end = (unsigned long) p; - return sp; -} - -/* - * These are the functions used to load a.out style executables and shared - * libraries. There is no binary dependent code anywhere else. - */ -static int load_aout_binary(struct linux_binprm *bprm) -{ - unsigned long error, fd_offset, rlim; - struct pt_regs *regs = current_pt_regs(); - struct exec ex; - int retval; - - ex = *((struct exec *) bprm->buf); /* exec-header */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && - N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || - N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(file_inode(bprm->file)) < - ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - return -ENOEXEC; - } - - fd_offset = N_TXTOFF(ex); - - /* Check initial limits. This avoids letting people circumvent - * size limits imposed on them by creating programs with large - * arrays in the data or bss. 
- */ - rlim = rlimit(RLIMIT_DATA); - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - - /* Flush all traces of the currently running executable */ - retval = begin_new_exec(bprm); - if (retval) - return retval; - - /* OK, This is the point of no return */ - set_personality(PER_LINUX); - set_personality_ia32(false); - - setup_new_exec(bprm); - - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - - current->mm->end_code = ex.a_text + - (current->mm->start_code = N_TXTADDR(ex)); - current->mm->end_data = ex.a_data + - (current->mm->start_data = N_DATADDR(ex)); - current->mm->brk = ex.a_bss + - (current->mm->start_brk = N_BSSADDR(ex)); - - retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) - return retval; - - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - - text_addr = N_TXTADDR(ex); - map_size = ex.a_text+ex.a_data; - - error = vm_brk(text_addr & PAGE_MASK, map_size); - - if (error) - return error; - - error = read_code(bprm->file, text_addr, 32, - ex.a_text + ex.a_data); - if ((signed long)error < 0) - return error; - } else { -#ifdef WARN_OLD - static unsigned long error_time, error_time2; - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && - time_after(jiffies, error_time2 + 5*HZ)) { - printk(KERN_NOTICE "executable not page aligned\n"); - error_time2 = jiffies; - } - - if ((fd_offset & ~PAGE_MASK) != 0 && - time_after(jiffies, error_time + 5*HZ)) { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert " - "program: %pD\n", - bprm->file); - error_time = jiffies; - } -#endif - - if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { - error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (error) - return error; - - read_code(bprm->file, N_TXTADDR(ex), fd_offset, - ex.a_text+ex.a_data); - goto beyond_if; - } - - error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - fd_offset); - - if (error != N_TXTADDR(ex)) - return error; - - error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) - return error; - } - -beyond_if: - error = set_brk(current->mm->start_brk, current->mm->brk); - if (error) - return error; - - set_binfmt(&aout_format); - - current->mm->start_stack = - (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); - /* start thread */ - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - (regs)->ip = ex.a_entry; - (regs)->sp = current->mm->start_stack; - (regs)->flags = 0x200; - (regs)->cs = __USER32_CS; - (regs)->ss = __USER32_DS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = - regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - return 0; -} - -static int load_aout_library(struct file *file) -{ - unsigned long bss, start_addr, len, error; - int retval; - struct exec ex; - loff_t pos = 0; - - retval = -ENOEXEC; - error = kernel_read(file, &ex, sizeof(ex), &pos); - if (error != sizeof(ex)) - goto out; - - /* We come in here for the regular a.out style of shared libraries */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || - N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(file_inode(file)) < - ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - 
goto out; - } - - if (N_FLAGS(ex)) - goto out; - - /* For QMAGIC, the starting address is 0x20 into the page. We mask - this off to get the starting address for the page */ - - start_addr = ex.a_entry & 0xfffff000; - - if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { -#ifdef WARN_OLD - static unsigned long error_time; - if (time_after(jiffies, error_time + 5*HZ)) { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert " - "library: %pD\n", - file); - error_time = jiffies; - } -#endif - retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (retval) - goto out; - - read_code(file, start_addr, N_TXTOFF(ex), - ex.a_text + ex.a_data); - retval = 0; - goto out; - } - /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - N_TXTOFF(ex)); - retval = error; - if (error != start_addr) - goto out; - - len = PAGE_ALIGN(ex.a_text + ex.a_data); - bss = ex.a_text + ex.a_data + ex.a_bss; - if (bss > len) { - retval = vm_brk(start_addr + len, bss - len); - if (retval) - goto out; - } - retval = 0; -out: - return retval; -} - -static int __init init_aout_binfmt(void) -{ - register_binfmt(&aout_format); - return 0; -} - -static void __exit exit_aout_binfmt(void) -{ - unregister_binfmt(&aout_format); -} - -module_init(init_aout_binfmt); -module_exit(exit_aout_binfmt); -MODULE_LICENSE("GPL"); -- cgit From e8a69f12f01f487c6a0e704eb14ccf2dd015277d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 9 Feb 2022 09:02:13 +0100 Subject: x86/xen: Allow to retry if cpu_initialize_context() failed. If memory allocation in cpu_initialize_context() fails then it will bring up the VCPU and leave with the corresponding CPU bit set in xen_cpu_initialized_map. The following (presumably successful) CPU bring up will BUG in xen_pv_cpu_up() because nothing for that VCPU would be initialized. Clear the CPU bits, that were set in cpu_initialize_context() in case the memory allocation fails. [ bigeasy: Creating a patch from Boris' email. ] Signed-off-by: Boris Ostrovsky Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220209080214.1439408-2-bigeasy@linutronix.de --- arch/x86/xen/smp_pv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 688aa8b6ae29..ba7af2eca755 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -260,8 +260,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt == NULL) + if (ctxt == NULL) { + cpumask_clear_cpu(cpu, xen_cpu_initialized_map); + cpumask_clear_cpu(cpu, cpu_callout_mask); return -ENOMEM; + } gdt = get_cpu_gdt_rw(cpu); -- cgit From f5d9283ecb33329073033029fe427155aa0abfb1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 25 Mar 2022 11:39:50 -0400 Subject: x86/32: Simplify ELF_CORE_COPY_REGS GS is now always a user segment, so there is no difference between user and kernel registers. 
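A minimal sketch of what the unified macro boils down to for the GS slot (illustrative only, the full macro is in the diff below): with GS always holding a user selector, reading it live from the register is correct for both user and kernel core dumps.

	savesegment(gs, pr_reg[10]);	/* roughly: asm("mov %%gs,%0" : "=r" (pr_reg[10])) */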
Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220325153953.162643-2-brgerst@gmail.com --- arch/x86/include/asm/elf.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 29fea180a665..cb0ff1055ab1 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -116,7 +116,7 @@ extern unsigned int vdso32_enabled; * now struct_user_regs, they are different) */ -#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \ +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ pr_reg[0] = regs->bx; \ pr_reg[1] = regs->cx; \ @@ -128,6 +128,7 @@ do { \ pr_reg[7] = regs->ds; \ pr_reg[8] = regs->es; \ pr_reg[9] = regs->fs; \ + savesegment(gs, pr_reg[10]); \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ pr_reg[13] = regs->cs; \ @@ -136,18 +137,6 @@ do { \ pr_reg[16] = regs->ss; \ } while (0); -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ -do { \ - ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ - pr_reg[10] = get_user_gs(regs); \ -} while (0); - -#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \ -do { \ - ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ - savesegment(gs, pr_reg[10]); \ -} while (0); - #define ELF_PLATFORM (utsname()->machine) #define set_personality_64bit() do { } while (0) -- cgit From 84958f38d897f85b34036356f64e908e4754170f Mon Sep 17 00:00:00 2001 From: Amadeusz SÅ‚awiÅ„ski Date: Tue, 29 Mar 2022 15:33:52 +0200 Subject: x86/ACPI: Preserve ACPI-table override during hibernation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When overriding NHLT ACPI-table tests show that on some platforms there is problem that NHLT contains garbage after hibernation/resume cycle. Problem stems from the fact that ACPI override performs early memory allocation using memblock_phys_alloc_range() in memblock_phys_alloc_range(). This memory block is later being marked as ACPI memory block in arch_reserve_mem_area(). Later when memory areas are considered for hibernation it is being marked as nosave in e820__register_nosave_regions(). Fix this by marking ACPI override memory area as ACPI NVS (Non-Volatile-Sleeping), which according to specification needs to be saved on entering S4 and restored when leaving and is implemented as such in kernel. Signed-off-by: Amadeusz SÅ‚awiÅ„ski Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0d01e7f5078c..2eeca97b730b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1772,7 +1772,7 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820__range_add(addr, size, E820_TYPE_ACPI); + e820__range_add(addr, size, E820_TYPE_NVS); e820__update_table_print(); } -- cgit From daf3af4705ba8f49d33ea9b7bafdc9fd9efd49e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Apr 2022 22:34:21 +0200 Subject: x86/apic: Clarify i82489DX bit overlap in APIC_LVT0 Daniel stumbled over the bit overlap of the i82498DX external APIC and the TSC deadline timer configuration bit in modern APICs, which is neither documented in the code nor in the current SDM. Maciej provided links to the original i82489DX/486 documentation. See Link. 
Remove the i82489DX macro maze, use a i82489DX specific define in the apic code and document the overlap in a comment. Reported-by: Daniel Vacek Signed-off-by: Thomas Gleixner Cc: Maciej W. Rozycki Link: https://lore.kernel.org/r/87ee22f3ci.ffs@tglx --- arch/x86/include/asm/apicdef.h | 6 ------ arch/x86/kernel/apic/apic.c | 11 ++++++++++- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 5716f22f81ac..92035eb3afee 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -95,12 +95,6 @@ #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 #define APIC_LVT0 0x350 -#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) -#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) -#define SET_APIC_TIMER_BASE(x) (((x) << 18)) -#define APIC_TIMER_BASE_CLKIN 0x0 -#define APIC_TIMER_BASE_TMBASE 0x1 -#define APIC_TIMER_BASE_DIV 0x2 #define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17) #define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..13819bfa8dde 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -320,6 +320,9 @@ int lapic_get_maxlvt(void) #define APIC_DIVISOR 16 #define TSC_DIVISOR 8 +/* i82489DX specific */ +#define I82489DX_BASE_DIVIDER (((0x2) << 18)) + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -340,8 +343,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + /* + * The i82489DX APIC uses bit 18 and 19 for the base divider. This + * overlaps with bit 18 on integrated APICs, but is not documented + * in the SDM. No problem though. i82489DX equipped systems do not + * have TSC deadline timer. + */ if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + lvtt_value |= I82489DX_BASE_DIVIDER; if (!irqen) lvtt_value |= APIC_LVT_MASKED; -- cgit From 1227418989346af3af179742cf42ce842e0ad484 Mon Sep 17 00:00:00 2001 From: Dov Murik Date: Tue, 12 Apr 2022 21:21:24 +0000 Subject: efi: Save location of EFI confidential computing area Confidential computing (coco) hardware such as AMD SEV (Secure Encrypted Virtualization) allows a guest owner to inject secrets into the VMs memory without the host/hypervisor being able to read them. Firmware support for secret injection is available in OVMF, which reserves a memory area for secret injection and includes a pointer to it the in EFI config table entry LINUX_EFI_COCO_SECRET_TABLE_GUID. If EFI exposes such a table entry, uefi_init() will keep a pointer to the EFI config table entry in efi.coco_secret, so it can be used later by the kernel (specifically drivers/virt/coco/efi_secret). It will also appear in the kernel log as "CocoSecret=ADDRESS"; for example: [ 0.000000] efi: EFI v2.70 by EDK II [ 0.000000] efi: CocoSecret=0x7f22e680 SMBIOS=0x7f541000 ACPI=0x7f77e000 ACPI 2.0=0x7f77e014 MEMATTR=0x7ea0c018 The new functionality can be enabled with CONFIG_EFI_COCO_SECRET=y. 
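(For illustration only: the arch/x86 hunk below just adds &efi.coco_secret to the arch's list of known config-table pointers; the GUID match that actually fills it in lives in the generic EFI config-table parser. A minimal sketch of that step follows — the entry layout and parser behaviour are assumptions modelled on how other config tables are registered, not text from this patch.)

	/* Sketch (assumed): the generic EFI code matches the GUID and records
	 * the table's physical address so efi.coco_secret can be used later,
	 * e.g. by drivers/virt/coco/efi_secret. */
	static const efi_config_table_type_t common_tables[] __initconst = {
	#ifdef CONFIG_EFI_COCO_SECRET
		{ LINUX_EFI_COCO_SECRET_TABLE_GUID, &efi.coco_secret, "CocoSecret" },
	#endif
		{ },
	};
	/* efi_config_parse_tables() walks the firmware's table array, compares
	 * each GUID against entries like the one above and, on a match, stores
	 * the table address through the registered pointer. */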
Signed-off-by: Dov Murik Reviewed-by: Gerd Hoffmann Link: https://lore.kernel.org/r/20220412212127.154182-2-dovmurik@linux.ibm.com Signed-off-by: Ard Biesheuvel --- arch/x86/platform/efi/efi.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 147c30a81f15..1591d67e0bcd 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -93,6 +93,9 @@ static const unsigned long * const efi_tables[] = { #ifdef CONFIG_LOAD_UEFI_KEYS &efi.mokvar_table, #endif +#ifdef CONFIG_EFI_COCO_SECRET + &efi.coco_secret, +#endif }; u64 efi_setup; /* efi setup_data physical address */ -- cgit From b57a7c9dd732ca29c4400a9a710c56c55877086d Mon Sep 17 00:00:00 2001 From: Eric DeVolder Date: Wed, 13 Apr 2022 12:42:30 -0400 Subject: x86/crash: Fix minor typo/bug in debug message The pr_debug() intends to display the memsz member, but the parameter is actually the bufsz member (which is already displayed). Correct this to display memsz value. Signed-off-by: Eric DeVolder Signed-off-by: Borislav Petkov Acked-by: Baoquan He Link: https://lore.kernel.org/r/20220413164237.20845-2-eric.devolder@oracle.com --- arch/x86/kernel/crash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e8326a8d1c5d..9730c88530fc 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -407,7 +407,7 @@ int crash_load_segments(struct kimage *image) } image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.bufsz); + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); return ret; } -- cgit From a090931524d03a4975c84b89a32210420d852313 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 12 Apr 2022 11:05:19 +0800 Subject: ACPI: APEI: Fix missing ERST record id Read a record is cleared by others, but the deleted record cache entry is still created by erst_get_record_id_next. When next enumerate the records, get the cached deleted record, then erst_read() return -ENOENT and try to get next record, loop back to first ID will return 0 in function __erst_record_id_cache_add_one and then set record_id as APEI_ERST_INVALID_RECORD_ID, finished this time read operation. It will result in read the records just in the cache hereafter. This patch cleared the deleted record cache, fix the issue that "./erst-inject -p" shows record counts not equal to "./erst-inject -n". A reproducer of the problem(retry many times): [root@localhost erst-inject]# ./erst-inject -c 0xaaaaa00011 [root@localhost erst-inject]# ./erst-inject -p rc: 273 rcd sig: CPER rcd id: 0xaaaaa00012 rc: 273 rcd sig: CPER rcd id: 0xaaaaa00013 rc: 273 rcd sig: CPER rcd id: 0xaaaaa00014 [root@localhost erst-inject]# ./erst-inject -i 0xaaaaa000006 [root@localhost erst-inject]# ./erst-inject -i 0xaaaaa000007 [root@localhost erst-inject]# ./erst-inject -i 0xaaaaa000008 [root@localhost erst-inject]# ./erst-inject -p rc: 273 rcd sig: CPER rcd id: 0xaaaaa00012 rc: 273 rcd sig: CPER rcd id: 0xaaaaa00013 rc: 273 rcd sig: CPER rcd id: 0xaaaaa00014 [root@localhost erst-inject]# ./erst-inject -n total error record count: 6 Signed-off-by: Liu Xinpeng Reviewed-by: Tony Luck Signed-off-by: Rafael J. 
Wysocki --- arch/x86/kernel/cpu/mce/apei.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 0e3ae64d3b76..717192915f28 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -177,16 +177,14 @@ retry: /* no more record */ if (*record_id == APEI_ERST_INVALID_RECORD_ID) goto out; - rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd), + &CPER_CREATOR_MCE); /* someone else has cleared the record, try next one */ if (rc == -ENOENT) goto retry; else if (rc < 0) goto out; - /* try to skip other type records in storage */ - else if (rc != sizeof(rcd) || - !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE)) - goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); rc = sizeof(*m); out: -- cgit From 4e140f59d285c1ca1e5c81b4c13e27366865bd09 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Jan 2022 23:15:27 +0000 Subject: mm/usercopy: Check kmap addresses properly If you are copying to an address in the kmap region, you may not copy across a page boundary, no matter what the size of the underlying allocation. You can't kmap() a slab page because slab pages always come from low memory. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220110231530.665970-2-willy@infradead.org --- arch/x86/include/asm/highmem.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 032e020853aa..731ee7cc40a5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -26,6 +26,7 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -- cgit From dbb5ab6d2c0a7397d77c9f60fe23844d4fe4e634 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 14 Apr 2022 14:21:10 +0800 Subject: x86/process: Fix kernel-doc warning due to a changed function name Fix the following scripts/kernel-doc warning: arch/x86/kernel/process.c:412: warning: expecting prototype for tss_update_io_bitmap(). Prototype was for native_tss_update_io_bitmap() instead. [ bp: Massage. ] Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220414062110.60343-1-jiapeng.chong@linux.alibaba.com --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..b3d2d414cfad 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -405,7 +405,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) } /** - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { -- cgit From 3a24a60854d2ef19e0edcd11cdbbb4fabc655dde Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 25 Mar 2022 11:39:52 -0400 Subject: x86/32: Remove lazy GS macros GS is always a user segment now. 
Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220325153953.162643-4-brgerst@gmail.com --- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/segment.h | 5 ----- arch/x86/kernel/process.c | 5 +---- arch/x86/kernel/process_32.c | 11 ++++------- arch/x86/kernel/ptrace.c | 6 +++--- arch/x86/kernel/signal.c | 8 +++++--- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/lib/insn-eval.c | 5 +++-- arch/x86/math-emu/get_address.c | 2 +- 9 files changed, 20 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 27516046117a..b8d40ddeab00 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -141,7 +141,7 @@ do { \ #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ - lazy_load_gs(0); \ + loadsegment(gs, 0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 656ed6531d03..617b3663e4dd 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -354,11 +354,6 @@ static inline void __loadsegment_fs(unsigned short value) * x86-32 user GS accessors. This is ugly and could do with some cleaning up. */ #ifdef CONFIG_X86_32 -# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) -# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -# define task_user_gs(tsk) ((tsk)->thread.gs) -# define lazy_save_gs(v) savesegment(gs, (v)) -# define lazy_load_gs(v) loadsegment(gs, (v)) # define load_gs_index(v) loadsegment(gs, (v)) #endif /* X86_32 */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..96a9885b9c3a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -160,6 +160,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@ -191,10 +192,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, if (sp) childregs->sp = sp; -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif - if (unlikely(p->flags & PF_IO_WORKER)) { /* * An IO thread is a user space thread, but it doesn't diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26edb1cd07a4..877358f3dba7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -63,10 +63,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, unsigned long d0, d1, d2, d3, d6, d7; unsigned short gs; - if (user_mode(regs)) - gs = get_user_gs(regs); - else - savesegment(gs, gs); + savesegment(gs, gs); show_ip(regs, log_lvl); @@ -114,7 +111,7 @@ void release_thread(struct task_struct *dead_task) void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - set_user_gs(regs, 0); + loadsegment(gs, 0); regs->fs = 0; regs->ds = __USER_DS; regs->es = __USER_DS; @@ -177,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. 
*/ - lazy_save_gs(prev->gs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -208,7 +205,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - lazy_load_gs(next->gs); + loadsegment(gs, next->gs); this_cpu_write(current_task, next_p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 98d10ef60571..37c12fb92906 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -170,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) - retval = get_user_gs(task_pt_regs(task)); + savesegment(gs, retval); else - retval = task_user_gs(task); + retval = task->thread.gs; } return retval; } @@ -210,7 +210,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - task_user_gs(task) = value; + task->thread.gs = value; } return 0; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e439eb14325f..9c7265b524c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -93,7 +93,7 @@ static bool restore_sigcontext(struct pt_regs *regs, return false; #ifdef CONFIG_X86_32 - set_user_gs(regs, sc.gs); + loadsegment(gs, sc.gs); regs->fs = sc.fs; regs->es = sc.es; regs->ds = sc.ds; @@ -146,8 +146,10 @@ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { #ifdef CONFIG_X86_32 - unsafe_put_user(get_user_gs(regs), - (unsigned int __user *)&sc->gs, Efault); + unsigned int gs; + savesegment(gs, gs); + + unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index c21bcd668284..e9e803a4d44c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -151,7 +151,7 @@ exit_vm86: memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); - lazy_load_gs(vm86->regs32.gs); + loadsegment(gs, vm86->regs32.gs); regs->pt.ax = retval; return; @@ -325,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) * Save old state */ vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(vm86->regs32.gs); + savesegment(gs, vm86->regs32.gs); /* make room for real-mode segments */ preempt_disable(); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index b781d324211b..21104c41cba0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -342,9 +342,9 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff) */ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) { -#ifdef CONFIG_X86_64 unsigned short sel; +#ifdef CONFIG_X86_64 switch (seg_reg_idx) { case INAT_SEG_REG_IGNORE: return 0; @@ -402,7 +402,8 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) case INAT_SEG_REG_FS: return (unsigned short)(regs->fs & 0xffff); case INAT_SEG_REG_GS: - return get_user_gs(regs); + savesegment(gs, sel); + return sel; case INAT_SEG_REG_IGNORE: default: return -EINVAL; diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b82ca14ba718..4a9fd9029a53 100644 --- a/arch/x86/math-emu/get_address.c +++ 
b/arch/x86/math-emu/get_address.c @@ -153,7 +153,7 @@ static long pm_address(u_char FPU_modrm, u_char segment, switch (segment) { case PREFIX_GS_ - 1: /* user gs handling can be lazy, use special accessors */ - addr->selector = get_user_gs(FPU_info->regs); + savesegment(gs, addr->selector); break; default: addr->selector = PM_REG_(segment); -- cgit From 203d8919a9eda5d1bc68ac3cd7637588334c9dc1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 25 Mar 2022 11:39:53 -0400 Subject: x86/asm: Merge load_gs_index() Merge the 32- and 64-bit implementations of load_gs_index(). Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220325153953.162643-5-brgerst@gmail.com --- arch/x86/include/asm/segment.h | 7 ------- arch/x86/include/asm/special_insns.h | 7 ++++--- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 617b3663e4dd..2e7890dd58a4 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -350,13 +350,6 @@ static inline void __loadsegment_fs(unsigned short value) #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") -/* - * x86-32 user GS accessors. This is ugly and could do with some cleaning up. - */ -#ifdef CONFIG_X86_32 -# define load_gs_index(v) loadsegment(gs, (v)) -#endif /* X86_32 */ - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 68c257a3de0d..45b18eb94fa1 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -184,14 +184,15 @@ static inline void wbinvd(void) native_wbinvd(); } -#ifdef CONFIG_X86_64 static inline void load_gs_index(unsigned int selector) { +#ifdef CONFIG_X86_64 native_load_gs_index(selector); -} - +#else + loadsegment(gs, selector); #endif +} #endif /* CONFIG_PARAVIRT_XXL */ -- cgit From f52ba93190457aa285ae805a3e8360a50552cfd8 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Fri, 20 Aug 2021 17:42:43 +0530 Subject: tools/power turbostat: Add Power Limit4 support Add Power Limit4 support. Signed-off-by: Sumeet Pawnikar Acked-by: Zhang Rui Signed-off-by: Len Brown --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a4a39c3e0f19..b890840e0059 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -310,6 +310,7 @@ /* Run Time Average Power Limiting (RAPL) Interface */ +#define MSR_VR_CURRENT_CONFIG 0x00000601 #define MSR_RAPL_POWER_UNIT 0x00000606 #define MSR_PKG_POWER_LIMIT 0x00000610 -- cgit From 5dc91f2d4f3c160199fea9421d6b08f67a906947 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Feb 2022 20:24:30 +0100 Subject: x86/boot: Add an efi.h header for the decompressor Copy the needed symbols only and remove the kernel proper includes. No functional changes. 
Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/YlCKWhMJEMUgJmjF@zn.tnic --- arch/x86/boot/compressed/acpi.c | 3 +- arch/x86/boot/compressed/efi.c | 2 - arch/x86/boot/compressed/efi.h | 126 ++++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/kaslr.c | 3 +- arch/x86/boot/compressed/misc.h | 3 +- arch/x86/boot/compressed/pgtable_64.c | 3 +- 6 files changed, 131 insertions(+), 9 deletions(-) create mode 100644 arch/x86/boot/compressed/efi.h (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 64b172dabd5c..9caf89063e77 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -3,10 +3,9 @@ #include "misc.h" #include "error.h" #include "../string.h" +#include "efi.h" #include -#include -#include /* * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0' diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index 09fa3b5d70b8..6edd034b0b30 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -6,8 +6,6 @@ */ #include "misc.h" -#include -#include /** * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h new file mode 100644 index 000000000000..7db2f41b54cd --- /dev/null +++ b/arch/x86/boot/compressed/efi.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_EFI_H +#define BOOT_COMPRESSED_EFI_H + +#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H) +#error Please do not include kernel proper namespace headers +#endif + +typedef guid_t efi_guid_t __aligned(__alignof__(u32)); + +#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \ + (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, d } } + +#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) +#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) +#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) + +#define EFI32_LOADER_SIGNATURE "EL32" +#define EFI64_LOADER_SIGNATURE "EL64" + +/* + * Generic EFI table header + */ +typedef struct { + u64 signature; + u32 revision; + u32 headersize; + u32 crc32; + u32 reserved; +} efi_table_hdr_t; + +#define EFI_CONVENTIONAL_MEMORY 7 + +#define EFI_MEMORY_MORE_RELIABLE \ + ((u64)0x0000000000010000ULL) /* higher reliability */ +#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ + +#define EFI_PAGE_SHIFT 12 + +typedef struct { + u32 type; + u32 pad; + u64 phys_addr; + u64 virt_addr; + u64 num_pages; + u64 attribute; +} efi_memory_desc_t; + +#define efi_early_memdesc_ptr(map, desc_size, n) \ + (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size))) + +typedef struct { + efi_guid_t guid; + u64 table; +} efi_config_table_64_t; + +typedef struct { + efi_guid_t guid; + u32 table; +} efi_config_table_32_t; + +typedef struct { + efi_table_hdr_t hdr; + u64 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 __pad1; + u64 con_in_handle; + u64 con_in; + u64 con_out_handle; + u64 con_out; + u64 stderr_handle; + u64 stderr; + u64 runtime; + u64 boottime; + u32 nr_tables; + u32 __pad2; + u64 tables; +} efi_system_table_64_t; + +typedef struct { + efi_table_hdr_t hdr; + u32 fw_vendor; /* physical addr of CHAR16 vendor 
string */ + u32 fw_revision; + u32 con_in_handle; + u32 con_in; + u32 con_out_handle; + u32 con_out; + u32 stderr_handle; + u32 stderr; + u32 runtime; + u32 boottime; + u32 nr_tables; + u32 tables; +} efi_system_table_32_t; + +/* kexec external ABI */ +struct efi_setup_data { + u64 fw_vendor; + u64 __unused; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) +{ + return memcmp(&left, &right, sizeof (efi_guid_t)); +} + +#ifdef CONFIG_EFI +bool __pure __efi_soft_reserve_enabled(void); + +static inline bool __pure efi_soft_reserve_enabled(void) +{ + return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) + && __efi_soft_reserve_enabled(); +} +#else +static inline bool efi_soft_reserve_enabled(void) +{ + return false; +} +#endif /* CONFIG_EFI */ +#endif /* BOOT_COMPRESSED_EFI_H */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 411b268bc0a2..4a3f223973f4 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -22,15 +22,14 @@ #include "misc.h" #include "error.h" #include "../string.h" +#include "efi.h" #include #include #include #include #include -#include #include -#include #define _SETUP #include /* For COMMAND_LINE_SIZE */ diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 75d284ec763f..4ca2857ea041 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,8 @@ #define BOOT_BOOT_H #include "../ctype.h" +#include "efi.h" + #ifdef CONFIG_X86_64 #define memptr long #else diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index a1733319a22a..2ac12ff4111b 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,11 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" -#include #include #include -#include #include "pgtable.h" #include "../string.h" +#include "efi.h" #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ -- cgit From 0d5ffd9a256d8995764f9d4a35a8c3917839d169 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:07:28 +0100 Subject: swiotlb: rename swiotlb_late_init_with_default_size swiotlb_late_init_with_default_size is an overly verbose name that doesn't even catch what the function is doing, given that the size is not just a default but the actual requested size. Rename it to swiotlb_init_late. 
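(A rough before/after of the renamed entry point, reconstructed from the commit message and the call site in the hunk below; the parameter and return types are assumptions, not copied from the patch:)

	int swiotlb_late_init_with_default_size(size_t default_size);	/* old name */
	int swiotlb_init_late(size_t size);				/* new name */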
Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/pci/sta2x11-fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 101081ad64b6..e0c039a75b2d 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev) int size = STA2X11_SWIOTLB_SIZE; /* First instance: register your own swiotlb area */ dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_late_init_with_default_size(size)) + if (swiotlb_init_late(size)) dev_emerg(&pdev->dev, "init swiotlb failed\n"); } list_add(&instance->list, &sta2x11_instance_list); -- cgit From 78013eaadf696d2105982abb4018fbae394ca08f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 14:11:44 +0100 Subject: x86: remove the IOMMU table infrastructure The IOMMU table tries to separate the different IOMMUs into different backends, but actually requires various cross calls. Rewrite the code to do the generic swiotlb/swiotlb-xen setup directly in pci-dma.c and then just call into the IOMMU drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/include/asm/dma-mapping.h | 1 - arch/x86/include/asm/gart.h | 5 +- arch/x86/include/asm/iommu.h | 6 ++ arch/x86/include/asm/iommu_table.h | 102 ------------------------------- arch/x86/include/asm/swiotlb.h | 30 --------- arch/x86/include/asm/xen/swiotlb-xen.h | 2 - arch/x86/kernel/Makefile | 2 - arch/x86/kernel/amd_gart_64.c | 5 +- arch/x86/kernel/aperture_64.c | 14 ++--- arch/x86/kernel/pci-dma.c | 107 ++++++++++++++++++++++++++------- arch/x86/kernel/pci-iommu_table.c | 77 ------------------------ arch/x86/kernel/pci-swiotlb.c | 77 ------------------------ arch/x86/kernel/tboot.c | 1 - arch/x86/kernel/vmlinux.lds.S | 12 ---- arch/x86/xen/Makefile | 2 - arch/x86/xen/pci-swiotlb-xen.c | 96 ----------------------------- 16 files changed, 100 insertions(+), 439 deletions(-) delete mode 100644 arch/x86/include/asm/iommu_table.h delete mode 100644 arch/x86/include/asm/swiotlb.h delete mode 100644 arch/x86/kernel/pci-iommu_table.c delete mode 100644 arch/x86/kernel/pci-swiotlb.c delete mode 100644 arch/x86/xen/pci-swiotlb-xen.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index bb1654fe0ce7..256fd8115223 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -9,7 +9,6 @@ #include #include -#include extern int iommu_merge; extern int panic_on_overflow; diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 318556574345..5af8088a10df 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -38,7 +38,7 @@ extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); -extern int gart_iommu_hole_init(void); +void gart_iommu_hole_init(void); #else #define gart_iommu_aperture 0 @@ -51,9 +51,8 @@ static inline void early_gart_iommu_check(void) static inline void gart_parse_options(char *options) { } -static inline int gart_iommu_hole_init(void) +static inline void gart_iommu_hole_init(void) { - return -ENODEV; } #endif diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 
bf1ed2ddc74b..dba89ed40d38 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -9,6 +9,12 @@ extern int force_iommu, no_iommu; extern int iommu_detected; +#ifdef CONFIG_SWIOTLB +extern bool x86_swiotlb_enable; +#else +#define x86_swiotlb_enable false +#endif + /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h deleted file mode 100644 index 1fb3fd1a83c2..000000000000 --- a/arch/x86/include/asm/iommu_table.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_IOMMU_TABLE_H -#define _ASM_X86_IOMMU_TABLE_H - -#include - -/* - * History lesson: - * The execution chain of IOMMUs in 2.6.36 looks as so: - * - * [xen-swiotlb] - * | - * +----[swiotlb *]--+ - * / | \ - * / | \ - * [GART] [Calgary] [Intel VT-d] - * / - * / - * [AMD-Vi] - * - * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip - * over the rest of IOMMUs and unconditionally initialize the SWIOTLB. - * Also it would surreptitiously initialize set the swiotlb=1 if there were - * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb - * flag would be turned off by all IOMMUs except the Calgary one. - * - * The IOMMU_INIT* macros allow a similar tree (or more complex if desired) - * to be built by defining who we depend on. - * - * And all that needs to be done is to use one of the macros in the IOMMU - * and the pci-dma.c will take care of the rest. - */ - -struct iommu_table_entry { - initcall_t detect; - initcall_t depend; - void (*early_init)(void); /* No memory allocate available. */ - void (*late_init)(void); /* Yes, can allocate memory. */ -#define IOMMU_FINISH_IF_DETECTED (1<<0) -#define IOMMU_DETECTED (1<<1) - int flags; -}; -/* - * Macro fills out an entry in the .iommu_table that is equivalent - * to the fields that 'struct iommu_table_entry' has. The entries - * that are put in the .iommu_table section are not put in any order - * hence during boot-time we will have to resort them based on - * dependency. */ - - -#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ - static const struct iommu_table_entry \ - __iommu_entry_##_detect __used \ - __attribute__ ((unused, __section__(".iommu_table"), \ - aligned((sizeof(void *))))) \ - = {_detect, _depend, _early_init, _late_init, \ - _finish ? IOMMU_FINISH_IF_DETECTED : 0} -/* - * The simplest IOMMU definition. Provide the detection routine - * and it will be run after the SWIOTLB and the other IOMMUs - * that utilize this macro. If the IOMMU is detected (ie, the - * detect routine returns a positive value), the other IOMMUs - * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer - * to stop detecting the other IOMMUs after yours has been detected. - */ -#define IOMMU_INIT_POST(_detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 0) - -#define IOMMU_INIT_POST_FINISH(detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 1) - -/* - * A more sophisticated version of IOMMU_INIT. This variant requires: - * a). A detection routine function. - * b). The name of the detection routine we depend on to get called - * before us. - * c). The init routine which gets called if the detection routine - * returns a positive value from the pci_iommu_alloc. This means - * no presence of a memory allocator. - * d). Similar to the 'init', except that this gets called from pci_iommu_init - * where we do have a memory allocator. 
- * - * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant - * in that the former will continue detecting other IOMMUs in the call - * list after the detection routine returns a positive number, while the - * latter will stop the execution chain upon first successful detection. - * Both variants will still call the 'init' and 'late_init' functions if - * they are set. - */ -#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ - __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) - -#define IOMMU_INIT(_detect, _depend, _init, _late_init) \ - __IOMMU_INIT(_detect, _depend, _init, _late_init, 0) - -void sort_iommu_table(struct iommu_table_entry *start, - struct iommu_table_entry *finish); - -void check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish); - -#endif /* _ASM_X86_IOMMU_TABLE_H */ diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h deleted file mode 100644 index ff6c92eff035..000000000000 --- a/arch/x86/include/asm/swiotlb.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_SWIOTLB_H -#define _ASM_X86_SWIOTLB_H - -#include - -#ifdef CONFIG_SWIOTLB -extern int swiotlb; -extern int __init pci_swiotlb_detect_override(void); -extern int __init pci_swiotlb_detect_4gb(void); -extern void __init pci_swiotlb_init(void); -extern void __init pci_swiotlb_late_init(void); -#else -#define swiotlb 0 -static inline int pci_swiotlb_detect_override(void) -{ - return 0; -} -static inline int pci_swiotlb_detect_4gb(void) -{ - return 0; -} -static inline void pci_swiotlb_init(void) -{ -} -static inline void pci_swiotlb_late_init(void) -{ -} -#endif -#endif /* _ASM_X86_SWIOTLB_H */ diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 66b4ddde7743..e5a90b42e4dd 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -3,10 +3,8 @@ #define _ASM_X86_SWIOTLB_XEN_H #ifdef CONFIG_SWIOTLB_XEN -extern int __init pci_xen_swiotlb_detect(void); extern int pci_xen_swiotlb_init_late(void); #else -#define pci_xen_swiotlb_detect NULL static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c41ef42adbe8..e17b7e92a3fa 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -68,7 +68,6 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o -obj-y += pci-iommu_table.o obj-y += resource.o obj-y += irqflags.o obj-y += static_call.o @@ -134,7 +133,6 @@ obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index ed837383de5c..194d54eed537 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -38,11 +38,9 @@ #include #include #include -#include #include #include #include -#include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -808,7 +806,7 @@ int __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; - swiotlb = 0; + x86_swiotlb_enable = false; return 0; } @@ -842,4 +840,3 @@ void __init 
gart_parse_options(char *p) } } } -IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index af3ba08b684b..7a5630d904b2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -392,7 +392,7 @@ void __init early_gart_iommu_check(void) static int __initdata printed_gart_size_msg; -int __init gart_iommu_hole_init(void) +void __init gart_iommu_hole_init(void) { u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; @@ -401,11 +401,11 @@ int __init gart_iommu_hole_init(void) int i, node; if (!amd_gart_present()) - return -ENODEV; + return; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) - return -ENODEV; + return; pr_info("Checking aperture...\n"); @@ -491,10 +491,8 @@ out: * and fixed up the northbridge */ exclude_from_core(last_aper_base, last_aper_order); - - return 1; } - return 0; + return; } if (!fallback_aper_force) { @@ -527,7 +525,7 @@ out: panic("Not enough memory for aperture"); } } else { - return 0; + return; } /* @@ -561,6 +559,4 @@ out: } set_up_gart_resume(aper_order, aper_alloc); - - return 1; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de234e7a8962..df96926421be 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -7,13 +7,16 @@ #include #include #include +#include #include #include #include #include #include -#include + +#include +#include static bool disable_dac_quirk __read_mostly; @@ -34,24 +37,83 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; +#ifdef CONFIG_SWIOTLB +bool x86_swiotlb_enable; + +static void __init pci_swiotlb_detect(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) + x86_swiotlb_enable = true; + + /* + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. + */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + x86_swiotlb_enable = true; + + if (swiotlb_force == SWIOTLB_FORCE) + x86_swiotlb_enable = true; +} +#else +static inline void __init pci_swiotlb_detect(void) +{ +} +#endif /* CONFIG_SWIOTLB */ + +#ifdef CONFIG_SWIOTLB_XEN +static bool xen_swiotlb; + +static void __init pci_xen_swiotlb_init(void) +{ + if (!xen_initial_domain() && !x86_swiotlb_enable && + swiotlb_force != SWIOTLB_FORCE) + return; + x86_swiotlb_enable = true; + xen_swiotlb = true; + xen_swiotlb_init_early(); + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); +} + +int pci_xen_swiotlb_init_late(void) +{ + int rc; + + if (xen_swiotlb) + return 0; + + rc = xen_swiotlb_init(); + if (rc) + return rc; + + /* XXX: this switches the dma ops under live devices! 
*/ + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); +#else +static inline void __init pci_xen_swiotlb_init(void) +{ +} +#endif /* CONFIG_SWIOTLB_XEN */ void __init pci_iommu_alloc(void) { - struct iommu_table_entry *p; - - sort_iommu_table(__iommu_table, __iommu_table_end); - check_iommu_entries(__iommu_table, __iommu_table_end); - - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && p->detect && p->detect() > 0) { - p->flags |= IOMMU_DETECTED; - if (p->early_init) - p->early_init(); - if (p->flags & IOMMU_FINISH_IF_DETECTED) - break; - } + if (xen_pv_domain()) { + pci_xen_swiotlb_init(); + return; } + pci_swiotlb_detect(); + gart_iommu_hole_init(); + amd_iommu_detect(); + detect_intel_iommu(); + if (x86_swiotlb_enable) + swiotlb_init(0); } /* @@ -102,7 +164,7 @@ static __init int iommu_setup(char *p) } #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) - swiotlb = 1; + x86_swiotlb_enable = true; #endif if (!strncmp(p, "pt", 2)) iommu_set_default_passthrough(true); @@ -121,14 +183,17 @@ early_param("iommu", iommu_setup); static int __init pci_iommu_init(void) { - struct iommu_table_entry *p; - x86_init.iommu.iommu_init(); - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && (p->flags & IOMMU_DETECTED) && p->late_init) - p->late_init(); +#ifdef CONFIG_SWIOTLB + /* An IOMMU turned us off. */ + if (x86_swiotlb_enable) { + pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else { + swiotlb_exit(); } +#endif return 0; } diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c deleted file mode 100644 index 42e92ec62973..000000000000 --- a/arch/x86/kernel/pci-iommu_table.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -static struct iommu_table_entry * __init -find_dependents_of(struct iommu_table_entry *start, - struct iommu_table_entry *finish, - struct iommu_table_entry *q) -{ - struct iommu_table_entry *p; - - if (!q) - return NULL; - - for (p = start; p < finish; p++) - if (p->detect == q->depend) - return p; - - return NULL; -} - - -void __init sort_iommu_table(struct iommu_table_entry *start, - struct iommu_table_entry *finish) { - - struct iommu_table_entry *p, *q, tmp; - - for (p = start; p < finish; p++) { -again: - q = find_dependents_of(start, finish, p); - /* We are bit sneaky here. We use the memory address to figure - * out if the node we depend on is past our point, if so, swap. - */ - if (q > p) { - tmp = *p; - memmove(p, q, sizeof(*p)); - *q = tmp; - goto again; - } - } - -} - -#ifdef DEBUG -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ - struct iommu_table_entry *p, *q, *x; - - /* Simple cyclic dependency checker. */ - for (p = start; p < finish; p++) { - q = find_dependents_of(start, finish, p); - x = find_dependents_of(start, finish, q); - if (p == x) { - printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", - p->detect, q->detect); - /* Heavy handed way..*/ - x->depend = NULL; - } - } - - for (p = start; p < finish; p++) { - q = find_dependents_of(p, finish, p); - if (q && q > p) { - printk(KERN_ERR "EXECUTION ORDER INVALID! 
%pS should be called before %pS!\n", - p->detect, q->detect); - } - } -} -#else -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ -} -#endif diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c deleted file mode 100644 index 814ab46a0dad..000000000000 --- a/arch/x86/kernel/pci-swiotlb.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -int swiotlb __read_mostly; - -/* - * pci_swiotlb_detect_override - set swiotlb to 1 if necessary - * - * This returns non-zero if we are forced to use swiotlb (by the boot - * option). - */ -int __init pci_swiotlb_detect_override(void) -{ - if (swiotlb_force == SWIOTLB_FORCE) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT_FINISH(pci_swiotlb_detect_override, - pci_xen_swiotlb_detect, - pci_swiotlb_init, - pci_swiotlb_late_init); - -/* - * If 4GB or more detected (and iommu=off not set) or if SME is active - * then set swiotlb to 1 and return 1. - */ -int __init pci_swiotlb_detect_4gb(void) -{ - /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) - swiotlb = 1; - - /* - * Set swiotlb to 1 so that bounce buffers are allocated and used for - * devices that can't support DMA to encrypted memory. - */ - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT(pci_swiotlb_detect_4gb, - pci_swiotlb_detect_override, - pci_swiotlb_init, - pci_swiotlb_late_init); - -void __init pci_swiotlb_init(void) -{ - if (swiotlb) - swiotlb_init(0); -} - -void __init pci_swiotlb_late_init(void) -{ - /* An IOMMU turned us off. */ - if (!swiotlb) - swiotlb_exit(); - else { - printk(KERN_INFO "PCI-DMA: " - "Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_print_info(); - } -} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index f9af561c3cd4..0c1154a1c403 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 7fda7f27e762..f5f6dc2e8007 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -315,18 +315,6 @@ SECTIONS *(.altinstr_replacement) } - /* - * struct iommu_table_entry entries are injected in this section. - * It is an array of IOMMUs which during run time gets sorted depending - * on its dependency order. After rootfs_initcall is complete - * this section can be safely removed. - */ - .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { - __iommu_table = .; - *(.iommu_table) - __iommu_table_end = .; - } - . 
= ALIGN(8); .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { __apicdrivers = .; diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4953260e281c..3c5b52fbe4a7 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -47,6 +47,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_PV_DOM0) += vga.o -obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o - obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c deleted file mode 100644 index 46df59aeaa06..000000000000 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* Glue code to lib/swiotlb-xen.c */ - -#include -#include -#include - -#include -#include -#include - - -#include -#ifdef CONFIG_X86_64 -#include -#include -#endif -#include - -static int xen_swiotlb __read_mostly; - -/* - * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary - * - * This returns non-zero if we are forced to use xen_swiotlb (by the boot - * option). - */ -int __init pci_xen_swiotlb_detect(void) -{ - - if (!xen_pv_domain()) - return 0; - - /* If running as PV guest, either iommu=soft, or swiotlb=force will - * activate this IOMMU. If running as PV privileged, activate it - * irregardless. - */ - if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE) - xen_swiotlb = 1; - - /* If we are running under Xen, we MUST disable the native SWIOTLB. - * Don't worry about swiotlb_force flag activating the native, as - * the 'swiotlb' flag is the only one turning it on. */ - swiotlb = 0; - -#ifdef CONFIG_X86_64 - /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0 - * (so no iommu=X command line over-writes). - * Considering that PV guests do not want the *native SWIOTLB* but - * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here. - */ - if (max_pfn > MAX_DMA32_PFN) - no_iommu = 1; -#endif - return xen_swiotlb; -} - -static void __init pci_xen_swiotlb_init(void) -{ - if (xen_swiotlb) { - xen_swiotlb_init_early(); - dma_ops = &xen_swiotlb_dma_ops; - -#ifdef CONFIG_PCI - /* Make sure ACS will be enabled */ - pci_request_acs(); -#endif - } -} - -int pci_xen_swiotlb_init_late(void) -{ - int rc; - - if (xen_swiotlb) - return 0; - - rc = xen_swiotlb_init(); - if (rc) - return rc; - - dma_ops = &xen_swiotlb_dma_ops; -#ifdef CONFIG_PCI - /* Make sure ACS will be enabled */ - pci_request_acs(); -#endif - - return 0; -} -EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); - -IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, - NULL, - pci_xen_swiotlb_init, - NULL); -- cgit From a3e230926708125205ffd06d3dc2175a8263ae7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Mar 2022 17:25:54 +0200 Subject: x86: centralize setting SWIOTLB_FORCE when guest memory encryption is enabled Move enabling SWIOTLB_FORCE for guest memory encryption into common code. 
Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/kernel/cpu/mshyperv.c | 8 -------- arch/x86/kernel/pci-dma.c | 8 ++++++++ arch/x86/mm/mem_encrypt_amd.c | 3 --- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 4b67094215bb..5b8f2c357160 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -337,14 +337,6 @@ static void __init ms_hyperv_init_platform(void) swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary; #endif } - -#ifdef CONFIG_SWIOTLB - /* - * Enable swiotlb force mode in Isolation VM to - * use swiotlb bounce buffer for dma transaction. - */ - swiotlb_force = SWIOTLB_FORCE; -#endif /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index df96926421be..04140e20ef1a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -53,6 +53,14 @@ static void __init pci_swiotlb_detect(void) if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) x86_swiotlb_enable = true; + /* + * Guest with guest memory encryption currently perform all DMA through + * bounce buffers as the hypervisor can't access arbitrary VM memory + * that is not explicitly shared with it. + */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + swiotlb_force = SWIOTLB_FORCE; + if (swiotlb_force == SWIOTLB_FORCE) x86_swiotlb_enable = true; } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 6169053c2854..d732d727d3de 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -432,9 +432,6 @@ void __init sme_early_init(void) for (i = 0; i < ARRAY_SIZE(protection_map); i++) protection_map[i] = pgprot_encrypted(protection_map[i]); - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - swiotlb_force = SWIOTLB_FORCE; - x86_platform.guest.enc_status_change_prepare = amd_enc_status_change_prepare; x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish; x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required; -- cgit From c6af2aa9ffc9763826607bc2664ef3ea4475ed18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Mar 2022 17:27:33 +0200 Subject: swiotlb: make the swiotlb_init interface more useful Pass a boolean flag to indicate if swiotlb needs to be enabled based on the addressing needs, and replace the verbose argument with a set of flags, including one to force enable bounce buffering. Note that this patch removes the possibility to force xen-swiotlb use with the swiotlb=force parameter on the command line on x86 (arm and arm64 never supported that), but this interface will be restored shortly. 
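(Sketch of the reworked entry point as consumed by the x86 hunk below; SWIOTLB_FORCE and SWIOTLB_ANY appear in the surrounding diffs, while the exact prototype and the SWIOTLB_VERBOSE flag are stated from memory and should be treated as assumptions:)

	/* Assumed new shape: the bool says "swiotlb is needed for addressing
	 * limits", the flags word replaces the old verbose argument. */
	void __init swiotlb_init(bool addressing_limit, unsigned int flags);

	/* Typical x86 call, matching the hunk below: */
	swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);	/* flags may include SWIOTLB_FORCE */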
Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/kernel/pci-dma.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 04140e20ef1a..a705a199bf8a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -39,6 +39,7 @@ int iommu_detected __read_mostly = 0; #ifdef CONFIG_SWIOTLB bool x86_swiotlb_enable; +static unsigned int x86_swiotlb_flags; static void __init pci_swiotlb_detect(void) { @@ -58,16 +59,16 @@ static void __init pci_swiotlb_detect(void) * bounce buffers as the hypervisor can't access arbitrary VM memory * that is not explicitly shared with it. */ - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - swiotlb_force = SWIOTLB_FORCE; - - if (swiotlb_force == SWIOTLB_FORCE) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_FORCE; + } } #else static inline void __init pci_swiotlb_detect(void) { } +#define x86_swiotlb_flags 0 #endif /* CONFIG_SWIOTLB */ #ifdef CONFIG_SWIOTLB_XEN @@ -75,8 +76,7 @@ static bool xen_swiotlb; static void __init pci_xen_swiotlb_init(void) { - if (!xen_initial_domain() && !x86_swiotlb_enable && - swiotlb_force != SWIOTLB_FORCE) + if (!xen_initial_domain() && !x86_swiotlb_enable) return; x86_swiotlb_enable = true; xen_swiotlb = true; @@ -120,8 +120,7 @@ void __init pci_iommu_alloc(void) gart_iommu_hole_init(); amd_iommu_detect(); detect_intel_iommu(); - if (x86_swiotlb_enable) - swiotlb_init(0); + swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); } /* -- cgit From 742519538e6b07250c8085bbff4bd358bc03bf16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:12:59 +0100 Subject: swiotlb: pass a gfp_mask argument to swiotlb_init_late Let the caller chose a zone to allocate from. This will be used later on by the xen-swiotlb initialization on arm. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/pci/sta2x11-fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index e0c039a75b2d..c7e6faf59a86 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev) int size = STA2X11_SWIOTLB_SIZE; /* First instance: register your own swiotlb area */ dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size)) + if (swiotlb_init_late(size, GFP_DMA)) dev_emerg(&pdev->dev, "init swiotlb failed\n"); } list_add(&instance->list, &sta2x11_instance_list); -- cgit From 7374153d294eb51de5a81ac38ff1c4fef8927bec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Mar 2022 08:02:57 +0100 Subject: swiotlb: provide swiotlb_init variants that remap the buffer To shared more code between swiotlb and xen-swiotlb, offer a swiotlb_init_remap interface and add a remap callback to swiotlb_init_late that will allow Xen to remap the buffer without duplicating much of the logic. 
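(Assumed shape of the two variants this commit introduces, reconstructed from the callers visible in the following hunks, which pass xen_swiotlb_fixup or NULL as the remap hook; the callback signature is an inference, not a quote from the patch:)

	void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
				       int (*remap)(void *tlb, unsigned long nslabs));
	int swiotlb_init_late(size_t size, gfp_t gfp_mask,
			      int (*remap)(void *tlb, unsigned long nslabs));
	/* When remap is non-NULL, the freshly allocated bounce buffer is handed
	 * to the callback (e.g. xen_swiotlb_fixup) before it is put to use. */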
Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/pci/sta2x11-fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index c7e6faf59a86..7368afc03998 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev) int size = STA2X11_SWIOTLB_SIZE; /* First instance: register your own swiotlb area */ dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size, GFP_DMA)) + if (swiotlb_init_late(size, GFP_DMA, NULL)) dev_emerg(&pdev->dev, "init swiotlb failed\n"); } list_add(&instance->list, &sta2x11_instance_list); -- cgit From 3f70356edf5611c28a68d8d5a9c2b442c9eb81e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Mar 2022 07:58:45 +0100 Subject: swiotlb: merge swiotlb-xen initialization into swiotlb Reuse the generic swiotlb initialization for xen-swiotlb. For ARM/ARM64 this works trivially, while for x86 xen_swiotlb_fixup needs to be passed as the remap argument to swiotlb_init_remap/swiotlb_init_late. Note that the lower bound of the swiotlb size is changed to the smaller IO_TLB_MIN_SLABS based value with this patch, but that is fine as the 2MB value used in Xen before was just an optimization and is not the hard lower bound. Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/include/asm/xen/page.h | 5 ----- arch/x86/kernel/pci-dma.c | 20 ++++++++++---------- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e989bc2269f5..1fc67df50014 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -357,9 +357,4 @@ static inline bool xen_arch_need_swiotlb(struct device *dev, return false; } -static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order) -{ - return __get_free_pages(__GFP_NOWARN, order); -} - #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a705a199bf8a..30bbe4abb5d6 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -72,15 +72,13 @@ static inline void __init pci_swiotlb_detect(void) #endif /* CONFIG_SWIOTLB */ #ifdef CONFIG_SWIOTLB_XEN -static bool xen_swiotlb; - static void __init pci_xen_swiotlb_init(void) { if (!xen_initial_domain() && !x86_swiotlb_enable) return; x86_swiotlb_enable = true; - xen_swiotlb = true; - xen_swiotlb_init_early(); + x86_swiotlb_flags |= SWIOTLB_ANY; + swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup); dma_ops = &xen_swiotlb_dma_ops; if (IS_ENABLED(CONFIG_PCI)) pci_request_acs(); @@ -88,14 +86,16 @@ static void __init pci_xen_swiotlb_init(void) int pci_xen_swiotlb_init_late(void) { - int rc; - - if (xen_swiotlb) + if (dma_ops == &xen_swiotlb_dma_ops) return 0; - rc = xen_swiotlb_init(); - if (rc) - return rc; + /* we can work with the default swiotlb */ + if (!io_tlb_default_mem.nslabs) { + int rc = swiotlb_init_late(swiotlb_size_or_default(), + GFP_KERNEL, xen_swiotlb_fixup); + if (rc < 0) + return rc; + } /* XXX: this switches the dma ops under live devices! 
*/ dma_ops = &xen_swiotlb_dma_ops; -- cgit From 3cb4503a330159dc5cf2f8382181ccbabbbaa5b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Feb 2022 15:10:54 +0100 Subject: x86: remove cruft from <asm/dma-mapping.h> <asm/dma-mapping.h> gets pulled in by all drivers using the DMA API. Remove x86 internal variables and unnecessary includes from it. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/include/asm/dma-mapping.h | 11 ----------- arch/x86/include/asm/iommu.h | 2 ++ 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 256fd8115223..1c66708e3062 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -2,17 +2,6 @@ #ifndef _ASM_X86_DMA_MAPPING_H #define _ASM_X86_DMA_MAPPING_H -/* - * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and - * Documentation/core-api/dma-api.rst for documentation. - */ - -#include -#include - -extern int iommu_merge; -extern int panic_on_overflow; - extern const struct dma_map_ops *dma_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index dba89ed40d38..0bef44d30a27 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -8,6 +8,8 @@ extern int force_iommu, no_iommu; extern int iommu_detected; +extern int iommu_merge; +extern int panic_on_overflow; #ifdef CONFIG_SWIOTLB extern bool x86_swiotlb_enable; -- cgit From 51964015565d302fda63ce84ef151e1c9a5939cc Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 18 Apr 2022 11:16:13 -0500 Subject: x86/mm: Fix spacing within memory encryption features message The spacing is off in the memory encryption features message on AMD platforms that support memory encryption, e.g.: "Memory Encryption Features active:AMD SEV SEV-ES" There is no space before "AMD" and two spaces after it. Fix this so that the message is spaced properly: "Memory Encryption Features active: AMD SEV SEV-ES" Fixes: 968b493173ac ("x86/mm: Make DMA memory shared for TD guest") Signed-off-by: Tom Lendacky Signed-off-by: Dave Hansen Acked-by: Kirill A. Shutemov Link: https://lkml.kernel.org/r/02401f3024b18e90bc2508147e22e729436cb6d9.1650298573.git.thomas.lendacky@amd.com --- arch/x86/mm/mem_encrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 10ee40b5204b..1562f5ed8b10 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -49,7 +49,7 @@ static void print_mem_encrypt_feature_info(void) return; } - pr_cont("AMD "); + pr_cont(" AMD"); /* Secure Memory Encryption */ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { -- cgit From 72c3c8d6e5275b19fd2d32ec787e8135a421c7ec Mon Sep 17 00:00:00 2001 From: Matt Atwood Date: Mon, 18 Apr 2022 11:51:57 +0530 Subject: drm/i915/rpl-p: Add PCI IDs Adding initial PCI ids for RPL-P. RPL-P behaves identically to ADL-P from i915's point of view.
Changes since V1 : - SUBPLATFORM ADL_N and RPL_P clash as both are ADLP based - Matthew R Bspec: 55376 Signed-off-by: Matt Atwood Signed-off-by: Madhumitha Tolakanahalli Pradeep Signed-off-by: Tejas Upadhyay [mattrope: Corrected comment formatting to match coding style] Signed-off-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20220418062157.2974665-1-tejaskumarx.surendrakumar.upadhyay@intel.com --- arch/x86/kernel/early-quirks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 805596736e20..a6c1867fc7aa 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -558,6 +558,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_ADLP_IDS(&gen11_early_ops), INTEL_ADLN_IDS(&gen11_early_ops), INTEL_RPLS_IDS(&gen11_early_ops), + INTEL_RPLP_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); -- cgit From 6044d159b5d826259a7397d42fa3ad0bfc4dbd13 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 20 Apr 2022 10:26:13 -0500 Subject: x86/boot: Put globals that are accessed early into the .data section The helpers in arch/x86/boot/compressed/efi.c might be used during early boot to access the EFI system/config tables, and in some cases these EFI helpers might attempt to print debug/error messages, before console_init() has been called. __putstr() checks some variables to avoid printing anything before the console has been initialized, but this isn't enough since those variables live in .bss, which may not have been cleared yet. This can lead to a triple-fault occurring, primarily when booting in legacy/CSM mode (where EFI helpers will attempt to print some debug messages). Fix this by declaring these globals in .data section instead so there is no dependency on .bss being cleared before accessing them. Fixes: c01fce9cef849 ("x86/compressed: Add SEV-SNP feature detection/setup") Reported-by: Borislav Petkov Suggested-by: Thomas Lendacky Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220420152613.145077-1-michael.roth@amd.com --- arch/x86/boot/compressed/early_serial_console.c | 3 ++- arch/x86/boot/compressed/misc.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c index 261e81fb9582..70a8d1706d0f 100644 --- a/arch/x86/boot/compressed/early_serial_console.c +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -1,5 +1,6 @@ #include "misc.h" -int early_serial_base; +/* This might be accessed before .bss is cleared, so use .data instead. */ +int early_serial_base __section(".data"); #include "../early_serial_console.c" diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1cdcaf34ee36..ca6820f99b40 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -53,7 +53,10 @@ memptr free_mem_end_ptr; static char *vidmem; static int vidport; -static int lines, cols; + +/* These might be accessed before .bss is cleared, so use .data instead. 
*/ +static int lines __section(".data"); +static int cols __section(".data"); #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" -- cgit From 2bf93ffbb97e0614cfc431d2ea33b7eae7481eb2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 20 Apr 2022 09:14:13 -0500 Subject: virt: sevguest: Change driver name to reflect generic SEV support During patch review, it was decided the SNP guest driver name should not be SEV-SNP specific, but should be generic for use with anything SEV. However, this feedback was missed and the driver name, and many of the driver functions and structures, are SEV-SNP name specific. Rename the driver to "sev-guest" (to match the misc device that is created) and update some of the function and structure names, too. While in the file, adjust the one pr_err() message to be a dev_err() message so that the message, if issued, uses the driver name. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/307710bb5515c9088a19fd0b930268c7300479b2.1650464054.git.thomas.lendacky@amd.com --- arch/x86/include/asm/sev.h | 2 +- arch/x86/kernel/sev.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9c2d33f1cfee..6e3dda4f82b5 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -95,7 +95,7 @@ struct snp_req_data { unsigned int data_npages; }; -struct snp_guest_platform_data { +struct sev_guest_platform_data { u64 secrets_gpa; }; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index f01f4550e2c6..2fa87a07ab30 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2166,8 +2166,8 @@ e_restore_irq: } EXPORT_SYMBOL_GPL(snp_issue_guest_request); -static struct platform_device guest_req_device = { - .name = "snp-guest", +static struct platform_device sev_guest_device = { + .name = "sev-guest", .id = -1, }; @@ -2197,7 +2197,7 @@ static u64 get_secrets_page(void) static int __init snp_init_platform_device(void) { - struct snp_guest_platform_data data; + struct sev_guest_platform_data data; u64 gpa; if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) @@ -2208,10 +2208,10 @@ static int __init snp_init_platform_device(void) return -ENODEV; data.secrets_gpa = gpa; - if (platform_device_add_data(&guest_req_device, &data, sizeof(data))) + if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) return -ENODEV; - if (platform_device_register(&guest_req_device)) + if (platform_device_register(&sev_guest_device)) return -ENODEV; pr_info("SNP guest platform device initialized.\n"); -- cgit From 5af14c29f7a0e6d1fcee44c4ed4a2d12a49c4a43 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 20 Apr 2022 13:45:49 +0200 Subject: x86/tdx: Annotate a noreturn function objdump complains: vmlinux.o: warning: objtool: __tdx_hypercall()+0x74: unreachable instruction because __tdx_hypercall_failed() won't return but panic the guest. Annotate that that is ok and desired. 
Fixes: eb94f1b6a70a ("x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions") Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220420115025.5448-1-bp@alien8.de --- arch/x86/coco/tdx/tdcall.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index 245888290bb6..eeb4511dc414 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -200,5 +200,6 @@ SYM_FUNC_START(__tdx_hypercall) .Lpanic: call __tdx_hypercall_failed /* __tdx_hypercall_failed never returns */ + REACHABLE jmp .Lpanic SYM_FUNC_END(__tdx_hypercall) -- cgit From 78ed93d72ded679e3caf0758357209887bda885f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 4 Apr 2022 13:12:04 +0200 Subject: signal: Deliver SIGTRAP on perf event asynchronously if blocked With SIGTRAP on perf events, we have encountered termination of processes due to user space attempting to block delivery of SIGTRAP. Consider this case: ... sigset_t s; sigemptyset(&s); sigaddset(&s, SIGTRAP | ); sigprocmask(SIG_BLOCK, &s, ...); ... When the perf event triggers, while SIGTRAP is blocked, force_sig_perf() will force the signal, but revert back to the default handler, thus terminating the task. This makes sense for error conditions, but not so much for explicitly requested monitoring. However, the expectation is still that signals generated by perf events are synchronous, which will no longer be the case if the signal is blocked and delivered later. To give user space the ability to clearly distinguish synchronous from asynchronous signals, introduce siginfo_t::si_perf_flags and TRAP_PERF_FLAG_ASYNC (opted for flags in case more binary information is required in future). The resolution to the problem is then to (a) no longer force the signal (avoiding the terminations), but (b) tell user space via si_perf_flags if the signal was synchronous or not, so that such signals can be handled differently (e.g. let user space decide to ignore or consider the data imprecise). The alternative of making the kernel ignore SIGTRAP on perf events if the signal is blocked may work for some usecases, but likely causes issues in others that then have to revert back to interception of sigprocmask() (which we want to avoid). [ A concrete example: when using breakpoint perf events to track data-flow, in a region of code where signals are blocked, data-flow can no longer be tracked accurately. When a relevant asynchronous signal is received after unblocking the signal, the data-flow tracking logic needs to know its state is imprecise. 
] Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Geert Uytterhoeven Tested-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220404111204.935357-1-elver@google.com --- arch/x86/kernel/signal_compat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index b52407c56000..879ef8c72f5c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -149,8 +149,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); -- cgit From 03f16cd020eb8bb2eb837e2090086f296a9fa91d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:36 -0700 Subject: objtool: Add CONFIG_OBJTOOL Now that stack validation is an optional feature of objtool, add CONFIG_OBJTOOL and replace most usages of CONFIG_STACK_VALIDATION with it. CONFIG_STACK_VALIDATION can now be considered to be frame-pointer specific. CONFIG_UNWINDER_ORC is already inherently valid for live patching, so no need to "validate" it. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/939bf3d85604b2a126412bf11af6e3bd3b872bcb.1650300597.git.jpoimboe@redhat.com --- arch/x86/Kconfig | 18 +++++++++++------- arch/x86/Kconfig.debug | 2 +- arch/x86/include/asm/jump_label.h | 6 +++--- arch/x86/kernel/alternative.c | 6 +++--- 4 files changed, 18 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4bed3abf444d..43e26ee1611e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -188,7 +188,7 @@ config X86 select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT - select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION + select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS @@ -231,6 +231,7 @@ config X86 select HAVE_MOVE_PMD select HAVE_MOVE_PUD select HAVE_NMI + select HAVE_OBJTOOL if X86_64 select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS @@ -239,17 +240,17 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR - select HAVE_STACK_VALIDATION if X86_64 + select HAVE_STACK_VALIDATION if HAVE_OBJTOOL select HAVE_STATIC_CALL - select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION + select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL select 
HAVE_PREEMPT_DYNAMIC_CALL select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS @@ -268,7 +269,6 @@ config X86 select RTC_MC146818_LIB select SPARSE_IRQ select SRCU - select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE) select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT @@ -459,6 +459,7 @@ config GOLDFISH config RETPOLINE bool "Avoid speculative indirect branches in kernel" + select OBJTOOL if HAVE_OBJTOOL default y help Compile kernel with the retpoline compiler options to guard against @@ -472,6 +473,7 @@ config CC_HAS_SLS config SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 + select OBJTOOL if HAVE_OBJTOOL default n help Compile the kernel with straight-line-speculation options to guard @@ -1819,6 +1821,7 @@ config ARCH_RANDOM config X86_SMAP def_bool y prompt "Supervisor Mode Access Prevention" if EXPERT + select OBJTOOL if HAVE_OBJTOOL help Supervisor Mode Access Prevention (SMAP) is a security feature in newer Intel processors. There is a small @@ -1855,9 +1858,10 @@ config CC_HAS_IBT config X86_KERNEL_IBT prompt "Indirect Branch Tracking" bool - depends on X86_64 && CC_HAS_IBT && STACK_VALIDATION + depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 + select OBJTOOL help Build the kernel with support for Indirect Branch Tracking, a hardware support course-grain forward-edge Control Flow Integrity diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d3a6f74a94bd..d872a7522e55 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -237,7 +237,7 @@ choice config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 - select STACK_VALIDATION + select OBJTOOL help This option enables the ORC (Oops Rewind Capability) unwinder for unwinding kernel stack traces. 
It uses a custom data format which is diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 0449b125d27f..3ce0e67c579c 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -20,7 +20,7 @@ _ASM_PTR "%c0 + %c1 - .\n\t" \ ".popsection \n\t" -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_OBJTOOL static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { @@ -34,7 +34,7 @@ l_yes: return true; } -#else +#else /* !CONFIG_OBJTOOL */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { @@ -48,7 +48,7 @@ l_yes: return true; } -#endif /* STACK_VALIDATION */ +#endif /* CONFIG_OBJTOOL */ static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d374cb3cf024..3c66073e7645 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -338,7 +338,7 @@ next: } } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -507,11 +507,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } -#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ +#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } -#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ +#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -- cgit From 4ab7674f5951ac6a8ac4fa8828090edb64a4771f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:39 -0700 Subject: objtool: Make jump label hack optional Objtool secretly does a jump label hack to overcome the limitations of the toolchain. Make the hack explicit (and optional for other arches) by turning it into a cmdline option and kernel config option. 
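[ Editor's note: background sketch, not part of this patch. The
  arch_static_branch() helpers touched in the jump_label.h hunks around this
  note back the generic static-key API from <linux/jump_label.h>, which is
  typically used like this (illustrative example, the key and function names
  are made up):

	DEFINE_STATIC_KEY_FALSE(my_feature_key);

	/* Fast path: compiles to a NOP or a jump that is patched at runtime. */
	if (static_branch_unlikely(&my_feature_key))
		do_rare_work();

	/* Flip the branch from slow-path setup code: */
	static_branch_enable(&my_feature_key);
]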
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/3bdcbfdd27ecb01ddec13c04bdf756a583b13d24.1650300597.git.jpoimboe@redhat.com --- arch/x86/Kconfig | 1 + arch/x86/include/asm/jump_label.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 43e26ee1611e..26d012f1eeb8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -212,6 +212,7 @@ config X86 select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_JUMP_LABEL_HACK if HAVE_OBJTOOL select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 3ce0e67c579c..071572e23d3a 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -20,7 +20,7 @@ _ASM_PTR "%c0 + %c1 - .\n\t" \ ".popsection \n\t" -#ifdef CONFIG_OBJTOOL +#ifdef CONFIG_HAVE_JUMP_LABEL_HACK static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { @@ -34,7 +34,7 @@ l_yes: return true; } -#else /* !CONFIG_OBJTOOL */ +#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { @@ -48,7 +48,7 @@ l_yes: return true; } -#endif /* CONFIG_OBJTOOL */ +#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { -- cgit From 22102f4559beaabcea614b29ee090c6a214f002f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:40 -0700 Subject: objtool: Make noinstr hacks optional Objtool has some hacks in place to workaround toolchain limitations which otherwise would break no-instrumentation rules. Make the hacks explicit (and optional for other arches) by turning it into a cmdline option and kernel config option. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/b326eeb9c33231b9dfbb925f194ed7ee40edcd7c.1650300597.git.jpoimboe@redhat.com --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 26d012f1eeb8..06e7cdd76691 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -231,6 +231,7 @@ config X86 select HAVE_MOD_ARCH_SPECIFIC select HAVE_MOVE_PMD select HAVE_MOVE_PUD + select HAVE_NOINSTR_HACK if HAVE_OBJTOOL select HAVE_NMI select HAVE_OBJTOOL if X86_64 select HAVE_OPTPROBES -- cgit From 489e355b42255c5536a0ea3083a66b54a5e235c3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:42 -0700 Subject: objtool: Add HAVE_NOINSTR_VALIDATION Remove CONFIG_NOINSTR_VALIDATION's dependency on HAVE_OBJTOOL, since other arches might want to implement objtool without it. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/488e94f69db4df154499bc098573d90e5db1c826.1650300597.git.jpoimboe@redhat.com --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 06e7cdd76691..1847d6e974a1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -233,6 +233,7 @@ config X86 select HAVE_MOVE_PUD select HAVE_NOINSTR_HACK if HAVE_OBJTOOL select HAVE_NMI + select HAVE_NOINSTR_VALIDATION if HAVE_OBJTOOL select HAVE_OBJTOOL if X86_64 select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM -- cgit From 70c459d915e838b7f536b8e26e0b3a6141bd2645 Mon Sep 17 00:00:00 2001 From: Carlos Bilbao Date: Tue, 5 Apr 2022 13:32:13 -0500 Subject: x86/mce: Simplify AMD severity grading logic The MCE handler needs to understand the severity of the machine errors to act accordingly. Simplify the AMD grading logic following a logic that closely resembles the descriptions of the public PPR documents. This will help include more fine-grained grading of errors in the future. [ bp: Touchups. ] Signed-off-by: Carlos Bilbao Signed-off-by: Borislav Petkov Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20220405183212.354606-2-carlos.bilbao@amd.com --- arch/x86/kernel/cpu/mce/severity.c | 101 +++++++++++++------------------------ 1 file changed, 36 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 1add86935349..d842148f9c39 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -301,85 +301,56 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) } } -static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx) +/* See AMD PPR(s) section Machine Check Error Handling. */ +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { - u64 mcx_cfg; + int ret; /* - * We need to look at the following bits: - * - "succor" bit (data poisoning support), and - * - TCC bit (Task Context Corrupt) - * in MCi_STATUS to determine error severity. + * Default return value: Action required, the error must be handled + * immediately. */ - if (!mce_flags.succor) - return MCE_PANIC_SEVERITY; - - mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank)); - - /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((mcx_cfg & MCI_CONFIG_MCAX) && - (m->status & MCI_STATUS_TCC) && - (err_ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - - /* ...otherwise invoke hwpoison handler. */ - return MCE_AR_SEVERITY; -} - -/* - * See AMD Error Scope Hierarchy table in a newer BKDG. For example - * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" - */ -static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) -{ - enum context ctx = error_context(m, regs); + ret = MCE_AR_SEVERITY; /* Processor Context Corrupt, no need to fumble too much, die! */ - if (m->status & MCI_STATUS_PCC) - return MCE_PANIC_SEVERITY; - - if (m->status & MCI_STATUS_UC) { - - if (ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; + if (m->status & MCI_STATUS_PCC) { + ret = MCE_PANIC_SEVERITY; + goto out; + } - /* - * On older systems where overflow_recov flag is not present, we - * should simply panic if an error overflow occurs. 
If - * overflow_recov flag is present and set, then software can try - * to at least kill process to prolong system operation. - */ - if (mce_flags.overflow_recov) { - if (mce_flags.smca) - return mce_severity_amd_smca(m, ctx); - - /* kill current process */ - return MCE_AR_SEVERITY; - } else { - /* at least one error was not logged */ - if (m->status & MCI_STATUS_OVER) - return MCE_PANIC_SEVERITY; - } - - /* - * For any other case, return MCE_UC_SEVERITY so that we log the - * error and exit #MC handler. - */ - return MCE_UC_SEVERITY; + if (m->status & MCI_STATUS_DEFERRED) { + ret = MCE_DEFERRED_SEVERITY; + goto out; } /* - * deferred error: poll handler catches these and adds to mce_ring so - * memory-failure can take recovery actions. + * If the UC bit is not set, the system either corrected or deferred + * the error. No action will be required after logging the error. */ - if (m->status & MCI_STATUS_DEFERRED) - return MCE_DEFERRED_SEVERITY; + if (!(m->status & MCI_STATUS_UC)) { + ret = MCE_KEEP_SEVERITY; + goto out; + } /* - * corrected error: poll handler catches these and passes responsibility - * of decoding the error to EDAC + * On MCA overflow, without the MCA overflow recovery feature the + * system will not be able to recover, panic. */ - return MCE_KEEP_SEVERITY; + if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (!mce_flags.succor) { + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (error_context(m, regs) == IN_KERNEL) + ret = MCE_PANIC_SEVERITY; + +out: + return ret; } static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) -- cgit From fa619f5156cf1ee3773edc6d756be262c9ef35de Mon Sep 17 00:00:00 2001 From: Carlos Bilbao Date: Tue, 5 Apr 2022 13:32:14 -0500 Subject: x86/mce: Add messages for panic errors in AMD's MCE grading When a machine error is graded as PANIC by the AMD grading logic, the MCE handler calls mce_panic(). The notification chain does not come into effect so the AMD EDAC driver does not decode the errors. In these cases, the messages displayed to the user are more cryptic and miss information that might be relevant, like the context in which the error took place. Add messages to the grading logic for machine errors so that it is clear what error it was. [ bp: Massage commit message. ] Signed-off-by: Carlos Bilbao Signed-off-by: Borislav Petkov Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20220405183212.354606-3-carlos.bilbao@amd.com --- arch/x86/kernel/cpu/mce/severity.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index d842148f9c39..00483d1c27e4 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -304,6 +304,7 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) /* See AMD PPR(s) section Machine Check Error Handling. */ static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { + char *panic_msg = NULL; int ret; /* @@ -314,6 +315,7 @@ static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char ** /* Processor Context Corrupt, no need to fumble too much, die! 
*/ if (m->status & MCI_STATUS_PCC) { + panic_msg = "Processor Context Corrupt"; ret = MCE_PANIC_SEVERITY; goto out; } @@ -337,19 +339,26 @@ static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char ** * system will not be able to recover, panic. */ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { + panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery"; ret = MCE_PANIC_SEVERITY; goto out; } if (!mce_flags.succor) { + panic_msg = "Uncorrected error without MCA Recovery"; ret = MCE_PANIC_SEVERITY; goto out; } - if (error_context(m, regs) == IN_KERNEL) + if (error_context(m, regs) == IN_KERNEL) { + panic_msg = "Uncorrected unrecoverable error in kernel context"; ret = MCE_PANIC_SEVERITY; + } out: + if (msg && panic_msg) + *msg = panic_msg; + return ret; } -- cgit From 8ad7e8f696951f192c6629a0cbda9ac94c773159 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Apr 2022 14:11:25 +0200 Subject: x86/fpu/xsave: Support XSAVEC in the kernel XSAVEC is the user space counterpart of XSAVES which cannot save supervisor state. In virtualization scenarios the hypervisor does not expose XSAVES but XSAVEC to the guest, though the kernel does not make use of it. That's unfortunate because XSAVEC uses the compacted format of saving the XSTATE. This is more efficient in terms of storage space vs. XSAVE[OPT] as it does not create holes for XSTATE components which are not supported or enabled by the kernel but are available in hardware. There is room for further optimizations when XSAVEC/S and XGETBV1 are supported. In order to support XSAVEC: - Define the XSAVEC ASM macro as it's not yet supported by the required minimal toolchain. - Create a software defined X86_FEATURE_XCOMPACTED to select the compacted XSTATE buffer format for both XSAVEC and XSAVES. - Make XSAVEC an option in the 'XSAVE' ASM alternatives Requested-by: Andrew Cooper Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220404104820.598704095@linutronix.de --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/fpu/xstate.c | 58 +++++++++++++++++++++++++------------- arch/x86/kernel/fpu/xstate.h | 14 +++++---- 3 files changed, 48 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..ff08da857847 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -/* FREE! ( 7*32+10) */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 39e1c8626ab9..31c12f4d0770 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -142,7 +142,8 @@ static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) * Non-compacted format and legacy features use the cached fixed * offsets. 
*/ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE) + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* @@ -369,12 +370,12 @@ static void __init setup_init_fpu_buf(void) /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only - * works with XSAVE, but not with XSAVEOPT and XSAVES because + * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the - * data when XSAVES is available because XSAVES uses xstate + * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the @@ -584,7 +585,8 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; @@ -595,7 +597,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * Supervisor state components can be managed only by * XSAVES. */ - if (!compacted && xfeature_is_supervisor(i)) { + if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1); return false; } @@ -612,8 +614,11 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. + * + * This also takes compaction into account. So this works for + * XSAVEC as well. */ -static unsigned int __init get_xsaves_size(void) +static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* @@ -623,6 +628,10 @@ static unsigned int __init get_xsaves_size(void) * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. + * + * When XSAVES is not available but XSAVEC is (virt), then there + * are no supervisor states, but XSAVEC still uses compacted + * format. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); return ebx; @@ -632,13 +641,13 @@ static unsigned int __init get_xsaves_size(void) * Get the total size of the enabled xstates without the independent supervisor * features. */ -static unsigned int __init get_xsaves_size_no_independent(void) +static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) - return get_xsaves_size(); + return get_compacted_size(); /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); @@ -647,7 +656,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ - size = get_xsaves_size(); + size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. 
*/ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); @@ -687,20 +696,21 @@ static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* - * XSAVES kernel size includes supervisor states and - * uses compacted format when available. + * XSAVES kernel size includes supervisor states and uses compacted + * format. XSAVEC uses compacted format, but does not save + * supervisor states. * - * XSAVE does not support supervisor states so - * kernel and user size is identical. + * XSAVE[OPT] do not support supervisor states so kernel and user + * size is identical. */ if (compacted) - kernel_size = get_xsaves_size_no_independent(); + kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; @@ -813,8 +823,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; - fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | - XFEATURE_MASK_SUPERVISOR_SUPPORTED; + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + else + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; @@ -837,6 +850,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + /* Set up compaction feature bit */ + if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || + cpu_feature_enabled(X86_FEATURE_XSAVES)) + setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); + /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -873,7 +891,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, - boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: @@ -917,7 +935,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } @@ -1525,7 +1543,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * vendors into extending XFD for the pre AMX states, especially * AVX512. 
*/ - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = ¤t->group_leader->thread.fpu; struct fpu_state_perm *perm; unsigned int ksize, usize; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index d22ace092ca2..5ad47031383b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -16,7 +16,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } @@ -79,6 +79,7 @@ static inline u64 xfeatures_mask_independent(void) /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" @@ -97,9 +98,11 @@ static inline u64 xfeatures_mask_independent(void) : "memory") /* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. + * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor + * states in addition to XSAVEC. + * + * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports + * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. @@ -111,8 +114,9 @@ static inline u64 xfeatures_mask_independent(void) * address of the instruction where we might get an exception at. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ + asm volatile(ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ -- cgit From 306f7cc1e9061313c46d19e9bdffc819880794c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Apr 2022 15:12:56 +0800 Subject: uapi: always define F_GETLK64/F_SETLK64/F_SETLKW64 in fcntl.h The F_GETLK64/F_SETLK64/F_SETLKW64 fcntl opcodes are only implemented for the 32-bit syscall APIs, but are also needed for compat handling on 64-bit kernels. Consolidate them in unistd.h instead of definining the internal compat definitions in compat.h, which is rather error prone (e.g. parisc gets the values wrong currently). Note that before this change they were never visible to userspace due to the fact that CONFIG_64BIT is only set for kernel builds. 
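[ Editor's note: hedged sketch, not lifted verbatim from the series. After
  the consolidation the generic UAPI header is expected to provide these
  opcodes unconditionally, roughly along the following lines; the values
  match the x86 compat definitions removed in the hunk below:

	/* include/uapi/asm-generic/fcntl.h (sketch) */
	#define F_GETLK64	12	/* using 'struct flock64' */
	#define F_SETLK64	13
	#define F_SETLKW64	14
]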
Signed-off-by: Christoph Hellwig Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220405071314.3225832-3-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/x86/include/asm/compat.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 7516e4199b3c..8d19a212f4f2 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -58,10 +58,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 - /* * IA32 uses 4 byte alignment for 64 bit quantities, * so we need to pack this structure. -- cgit From 3ce0f2373f7073f04715b1ffd7b1812d3183f79a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Apr 2022 15:12:57 +0800 Subject: compat: consolidate the compat_flock{,64} definition Provide a single common definition for the compat_flock and compat_flock64 structures using the same tricks as for the native variants. Another extra define is added for the packing required on x86. Signed-off-by: Christoph Hellwig Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-4-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/x86/include/asm/compat.h | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 8d19a212f4f2..de794d895866 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -50,25 +50,11 @@ struct compat_stat { u32 __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - /* - * IA32 uses 4 byte alignment for 64 bit quantities, - * so we need to pack this structure. + * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the + * compat flock64 structure. */ -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -} __attribute__((packed)); +#define __ARCH_NEED_COMPAT_FLOCK64_PACKED struct compat_statfs { int f_type; -- cgit From 0cbed0ee1dbcd7ecd890e6af297c9d133f730021 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:12:58 +0800 Subject: arch: Add SYSVIPC_COMPAT for all architectures The existing per-arch definitions are pretty much historic cruft. Move SYSVIPC_COMPAT into init/Kconfig. 
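[ Editor's note: illustrative sketch. The consolidated option in init/Kconfig
  is expected to mirror the per-arch stanza removed in the hunk below, with
  the surrounding "if COMPAT" guard folded into the dependency:

	config SYSVIPC_COMPAT
		def_bool y
		depends on COMPAT && SYSVIPC
]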
Signed-off-by: Guo Ren Signed-off-by: Guo Ren Acked-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-5-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/x86/Kconfig | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..65690b950f5f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2872,10 +2872,6 @@ config COMPAT if COMPAT config COMPAT_FOR_U64_ALIGNMENT def_bool y - -config SYSVIPC_COMPAT - def_bool y - depends on SYSVIPC endif endmenu -- cgit From f18ed30db299458f809aec55bf1800dbeebeb953 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:12:59 +0800 Subject: fs: stat: compat: Add __ARCH_WANT_COMPAT_STAT RISC-V doesn't need compat_stat, so use __ARCH_WANT_COMPAT_STAT to exclude unnecessary SYSCALL functions. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-6-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/x86/include/asm/unistd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 80e9d5206a71..761173ccc33c 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -22,6 +22,7 @@ # include # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME +# define __ARCH_WANT_COMPAT_STAT # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 -- cgit From 84a0c977ab9821a4b57f69d9a6108f64188d1e71 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:13:00 +0800 Subject: asm-generic: compat: Cleanup duplicate definitions There are 7 64bit architectures that support Linux COMPAT mode to run 32bit applications. A lot of definitions are duplicate: - COMPAT_USER_HZ - COMPAT_RLIM_INFINITY - COMPAT_OFF_T_MAX - __compat_uid_t, __compat_gid_t - compat_dev_t - compat_ipc_pid_t - struct compat_flock - struct compat_flock64 - struct compat_statfs - struct compat_ipc64_perm, compat_semid64_ds, compat_msqid64_ds, compat_shmid64_ds Cleanup duplicate definitions and merge them into asm-generic.
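[ Editor's note: sketch of the resulting override pattern, mirroring the x86
  hunk below; it is not a verbatim quote of the generic header, and the
  fallback width shown is an assumption. An architecture with a non-default
  layout defines the type (and a same-named guard macro) before including
  the generic header, which only supplies a default when the guard is absent:

	/* arch compat header, before pulling in <asm-generic/compat.h>: */
	#define compat_dev_t compat_dev_t
	typedef u16 compat_dev_t;

	/* asm-generic/compat.h fallback (assumed default width): */
	#ifndef compat_dev_t
	typedef u32 compat_dev_t;
	#endif
]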
Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-7-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/x86/include/asm/compat.h | 80 +++++++------------------------------------ 1 file changed, 12 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index de794d895866..e74a107de0d0 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -15,17 +15,23 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u16 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -71,68 +77,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - unsigned short mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - compat_ulong_t unused1; - compat_ulong_t unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused3; - compat_ulong_t __unused4; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) -- cgit From 75d359ec4141b013727022a663762931f69e6510 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 22 Apr 2022 08:56:23 -0500 Subject: x86/sev: Add missing __init annotations to SEV init routines Currently, get_secrets_page() is only reachable from the following call chain: __init snp_init_platform_device(): get_secrets_page() so mark it as __init as well. This is also needed since it calls early_memremap(), which is also an __init routine. 
Similarly, get_jump_table_addr() is only reachable from the following call chain: __init setup_real_mode(): sme_sev_setup_real_mode(): sev_es_setup_ap_jump_table(): get_jump_table_addr() so mark get_jump_table_addr() and everything up that call chain as __init as well. This is also needed since future patches will add a call to get_secrets_page(), which needs to be __init due to the reasons stated above. Suggested-by: Borislav Petkov Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220422135624.114172-2-michael.roth@amd.com --- arch/x86/kernel/sev.c | 6 +++--- arch/x86/realmode/init.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 2fa87a07ab30..b7fd1915560d 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -558,7 +558,7 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } -static u64 get_jump_table_addr(void) +static u64 __init get_jump_table_addr(void) { struct ghcb_state state; unsigned long flags; @@ -1077,7 +1077,7 @@ void snp_set_wakeup_secondary_cpu(void) apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; } -int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; phys_addr_t jump_table_pa; @@ -2171,7 +2171,7 @@ static struct platform_device sev_guest_device = { .id = -1, }; -static u64 get_secrets_page(void) +static u64 __init get_secrets_page(void) { u64 pa_data = boot_params.cc_blob_address; struct cc_blob_sev_info info; diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index c5e29db02a46..41d7669a97ad 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -67,7 +67,7 @@ void __init reserve_real_mode(void) memblock_reserve(0, SZ_1M); } -static void sme_sev_setup_real_mode(struct trampoline_header *th) +static void __init sme_sev_setup_real_mode(struct trampoline_header *th) { #ifdef CONFIG_AMD_MEM_ENCRYPT if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) -- cgit From c2106a231c2ba36ff9af50cdf2867b9a5f8150a6 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 22 Apr 2022 08:56:24 -0500 Subject: x86/sev: Get the AP jump table address from secrets page The GHCB specification section 2.7 states that when SEV-SNP is enabled, a guest should not rely on the hypervisor to provide the address of the AP jump table. Instead, if a guest BIOS wants to provide an AP jump table, it should record the address in the SNP secrets page so the guest operating system can obtain it directly from there. Fix this on the guest kernel side by having SNP guests use the AP jump table address published in the secrets page rather than issuing a GHCB request to get it. 
[ mroth: - Improve error handling when ioremap()/memremap() return NULL - Don't mix function calls with declarations - Add missing __init - Tweak commit message ] Fixes: 0afb6b660a6b ("x86/sev: Use SEV-SNP AP creation to start secondary CPUs") Signed-off-by: Brijesh Singh Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220422135624.114172-3-michael.roth@amd.com --- arch/x86/include/asm/sev.h | 35 +++++++++++++++++++++ arch/x86/kernel/sev.c | 76 +++++++++++++++++++++++++++++++--------------- 2 files changed, 87 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 6e3dda4f82b5..19514524f0f8 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -99,6 +99,41 @@ struct sev_guest_platform_data { u64 secrets_gpa; }; +/* + * The secrets page contains 96-bytes of reserved field that can be used by + * the guest OS. The guest OS uses the area to save the message sequence + * number for each VMPCK. + * + * See the GHCB spec section Secret page layout for the format for this area. + */ +struct secrets_os_area { + u32 msg_seqno_0; + u32 msg_seqno_1; + u32 msg_seqno_2; + u32 msg_seqno_3; + u64 ap_jump_table_pa; + u8 rsvd[40]; + u8 guest_usage[32]; +} __packed; + +#define VMPCK_KEY_LEN 32 + +/* See the SNP spec version 0.9 for secrets page format */ +struct snp_secrets_page_layout { + u32 version; + u32 imien : 1, + rsvd1 : 31; + u32 fms; + u32 rsvd2; + u8 gosvw[16]; + u8 vmpck0[VMPCK_KEY_LEN]; + u8 vmpck1[VMPCK_KEY_LEN]; + u8 vmpck2[VMPCK_KEY_LEN]; + u8 vmpck3[VMPCK_KEY_LEN]; + struct secrets_os_area os_area; + u8 rsvd3[3840]; +} __packed; + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index b7fd1915560d..166375084b1f 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -558,6 +558,55 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } +static u64 __init get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. 
+ */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + if (!map) { + pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); + return 0; + } + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static u64 __init get_snp_jump_table_addr(void) +{ + struct snp_secrets_page_layout *layout; + u64 pa, addr; + + pa = get_secrets_page(); + if (!pa) + return 0; + + layout = (__force void *)ioremap_encrypted(pa, PAGE_SIZE); + if (!layout) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } + + addr = layout->os_area.ap_jump_table_pa; + iounmap(layout); + + return addr; +} + static u64 __init get_jump_table_addr(void) { struct ghcb_state state; @@ -565,6 +614,9 @@ static u64 __init get_jump_table_addr(void) struct ghcb *ghcb; u64 ret = 0; + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); + local_irq_save(flags); ghcb = __sev_get_ghcb(&state); @@ -2171,30 +2223,6 @@ static struct platform_device sev_guest_device = { .id = -1, }; -static u64 __init get_secrets_page(void) -{ - u64 pa_data = boot_params.cc_blob_address; - struct cc_blob_sev_info info; - void *map; - - /* - * The CC blob contains the address of the secrets page, check if the - * blob is present. - */ - if (!pa_data) - return 0; - - map = early_memremap(pa_data, sizeof(info)); - memcpy(&info, map, sizeof(info)); - early_memunmap(map, sizeof(info)); - - /* smoke-test the secrets page passed */ - if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) - return 0; - - return info.secrets_phys; -} - static int __init snp_init_platform_device(void) { struct sev_guest_platform_data data; -- cgit From b0b592cf08367719e1d1ef07c9f136e8c17f7ec3 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sat, 23 Apr 2022 20:24:10 +0200 Subject: x86/pm: Fix false positive kmemleak report in msr_build_context() Since e2a1256b17b1 ("x86/speculation: Restore speculation related MSRs during S3 resume") kmemleak reports this issue: unreferenced object 0xffff888009cedc00 (size 256): comm "swapper/0", pid 1, jiffies 4294693823 (age 73.764s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 48 00 00 00 00 00 00 00 ........H....... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: msr_build_context (include/linux/slab.h:621) pm_check_save_msr (arch/x86/power/cpu.c:520) do_one_initcall (init/main.c:1298) kernel_init_freeable (init/main.c:1370) kernel_init (init/main.c:1504) ret_from_fork (arch/x86/entry/entry_64.S:304) Reproducer: - boot the VM with a debug kernel config (see https://github.com/multipath-tcp/mptcp_net-next/issues/268) - wait ~1 minute - start a kmemleak scan The root cause here is alignment within the packed struct saved_context (from suspend_64.h). Kmemleak only searches for pointers that are aligned (see how pointers are scanned in kmemleak.c), but pahole shows that the saved_msrs struct member and all members after it in the structure are unaligned: struct saved_context { struct pt_regs regs; /* 0 168 */ /* --- cacheline 2 boundary (128 bytes) was 40 bytes ago --- */ u16 ds; /* 168 2 */ ... 
u64 misc_enable; /* 232 8 */ bool misc_enable_saved; /* 240 1 */ /* Note below odd offset values for the remainder of this struct */ struct saved_msrs saved_msrs; /* 241 16 */ /* --- cacheline 4 boundary (256 bytes) was 1 bytes ago --- */ long unsigned int efer; /* 257 8 */ u16 gdt_pad; /* 265 2 */ struct desc_ptr gdt_desc; /* 267 10 */ u16 idt_pad; /* 277 2 */ struct desc_ptr idt; /* 279 10 */ u16 ldt; /* 289 2 */ u16 tss; /* 291 2 */ long unsigned int tr; /* 293 8 */ long unsigned int safety; /* 301 8 */ long unsigned int return_address; /* 309 8 */ /* size: 317, cachelines: 5, members: 25 */ /* last cacheline: 61 bytes */ } __attribute__((__packed__)); Move misc_enable_saved to the end of the struct declaration so that saved_msrs fits in before the cacheline 4 boundary. The comment above the saved_context declaration says to fix wakeup_64.S file and __save/__restore_processor_state() if the struct is modified: it looks like all the accesses in wakeup_64.S are done through offsets which are computed at build-time. Update that comment accordingly. At the end, the false positive kmemleak report is due to a limitation from kmemleak but it is always good to avoid unaligned members for optimisation purposes. Please note that it looks like this issue is not new, e.g. https://lore.kernel.org/all/9f1bb619-c4ee-21c4-a251-870bd4db04fa@lwfinger.net/ https://lore.kernel.org/all/94e48fcd-1dbd-ebd2-4c91-f39941735909@molgen.mpg.de/ [ bp: Massage + cleanup commit message. ] Fixes: 7a9c2dd08ead ("x86/pm: Introduce quirk framework to save/restore extra MSR registers around suspend/resume") Suggested-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Borislav Petkov Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220426202138.498310-1-matthieu.baerts@tessares.net --- arch/x86/include/asm/suspend_32.h | 2 +- arch/x86/include/asm/suspend_64.h | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 7b132d0312eb..a800abb1a992 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -19,7 +19,6 @@ struct saved_context { u16 gs; unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; struct desc_ptr gdt_desc; struct desc_ptr idt; @@ -28,6 +27,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); /* routines for saving/restoring kernel state */ diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 35bb35d28733..54df06687d83 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -14,9 +14,13 @@ * Image of the saved processor state, used by the low level ACPI suspend to * RAM code and by the low level hibernation code. * - * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that - * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c, - * still work as required. + * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S + * and make sure that __save/__restore_processor_state(), defined in + * arch/x86/power/cpu.c, still work as required. + * + * Because the structure is packed, make sure to avoid unaligned members. For + * optimisation purposes but also because tools like kmemleak only search for + * pointers that are aligned. 
*/ struct saved_context { struct pt_regs regs; @@ -36,7 +40,6 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; unsigned long efer; u16 gdt_pad; /* Unused */ @@ -48,6 +51,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); #define loaddebug(thread,register) \ -- cgit From b041b525dab95352fbd666b14dc73ab898df465f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 10 Mar 2022 12:48:53 -0800 Subject: x86/split_lock: Make life miserable for split lockers In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas said: It's simply wishful thinking that stuff gets fixed because of a WARN_ONCE(). This has never worked. The only thing which works is to make stuff fail hard or slow it down in a way which makes it annoying enough to users to complain. He was talking about WBINVD. But it made me think about how we use the split lock detection feature in Linux. Existing code has three options for applications: 1) Don't enable split lock detection (allow arbitrary split locks) 2) Warn once when a process uses a split lock, but let the process keep running with split lock detection disabled 3) Kill processes that use split locks Option 2 falls into the "wishful thinking" territory that Thomas warns does nothing. But option 3 might not be viable in a situation with legacy applications that need to run. Hence make option 2 much stricter to "slow it down in a way which makes it annoying". The primary reason for this change is to provide better quality of service to the rest of the applications running on the system. Internal testing shows that even with many processes splitting locks, performance for the rest of the system is much more responsive. The new "warn" mode operates like this. When an application tries to execute a bus lock, the #AC handler: 1) Delays (interruptibly) 10 ms before moving to the next step. 2) Blocks (interruptibly) until it can get the semaphore. If interrupted, just return. Assume the signal will either kill the task, or direct execution away from the instruction that is trying to get the bus lock. 3) Disables split lock detection for the current core 4) Schedules a work queue to re-enable split lock detection in 2 jiffies 5) Returns The work queue that re-enables split lock detection also releases the semaphore. There is a corner case where a CPU may be taken offline while split lock detection is disabled. A CPU hotplug handler handles this case. Old behaviour was to only print the split lock warning on the first occurrence of a split lock from a task. Preserve that by adding a flag to the task structure that suppresses subsequent split lock messages from that task.
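For background, a split lock is a locked read-modify-write whose operand straddles a cache line boundary. As a purely illustrative sketch (not part of this patch; it assumes a 64-byte cache line and a GCC/Clang toolchain), a user-space program that would trip the #AC handler described above might look like:

  #include <stdint.h>

  int main(void)
  {
          /* 64-byte aligned buffer; place a 4-byte atomic so it crosses the line boundary */
          static _Alignas(64) uint8_t buf[128];
          uint32_t *p = (uint32_t *)(void *)(buf + 62);

          /* Locked RMW across a cache line boundary: a split lock, trapped via #AC when detection is on */
          __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);
          return 0;
  }

Booted with split_lock_detect=warn, such a process is now delayed and serialized as described above instead of merely triggering a one-time warning.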
Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220310204854.31752-2-tony.luck@intel.com --- arch/x86/kernel/cpu/intel.c | 63 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f7a5370a9b3b..be2a0bdb9527 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,10 +7,13 @@ #include #include #include +#include #include #include #include +#include #include +#include #include #include @@ -999,6 +1002,8 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static DEFINE_SEMAPHORE(buslock_sem); + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1109,18 +1114,52 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); + static void split_lock_warn(unsigned long ip) { - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); + int cpu; - /* - * Disable the split lock detection for this task so it can make - * progress and set TIF_SLD so the detection is re-enabled via - * switch_to_sld() when the task is scheduled out. 
- */ + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + /* misery factor #1, sleep 10ms before trying to execute split lock */ + if (msleep_interruptible(10) > 0) + return; + /* Misery factor #2, only allow one buslocked disabled core at a time */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + cpu = get_cpu(); + schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + + /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); - set_tsk_thread_flag(current, TIF_SLD); + put_cpu(); } bool handle_guest_split_lock(unsigned long ip) @@ -1274,10 +1313,14 @@ static void sld_state_show(void) pr_info("disabled\n"); break; case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { pr_info("#DB: warning on user-space bus_locks\n"); + } break; case sld_fatal: if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { -- cgit From ef79970d7ccdc4e8855aa6079fc2f4797a6807fb Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 10 Mar 2022 12:48:54 -0800 Subject: x86/split-lock: Remove unused TIF_SLD bit Changes to the "warn" mode of split lock handling mean that TIF_SLD is never set. Remove the bit, and the functions that use it. Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220310204854.31752-3-tony.luck@intel.com --- arch/x86/include/asm/cpu.h | 2 -- arch/x86/include/asm/thread_info.h | 4 +--- arch/x86/kernel/cpu/intel.c | 12 ------------ arch/x86/kernel/process.c | 3 --- 4 files changed, 1 insertion(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..d1c86b221bb9 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -43,14 +43,12 @@ unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); #ifdef CONFIG_CPU_SUP_INTEL extern void __init sld_setup(struct cpuinfo_x86 *c); -extern void switch_to_sld(unsigned long tifn); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); extern void handle_bus_lock(struct pt_regs *regs); u8 get_this_hybrid_cpu_type(void); #else static inline void __init sld_setup(struct cpuinfo_x86 *c) {} -static inline void switch_to_sld(unsigned long tifn) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) { return false; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ebec69c35e95..f0cb881c1d69 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,7 +92,6 @@ struct thread_info { #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ -#define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is 
polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -116,7 +115,6 @@ struct thread_info { #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) @@ -128,7 +126,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \ - _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD) + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) /* * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be2a0bdb9527..672e253a58b9 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1232,18 +1232,6 @@ void handle_bus_lock(struct pt_regs *regs) } } -/* - * This function is called only when switching between tasks with - * different split-lock detection modes. It sets the MSR for the - * mode of the new task. This is right most of the time, but since - * the MSR is shared by hyperthreads on a physical core there can - * be glitches when the two threads need different modes. - */ -void switch_to_sld(unsigned long tifn) -{ - sld_update_msr(!(tifn & _TIF_SLD)); -} - /* * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should * only be trusted if it is confirmed that a CPU model implements a diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..bcc76c187e11 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -686,9 +686,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } - - if ((tifp ^ tifn) & _TIF_SLD) - switch_to_sld(tifn); } /* -- cgit From 6d108c96bf23598cc3b4f91d60e9b7694abcd2a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:50 +0200 Subject: x86/aperfmperf: Dont wake idle CPUs in arch_freq_get_on_cpu() aperfmperf_get_khz() already excludes idle CPUs from APERF/MPERF sampling and that's a reasonable decision. There is no point in sending up to two IPIs to an idle CPU just because someone reads a sysfs file. Signed-off-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220415161206.419880163@linutronix.de --- arch/x86/kernel/cpu/aperfmperf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 9ca008f9e9b1..ea9160f7aaad 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -139,6 +139,9 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) return 0; + if (rcu_is_idle_cpu(cpu)) + return 0; + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) return per_cpu(samples.khz, cpu); -- cgit From 55cb0b70749361d7f82a979768c77ac301f07da9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:51 +0200 Subject: x86/smp: Move APERF/MPERF code where it belongs as this can share code with the preexisting APERF/MPERF code. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20220415161206.478362457@linutronix.de --- arch/x86/kernel/cpu/aperfmperf.c | 366 ++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/smpboot.c | 355 ------------------------------------- 2 files changed, 362 insertions(+), 359 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index ea9160f7aaad..35fff01e87b4 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -6,15 +6,19 @@ * Copyright (C) 2017 Intel Corp. * Author: Len Brown */ - +#include #include #include #include #include -#include -#include -#include #include +#include +#include +#include +#include + +#include +#include #include "cpu.h" @@ -152,3 +156,357 @@ unsigned int arch_freq_get_on_cpu(int cpu) return per_cpu(samples.khz, cpu); } + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) +/* + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. + */ + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static DEFINE_PER_CPU(u64, arch_prev_aperf); +static DEFINE_PER_CPU(u64, arch_prev_mperf); +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) +{ + arch_max_freq_ratio = turbo_disabled ? 
SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); + +static bool turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} + +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + 
if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: + /* + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. + */ + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); + return false; + } + + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; + arch_set_max_freq_ratio(turbo_disabled()); + + return true; +} + +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); +} + +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) +{ + /* Bail out if registered already. */ + if (freq_invariance_syscore_ops.node.prev) + return; + + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif + +void init_freq_invariance(bool secondary, bool cppc_ready) +{ + bool ret = false; + + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + if (secondary) { + if (static_branch_likely(&arch_scale_freq_key)) { + init_counter_refs(); + } + return; + } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + ret = intel_set_max_freq_ratio(); + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (!cppc_ready) { + return; + } + ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); + } + + if (ret) { + init_counter_refs(); + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); + } else { + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); + } +} + +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); +} + +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +void arch_scale_freq_tick(void) +{ + u64 freq_scale; + u64 aperf, mperf; + u64 acnt, mcnt; + + if (!arch_scale_freq_invariant()) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + acnt = aperf - this_cpu_read(arch_prev_aperf); + mcnt = mperf - this_cpu_read(arch_prev_mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); + + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; + + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; + + freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); +} +#endif /* CONFIG_X86_64 && CONFIG_SMP */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 
2ef14772dc04..a9fc16a9c408 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include @@ -1847,357 +1846,3 @@ void native_play_dead(void) } #endif - -#ifdef CONFIG_X86_64 -/* - * APERF/MPERF frequency ratio computation. - * - * The scheduler wants to do frequency invariant accounting and needs a <1 - * ratio to account for the 'current' frequency, corresponding to - * freq_curr / freq_max. - * - * Since the frequency freq_curr on x86 is controlled by micro-controller and - * our P-state setting is little more than a request/hint, we need to observe - * the effective frequency 'BusyMHz', i.e. the average frequency over a time - * interval after discarding idle time. This is given by: - * - * BusyMHz = delta_APERF / delta_MPERF * freq_base - * - * where freq_base is the max non-turbo P-state. - * - * The freq_max term has to be set to a somewhat arbitrary value, because we - * can't know which turbo states will be available at a given point in time: - * it all depends on the thermal headroom of the entire package. We set it to - * the turbo level with 4 cores active. - * - * Benchmarks show that's a good compromise between the 1C turbo ratio - * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, - * which would ignore the entire turbo range (a conspicuous part, making - * freq_curr/freq_max always maxed out). - * - * An exception to the heuristic above is the Atom uarch, where we choose the - * highest turbo level for freq_max since Atom's are generally oriented towards - * power efficiency. - * - * Setting freq_max to anything less than the 1C turbo ratio makes the ratio - * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. - */ - -DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); - -static DEFINE_PER_CPU(u64, arch_prev_aperf); -static DEFINE_PER_CPU(u64, arch_prev_mperf); -static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; -static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; - -void arch_set_max_freq_ratio(bool turbo_disabled) -{ - arch_max_freq_ratio = turbo_disabled ? 
SCHED_CAPACITY_SCALE : - arch_turbo_freq_ratio; -} -EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - -static bool turbo_disabled(void) -{ - u64 misc_en; - int err; - - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); - if (err) - return false; - - return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); -} - -static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - int err; - - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ - *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ - - return true; -} - -#define X86_MATCH(model) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) - -static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { - X86_MATCH(XEON_PHI_KNL), - X86_MATCH(XEON_PHI_KNM), - {} -}; - -static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { - X86_MATCH(SKYLAKE_X), - {} -}; - -static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { - X86_MATCH(ATOM_GOLDMONT), - X86_MATCH(ATOM_GOLDMONT_D), - X86_MATCH(ATOM_GOLDMONT_PLUS), - {} -}; - -static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, - int num_delta_fratio) -{ - int fratio, delta_fratio, found; - int err, i; - u64 msr; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - fratio = (msr >> 8) & 0xFF; - i = 16; - found = 0; - do { - if (found >= num_delta_fratio) { - *turbo_freq = fratio; - return true; - } - - delta_fratio = (msr >> (i + 5)) & 0x7; - - if (delta_fratio) { - found += 1; - fratio -= delta_fratio; - } - - i += 8; - } while (i < 64); - - return true; -} - -static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) -{ - u64 ratios, counts; - u32 group_size; - int err, i; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); - if (err) - return false; - - for (i = 0; i < 64; i += 8) { - group_size = (counts >> i) & 0xFF; - if (group_size >= size) { - *turbo_freq = (ratios >> i) & 0xFF; - return true; - } - } - - return false; -} - -static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - u64 msr; - int err; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ - - /* The CPU may have less than 4 cores */ - if (!*turbo_freq) - *turbo_freq = msr & 0xFF; /* 1C turbo */ - - return true; -} - -static bool intel_set_max_freq_ratio(void) -{ - u64 base_freq, turbo_freq; - u64 turbo_ratio; - - if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - if (x86_match_cpu(has_glm_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_knl_turbo_ratio_limits) && - knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_skx_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) - goto out; - - 
if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - return false; - -out: - /* - * Some hypervisors advertise X86_FEATURE_APERFMPERF - * but then fill all MSR's with zeroes. - * Some CPUs have turbo boost but don't declare any turbo ratio - * in MSR_TURBO_RATIO_LIMIT. - */ - if (!base_freq || !turbo_freq) { - pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); - return false; - } - - turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); - if (!turbo_ratio) { - pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); - return false; - } - - arch_turbo_freq_ratio = turbo_ratio; - arch_set_max_freq_ratio(turbo_disabled()); - - return true; -} - -static void init_counter_refs(void) -{ - u64 aperf, mperf; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); -} - -#ifdef CONFIG_PM_SLEEP -static struct syscore_ops freq_invariance_syscore_ops = { - .resume = init_counter_refs, -}; - -static void register_freq_invariance_syscore_ops(void) -{ - /* Bail out if registered already. */ - if (freq_invariance_syscore_ops.node.prev) - return; - - register_syscore_ops(&freq_invariance_syscore_ops); -} -#else -static inline void register_freq_invariance_syscore_ops(void) {} -#endif - -void init_freq_invariance(bool secondary, bool cppc_ready) -{ - bool ret = false; - - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; - - if (secondary) { - if (static_branch_likely(&arch_scale_freq_key)) { - init_counter_refs(); - } - return; - } - - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - ret = intel_set_max_freq_ratio(); - else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) { - return; - } - ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); - } - - if (ret) { - init_counter_refs(); - static_branch_enable(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); - pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); - } else { - pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); - } -} - -static void disable_freq_invariance_workfn(struct work_struct *work) -{ - static_branch_disable(&arch_scale_freq_key); -} - -static DECLARE_WORK(disable_freq_invariance_work, - disable_freq_invariance_workfn); - -DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; - -void arch_scale_freq_tick(void) -{ - u64 freq_scale; - u64 aperf, mperf; - u64 acnt, mcnt; - - if (!arch_scale_freq_invariant()) - return; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - acnt = aperf - this_cpu_read(arch_prev_aperf); - mcnt = mperf - this_cpu_read(arch_prev_mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); - - if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) - goto error; - - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) - goto error; - - freq_scale = div64_u64(acnt, mcnt); - if (!freq_scale) - goto error; - - if (freq_scale > SCHED_CAPACITY_SCALE) - freq_scale = SCHED_CAPACITY_SCALE; - - this_cpu_write(arch_freq_scale, freq_scale); - return; - -error: - pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); - schedule_work(&disable_freq_invariance_work); -} -#endif /* CONFIG_X86_64 */ -- cgit From 138a7f9c6beae8d652113b8e7a44994b4200bbcd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner 
Date: Fri, 15 Apr 2022 21:19:53 +0200 Subject: x86/aperfmperf: Separate AP/BP frequency invariance init This code is convoluted and because it can be invoked post init via the ACPI/CPPC code, all of the initialization functionality is built in instead of being part of init text and init data. As a first step create separate calls for the boot and the application processors. Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220415161206.536733494@linutronix.de --- arch/x86/include/asm/topology.h | 12 +++++------- arch/x86/kernel/acpi/cppc.c | 3 ++- arch/x86/kernel/cpu/aperfmperf.c | 23 +++++++++++------------ arch/x86/kernel/smpboot.c | 4 ++-- 4 files changed, 20 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 9619385bf749..e2faedc6e793 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -216,14 +216,12 @@ extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick extern void arch_set_max_freq_ratio(bool turbo_disabled); -void init_freq_invariance(bool secondary, bool cppc_ready); +extern void bp_init_freq_invariance(bool cppc_ready); +extern void ap_init_freq_invariance(void); #else -static inline void arch_set_max_freq_ratio(bool turbo_disabled) -{ -} -static inline void init_freq_invariance(bool secondary, bool cppc_ready) -{ -} +static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } +static inline void bp_init_freq_invariance(bool cppc_ready) { } +static inline void ap_init_freq_invariance(void) { } #endif #ifdef CONFIG_ACPI_CPPC_LIB diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index df1644d9b3b6..06109d927a18 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -96,7 +96,8 @@ void init_freq_invariance_cppc(void) mutex_lock(&freq_invariance_lock); - init_freq_invariance(secondary, true); + if (!secondary) + bp_init_freq_invariance(true); secondary = true; mutex_unlock(&freq_invariance_lock); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 35fff01e87b4..87f34f23a974 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -428,31 +428,24 @@ static void register_freq_invariance_syscore_ops(void) static inline void register_freq_invariance_syscore_ops(void) {} #endif -void init_freq_invariance(bool secondary, bool cppc_ready) +void bp_init_freq_invariance(bool cppc_ready) { - bool ret = false; + bool ret; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - if (secondary) { - if (static_branch_likely(&arch_scale_freq_key)) { - init_counter_refs(); - } - return; - } + init_counter_refs(); if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ret = intel_set_max_freq_ratio(); else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) { + if (!cppc_ready) return; - } ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); } if (ret) { - init_counter_refs(); static_branch_enable(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); @@ -461,6 +454,12 @@ void init_freq_invariance(bool secondary, bool cppc_ready) } } +void ap_init_freq_invariance(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + 
init_counter_refs(); +} + static void disable_freq_invariance_workfn(struct work_struct *work) { static_branch_disable(&arch_scale_freq_key); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a9fc16a9c408..023feb40f5c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -186,7 +186,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(true, false); + ap_init_freq_invariance(); /* * Get our bogomips. @@ -1396,7 +1396,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - init_freq_invariance(false, false); + bp_init_freq_invariance(false); smp_sanity_check(); switch (apic_intr_mode) { -- cgit From 0dfaf3f6ecc0c7f4f876255aa82e8959d3721365 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:54 +0200 Subject: x86/aperfmperf: Untangle Intel and AMD frequency invariance init AMD boot CPU initialization happens late via ACPI/CPPC which prevents the Intel parts from being marked __init. Split out the common code and provide a dedicated interface for the AMD initialization and mark the Intel specific code and data __init. The remaining text size is almost cut in half: text: 2614 -> 1350 init.text: 0 -> 786 Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220415161206.592465719@linutronix.de --- arch/x86/include/asm/topology.h | 13 +++------ arch/x86/kernel/acpi/cppc.c | 30 +++++++++---------- arch/x86/kernel/cpu/aperfmperf.c | 62 +++++++++++++++++++++------------------- arch/x86/kernel/smpboot.c | 2 +- 4 files changed, 51 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index e2faedc6e793..cc317077e73e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -216,24 +216,19 @@ extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick extern void arch_set_max_freq_ratio(bool turbo_disabled); -extern void bp_init_freq_invariance(bool cppc_ready); +extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); +extern void bp_init_freq_invariance(void); extern void ap_init_freq_invariance(void); #else static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } -static inline void bp_init_freq_invariance(bool cppc_ready) { } +static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } +static inline void bp_init_freq_invariance(void) { } static inline void ap_init_freq_invariance(void) { } #endif #ifdef CONFIG_ACPI_CPPC_LIB void init_freq_invariance_cppc(void); #define arch_init_invariance_cppc init_freq_invariance_cppc - -bool amd_set_max_freq_ratio(u64 *ratio); -#else -static inline bool amd_set_max_freq_ratio(u64 *ratio) -{ - return false; -} #endif #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 06109d927a18..8b8cbf22461a 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -50,20 +50,17 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) return err; } -bool amd_set_max_freq_ratio(u64 *ratio) +static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; u64 highest_perf, nominal_perf; u64 perf_ratio; int rc; - if (!ratio) - return false; - rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { pr_debug("Could not retrieve perf counters 
(%d)\n", rc); - return false; + return; } highest_perf = amd_get_highest_perf(); @@ -71,7 +68,7 @@ bool amd_set_max_freq_ratio(u64 *ratio) if (!highest_perf || !nominal_perf) { pr_debug("Could not retrieve highest or nominal performance\n"); - return false; + return; } perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); @@ -79,26 +76,27 @@ bool amd_set_max_freq_ratio(u64 *ratio) perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; if (!perf_ratio) { pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return false; + return; } - *ratio = perf_ratio; - arch_set_max_freq_ratio(false); - - return true; + freq_invariance_set_perf_ratio(perf_ratio, false); } static DEFINE_MUTEX(freq_invariance_lock); void init_freq_invariance_cppc(void) { - static bool secondary; + static bool init_done; - mutex_lock(&freq_invariance_lock); + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; - if (!secondary) - bp_init_freq_invariance(true); - secondary = true; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; mutex_unlock(&freq_invariance_lock); } diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 87f34f23a974..b4f4ea529f2d 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -206,7 +206,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled) } EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); -static bool turbo_disabled(void) +static bool __init turbo_disabled(void) { u64 misc_en; int err; @@ -218,7 +218,7 @@ static bool turbo_disabled(void) return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } -static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; @@ -240,26 +240,26 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) -static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { X86_MATCH(XEON_PHI_KNL), X86_MATCH(XEON_PHI_KNM), {} }; -static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { X86_MATCH(SKYLAKE_X), {} }; -static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { X86_MATCH(ATOM_GOLDMONT), X86_MATCH(ATOM_GOLDMONT_D), X86_MATCH(ATOM_GOLDMONT_PLUS), {} }; -static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, - int num_delta_fratio) +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) { int fratio, delta_fratio, found; int err, i; @@ -297,7 +297,7 @@ static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, return true; } -static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) { u64 ratios, counts; u32 group_size; @@ -328,7 +328,7 @@ static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) return false; } -static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { u64 msr; int err; @@ -351,7 +351,7 @@ static bool 
core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) return true; } -static bool intel_set_max_freq_ratio(void) +static bool __init intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; u64 turbo_ratio; @@ -418,40 +418,42 @@ static struct syscore_ops freq_invariance_syscore_ops = { static void register_freq_invariance_syscore_ops(void) { - /* Bail out if registered already. */ - if (freq_invariance_syscore_ops.node.prev) - return; - register_syscore_ops(&freq_invariance_syscore_ops); } #else static inline void register_freq_invariance_syscore_ops(void) {} #endif -void bp_init_freq_invariance(bool cppc_ready) +static void freq_invariance_enable(void) +{ + if (static_branch_unlikely(&arch_scale_freq_key)) { + WARN_ON_ONCE(1); + return; + } + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); +} + +void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { - bool ret; + arch_turbo_freq_ratio = ratio; + arch_set_max_freq_ratio(turbo_disabled); + freq_invariance_enable(); +} +void __init bp_init_freq_invariance(void) +{ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; init_counter_refs(); - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - ret = intel_set_max_freq_ratio(); - else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) - return; - ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); - } + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; - if (ret) { - static_branch_enable(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); - pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); - } else { - pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); - } + if (intel_set_max_freq_ratio()) + freq_invariance_enable(); } void ap_init_freq_invariance(void) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 023feb40f5c0..b1ba7ddfe930 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1396,7 +1396,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - bp_init_freq_invariance(false); + bp_init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { -- cgit From 24620d94a52adc0cafe65dc65bed1d586ca04a6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:56 +0200 Subject: x86/aperfmperf: Put frequency invariance aperf/mperf data into a struct Preparation for sharing code with the CPU frequency portion of the aperf/mperf code. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20220415161206.648485667@linutronix.de --- arch/x86/kernel/cpu/aperfmperf.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index b4f4ea529f2d..6922c77d98d8 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -22,6 +22,13 @@ #include "cpu.h" +struct aperfmperf { + u64 aperf; + u64 mperf; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples); + struct aperfmperf_sample { unsigned int khz; atomic_t scfpending; @@ -194,8 +201,6 @@ unsigned int arch_freq_get_on_cpu(int cpu) DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); -static DEFINE_PER_CPU(u64, arch_prev_aperf); -static DEFINE_PER_CPU(u64, arch_prev_mperf); static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; @@ -407,8 +412,8 @@ static void init_counter_refs(void) rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); } #ifdef CONFIG_PM_SLEEP @@ -474,9 +479,8 @@ DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void arch_scale_freq_tick(void) { - u64 freq_scale; - u64 aperf, mperf; - u64 acnt, mcnt; + struct aperfmperf *s = this_cpu_ptr(&cpu_samples); + u64 aperf, mperf, acnt, mcnt, freq_scale; if (!arch_scale_freq_invariant()) return; @@ -484,11 +488,11 @@ void arch_scale_freq_tick(void) rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); - acnt = aperf - this_cpu_read(arch_prev_aperf); - mcnt = mperf - this_cpu_read(arch_prev_mperf); + acnt = aperf - s->aperf; + mcnt = mperf - s->mperf; - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); + s->aperf = aperf; + s->mperf = mperf; if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; -- cgit From 73a5fa7d51366a549a9f2e3ee875ae51aa0b5580 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:57 +0200 Subject: x86/aperfmperf: Restructure arch_scale_freq_tick() Preparation for sharing code with the CPU frequency portion of the aperf/mperf code. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20220415161206.706185092@linutronix.de --- arch/x86/kernel/cpu/aperfmperf.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 6922c77d98d8..6220503af26a 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -477,22 +477,9 @@ static DECLARE_WORK(disable_freq_invariance_work, DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; -void arch_scale_freq_tick(void) +static void scale_freq_tick(u64 acnt, u64 mcnt) { - struct aperfmperf *s = this_cpu_ptr(&cpu_samples); - u64 aperf, mperf, acnt, mcnt, freq_scale; - - if (!arch_scale_freq_invariant()) - return; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - acnt = aperf - s->aperf; - mcnt = mperf - s->mperf; - - s->aperf = aperf; - s->mperf = mperf; + u64 freq_scale; if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; @@ -514,4 +501,23 @@ error: pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); schedule_work(&disable_freq_invariance_work); } + +void arch_scale_freq_tick(void) +{ + struct aperfmperf *s = this_cpu_ptr(&cpu_samples); + u64 acnt, mcnt, aperf, mperf; + + if (!arch_scale_freq_invariant()) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + acnt = aperf - s->aperf; + mcnt = mperf - s->mperf; + + s->aperf = aperf; + s->mperf = mperf; + + scale_freq_tick(acnt, mcnt); +} #endif /* CONFIG_X86_64 && CONFIG_SMP */ -- cgit From bb6e89df9028b2fab0ce6ac71cd9ef25b6ada32d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:59 +0200 Subject: x86/aperfmperf: Make parts of the frequency invariance code unconditional The frequency invariance support is currently limited to x86/64 and SMP, which is the vast majority of machines. arch_scale_freq_tick() is called every tick on all CPUs and reads the APERF and MPERF MSRs. The CPU frequency getters function do the same via dedicated IPIs. While it could be argued that on systems where frequency invariance support is disabled (32bit, !SMP) the per tick read of the APERF and MPERF MSRs can be avoided, it does not make sense to keep the extra code and the resulting runtime issues of mass IPIs around. As a first step split out the non frequency invariance specific initialization code and the read MSR portion of arch_scale_freq_tick(). The rest of the code is still conditional and guarded with a static key. Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20220415161206.761988704@linutronix.de --- arch/x86/include/asm/cpu.h | 2 ++ arch/x86/include/asm/topology.h | 4 --- arch/x86/kernel/cpu/aperfmperf.c | 63 ++++++++++++++++++++++++---------------- arch/x86/kernel/smpboot.c | 3 +- 4 files changed, 41 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..e89772dc17f1 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -36,6 +36,8 @@ extern int _debug_hotplug_cpu(int cpu, int action); #endif #endif +extern void ap_init_aperfmperf(void); + int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index cc317077e73e..1b2553dd3c64 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -217,13 +217,9 @@ extern void arch_scale_freq_tick(void); extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); -extern void bp_init_freq_invariance(void); -extern void ap_init_freq_invariance(void); #else static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } -static inline void bp_init_freq_invariance(void) { } -static inline void ap_init_freq_invariance(void) { } #endif #ifdef CONFIG_ACPI_CPPC_LIB diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 6220503af26a..df528a4f6de3 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -164,6 +165,17 @@ unsigned int arch_freq_get_on_cpu(int cpu) return per_cpu(samples.khz, cpu); } +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* * APERF/MPERF frequency ratio computation. 
@@ -405,17 +417,6 @@ out: return true; } -static void init_counter_refs(void) -{ - u64 aperf, mperf; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - this_cpu_write(cpu_samples.aperf, aperf); - this_cpu_write(cpu_samples.mperf, mperf); -} - #ifdef CONFIG_PM_SLEEP static struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, @@ -447,13 +448,8 @@ void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) freq_invariance_enable(); } -void __init bp_init_freq_invariance(void) +static void __init bp_init_freq_invariance(void) { - if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - return; - - init_counter_refs(); - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; @@ -461,12 +457,6 @@ void __init bp_init_freq_invariance(void) freq_invariance_enable(); } -void ap_init_freq_invariance(void) -{ - if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - init_counter_refs(); -} - static void disable_freq_invariance_workfn(struct work_struct *work) { static_branch_disable(&arch_scale_freq_key); @@ -481,6 +471,9 @@ static void scale_freq_tick(u64 acnt, u64 mcnt) { u64 freq_scale; + if (!arch_scale_freq_invariant()) + return; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; @@ -501,13 +494,17 @@ error: pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); schedule_work(&disable_freq_invariance_work); } +#else +static inline void bp_init_freq_invariance(void) { } +static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } +#endif /* CONFIG_X86_64 && CONFIG_SMP */ void arch_scale_freq_tick(void) { struct aperfmperf *s = this_cpu_ptr(&cpu_samples); u64 acnt, mcnt, aperf, mperf; - if (!arch_scale_freq_invariant()) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; rdmsrl(MSR_IA32_APERF, aperf); @@ -520,4 +517,20 @@ void arch_scale_freq_tick(void) scale_freq_tick(acnt, mcnt); } -#endif /* CONFIG_X86_64 && CONFIG_SMP */ + +static int __init bp_init_aperfmperf(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; + + init_counter_refs(); + bp_init_freq_invariance(); + return 0; +} +early_initcall(bp_init_aperfmperf); + +void ap_init_aperfmperf(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b1ba7ddfe930..eb7de776a2a6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -186,7 +186,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - ap_init_freq_invariance(); + ap_init_aperfmperf(); /* * Get our bogomips. @@ -1396,7 +1396,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - bp_init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { -- cgit From cd8c0e142daf9de9ce594e61b75509b0af7bfb26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:20:01 +0200 Subject: x86/aperfmperf: Store aperf/mperf data for cpu frequency reads Now that the MSR readout is unconditional, store the results in the per CPU data structure along with a jiffies timestamp for the CPU frequency readout code. Signed-off-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20220415161206.817702355@linutronix.de --- arch/x86/kernel/cpu/aperfmperf.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index df528a4f6de3..963c0697a92b 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -24,11 +24,17 @@ #include "cpu.h" struct aperfmperf { + seqcount_t seq; + unsigned long last_update; + u64 acnt; + u64 mcnt; u64 aperf; u64 mperf; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { + .seq = SEQCNT_ZERO(cpu_samples.seq) +}; struct aperfmperf_sample { unsigned int khz; @@ -515,6 +521,12 @@ void arch_scale_freq_tick(void) s->aperf = aperf; s->mperf = mperf; + raw_write_seqcount_begin(&s->seq); + s->last_update = jiffies; + s->acnt = acnt; + s->mcnt = mcnt; + raw_write_seqcount_end(&s->seq); + scale_freq_tick(acnt, mcnt); } -- cgit From 7d84c1ebf9ddafca27b481e6da7d24a023dacaa2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:20:02 +0200 Subject: x86/aperfmperf: Replace aperfmperf_get_khz() The frequency invariance infrastructure provides the APERF/MPERF samples already. Utilize them for the cpu frequency display in /proc/cpuinfo. The sample is considered valid for 20ms. So for idle or isolated NOHZ full CPUs the function returns 0, which is matching the previous behaviour. This gets rid of the mass IPIs and a delay of 20ms for stabilizing observed by Eric when reading /proc/cpuinfo. Reported-by: Eric Dumazet Signed-off-by: Thomas Gleixner Tested-by: Eric Dumazet Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220415161206.875029458@linutronix.de --- arch/x86/kernel/cpu/aperfmperf.c | 77 ++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 963c0697a92b..e9d2da7effc9 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -101,49 +101,6 @@ static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; } -unsigned int aperfmperf_get_khz(int cpu) -{ - if (!cpu_khz) - return 0; - - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; - - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - return 0; - - if (rcu_is_idle_cpu(cpu)) - return 0; /* Idle CPUs are completely uninteresting. */ - - aperfmperf_snapshot_cpu(cpu, ktime_get(), true); - return per_cpu(samples.khz, cpu); -} - -void arch_freq_prepare_all(void) -{ - ktime_t now = ktime_get(); - bool wait = false; - int cpu; - - if (!cpu_khz) - return; - - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; - - for_each_online_cpu(cpu) { - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - continue; - if (rcu_is_idle_cpu(cpu)) - continue; /* Idle CPUs are completely uninteresting. */ - if (!aperfmperf_snapshot_cpu(cpu, now, false)) - wait = true; - } - - if (wait) - msleep(APERFMPERF_REFRESH_DELAY_MS); -} - unsigned int arch_freq_get_on_cpu(int cpu) { struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); @@ -530,6 +487,40 @@ void arch_scale_freq_tick(void) scale_freq_tick(acnt, mcnt); } +/* + * Discard samples older than the define maximum sample age of 20ms. 
There + * is no point in sending IPIs in such a case. If the scheduler tick was + * not running then the CPU is either idle or isolated. + */ +#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) + +unsigned int aperfmperf_get_khz(int cpu) +{ + struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); + unsigned long last; + unsigned int seq; + u64 acnt, mcnt; + + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; + + do { + seq = raw_read_seqcount_begin(&s->seq); + last = s->last_update; + acnt = s->acnt; + mcnt = s->mcnt; + } while (read_seqcount_retry(&s->seq, seq)); + + /* + * Bail on invalid count and when the last update was too long ago, + * which covers idle and NOHZ full CPUs. + */ + if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) + return 0; + + return div64_u64((cpu_khz * acnt), mcnt); +} + static int __init bp_init_aperfmperf(void) { if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) -- cgit From f3eca381bd49d708073ba1a9af4fa6ea5d5810a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:20:04 +0200 Subject: x86/aperfmperf: Replace arch_freq_get_on_cpu() Reading the current CPU frequency from /sys/..../scaling_cur_freq involves in the worst case two IPIs due to the ad hoc sampling. The frequency invariance infrastructure provides the APERF/MPERF samples already. Utilize them and consolidate this with the /proc/cpuinfo readout. The sample is considered valid for 20ms. So for idle or isolated NOHZ full CPUs the function returns 0, which is matching the previous behaviour. The resulting text size vs. the original APERF/MPERF plus the separate frequency invariance code: text: 2411 -> 723 init.text: 0 -> 767 Signed-off-by: Thomas Gleixner Tested-by: Eric Dumazet Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220415161206.934040006@linutronix.de --- arch/x86/kernel/cpu/aperfmperf.c | 94 +--------------------------------------- arch/x86/kernel/cpu/proc.c | 2 +- 2 files changed, 2 insertions(+), 94 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index e9d2da7effc9..b15c884c4cbf 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -36,98 +36,6 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; -struct aperfmperf_sample { - unsigned int khz; - atomic_t scfpending; - ktime_t time; - u64 aperf; - u64 mperf; -}; - -static DEFINE_PER_CPU(struct aperfmperf_sample, samples); - -#define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 10 -#define APERFMPERF_STALE_THRESHOLD_MS 1000 - -/* - * aperfmperf_snapshot_khz() - * On the current CPU, snapshot APERF, MPERF, and jiffies - * unless we already did it within 10ms - * calculate kHz, save snapshot - */ -static void aperfmperf_snapshot_khz(void *dummy) -{ - u64 aperf, aperf_delta; - u64 mperf, mperf_delta; - struct aperfmperf_sample *s = this_cpu_ptr(&samples); - unsigned long flags; - - local_irq_save(flags); - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - local_irq_restore(flags); - - aperf_delta = aperf - s->aperf; - mperf_delta = mperf - s->mperf; - - /* - * There is no architectural guarantee that MPERF - * increments faster than we can read it. 
- */ - if (mperf_delta == 0) - return; - - s->time = ktime_get(); - s->aperf = aperf; - s->mperf = mperf; - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); - atomic_set_release(&s->scfpending, 0); -} - -static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) -{ - s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); - - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return true; - - if (!atomic_xchg(&s->scfpending, 1) || wait) - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); - - /* Return false if the previous iteration was too long ago. */ - return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; -} - -unsigned int arch_freq_get_on_cpu(int cpu) -{ - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); - - if (!cpu_khz) - return 0; - - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; - - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - return 0; - - if (rcu_is_idle_cpu(cpu)) - return 0; - - if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) - return per_cpu(samples.khz, cpu); - - msleep(APERFMPERF_REFRESH_DELAY_MS); - atomic_set(&s->scfpending, 1); - smp_mb(); /* ->scfpending before smp_call_function_single(). */ - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); - - return per_cpu(samples.khz, cpu); -} - static void init_counter_refs(void) { u64 aperf, mperf; @@ -494,7 +402,7 @@ void arch_scale_freq_tick(void) */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) -unsigned int aperfmperf_get_khz(int cpu) +unsigned int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned long last; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 4eec8889b0ff..0a0ee55830c7 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -84,7 +84,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = aperfmperf_get_khz(cpu); + unsigned int freq = arch_freq_get_on_cpu(cpu); if (!freq) freq = cpufreq_quick_get(cpu); -- cgit From fb4c77c21aba03677f283acda3cae748ef866abf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 25 Apr 2022 17:45:42 +0200 Subject: x86/aperfmperf: Integrate the fallback code from show_cpuinfo() Due to the avoidance of IPIs to idle CPUs arch_freq_get_on_cpu() can return 0 when the last sample was too long ago. show_cpuinfo() has a fallback to cpufreq_quick_get() and if that fails to return cpu_khz, but the readout code for the per CPU scaling frequency in sysfs does not. Move that fallback into arch_freq_get_on_cpu() so the behaviour is the same when reading /proc/cpuinfo and /sys/..../cur_scaling_freq. Suggested-by: "Rafael J. 
Wysocki" Signed-off-by: Thomas Gleixner Tested-by: Doug Smythies Link: https://lore.kernel.org/r/87pml5180p.ffs@tglx --- arch/x86/kernel/cpu/aperfmperf.c | 10 +++++++--- arch/x86/kernel/cpu/proc.c | 7 +------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index b15c884c4cbf..1f60a2b27936 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -405,12 +405,12 @@ void arch_scale_freq_tick(void) unsigned int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); + unsigned int seq, freq; unsigned long last; - unsigned int seq; u64 acnt, mcnt; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - return 0; + goto fallback; do { seq = raw_read_seqcount_begin(&s->seq); @@ -424,9 +424,13 @@ unsigned int arch_freq_get_on_cpu(int cpu) * which covers idle and NOHZ full CPUs. */ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) - return 0; + goto fallback; return div64_u64((cpu_khz * acnt), mcnt); + +fallback: + freq = cpufreq_quick_get(cpu); + return freq ? freq : cpu_khz; } static int __init bp_init_aperfmperf(void) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 0a0ee55830c7..099b6f0d96bd 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -86,12 +86,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = arch_freq_get_on_cpu(cpu); - if (!freq) - freq = cpufreq_quick_get(cpu); - if (!freq) - freq = cpu_khz; - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ -- cgit From 830fe3c30dffe0b9f9485772070c29fcd8c2473d Mon Sep 17 00:00:00 2001 From: Suma Hegde Date: Wed, 27 Apr 2022 20:52:48 +0530 Subject: amd_hsmp: Add HSMP protocol version 5 messages HSMP protocol version 5 is supported on AMD family 19h model 10h EPYC processors. 
This version brings new features such as -- DIMM statistics -- Bandwidth for IO and xGMI links -- Monitor socket and core frequency limits -- Configure power efficiency modes, DF pstate range etc Signed-off-by: Suma Hegde Reviewed-by: Carlos Bilbao Signed-off-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20220427152248.25643-1-nchatrad@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- arch/x86/include/uapi/asm/amd_hsmp.h | 114 +++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h index 7ee7ba0d63a3..769b939444ae 100644 --- a/arch/x86/include/uapi/asm/amd_hsmp.h +++ b/arch/x86/include/uapi/asm/amd_hsmp.h @@ -31,9 +31,22 @@ enum hsmp_message_ids { HSMP_GET_CCLK_THROTTLE_LIMIT, /* 10h Get CCLK frequency limit in socket */ HSMP_GET_C0_PERCENT, /* 11h Get average C0 residency in socket */ HSMP_SET_NBIO_DPM_LEVEL, /* 12h Set max/min LCLK DPM Level for a given NBIO */ - /* 13h Reserved */ - HSMP_GET_DDR_BANDWIDTH = 0x14, /* 14h Get theoretical maximum and current DDR Bandwidth */ - HSMP_GET_TEMP_MONITOR, /* 15h Get per-DIMM temperature and refresh rates */ + HSMP_GET_NBIO_DPM_LEVEL, /* 13h Get LCLK DPM level min and max for a given NBIO */ + HSMP_GET_DDR_BANDWIDTH, /* 14h Get theoretical maximum and current DDR Bandwidth */ + HSMP_GET_TEMP_MONITOR, /* 15h Get socket temperature */ + HSMP_GET_DIMM_TEMP_RANGE, /* 16h Get per-DIMM temperature range and refresh rate */ + HSMP_GET_DIMM_POWER, /* 17h Get per-DIMM power consumption */ + HSMP_GET_DIMM_THERMAL, /* 18h Get per-DIMM thermal sensors */ + HSMP_GET_SOCKET_FREQ_LIMIT, /* 19h Get current active frequency per socket */ + HSMP_GET_CCLK_CORE_LIMIT, /* 1Ah Get CCLK frequency limit per core */ + HSMP_GET_RAILS_SVI, /* 1Bh Get SVI-based Telemetry for all rails */ + HSMP_GET_SOCKET_FMAX_FMIN, /* 1Ch Get Fmax and Fmin per socket */ + HSMP_GET_IOLINK_BANDWITH, /* 1Dh Get current bandwidth on IO Link */ + HSMP_GET_XGMI_BANDWITH, /* 1Eh Get current bandwidth on xGMI Link */ + HSMP_SET_GMI3_WIDTH, /* 1Fh Set max and min GMI3 Link width */ + HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */ + HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */ + HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */ HSMP_MSG_ID_MAX, }; @@ -175,8 +188,12 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { */ {1, 0, HSMP_SET}, - /* RESERVED message */ - {0, 0, HSMP_RSVD}, + /* + * HSMP_GET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 1 + * input: args[0] = nbioid[23:16] + * output: args[0] = max dpm level[15:8] + min dpm level[7:0] + */ + {1, 1, HSMP_GET}, /* * HSMP_GET_DDR_BANDWIDTH, num_args = 0, response_sz = 1 @@ -191,6 +208,93 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { * [7:5] fractional part */ {0, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_TEMP_RANGE, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = refresh rate[3] + temperature range[2:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_POWER, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = DIMM power in mW[31:17] + update rate in ms[16:8] + + * DIMM address[7:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] + + * DIMM 
address[7:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_SOCKET_FREQ_LIMIT, num_args = 0, response_sz = 1 + * output: args[0] = frequency in MHz[31:16] + frequency source[15:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_CCLK_CORE_LIMIT, num_args = 1, response_sz = 1 + * input: args[0] = apic id [31:0] + * output: args[0] = frequency in MHz[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_RAILS_SVI, num_args = 0, response_sz = 1 + * output: args[0] = power in mW[31:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_SOCKET_FMAX_FMIN, num_args = 0, response_sz = 1 + * output: args[0] = fmax in MHz[31:16] + fmin in MHz[15:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_IOLINK_BANDWITH, num_args = 1, response_sz = 1 + * input: args[0] = link id[15:8] + bw type[2:0] + * output: args[0] = io bandwidth in Mbps[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_XGMI_BANDWITH, num_args = 1, response_sz = 1 + * input: args[0] = link id[15:8] + bw type[2:0] + * output: args[0] = xgmi bandwidth in Mbps[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_SET_GMI3_WIDTH, num_args = 1, response_sz = 0 + * input: args[0] = min link width[15:8] + max link width[7:0] + */ + {1, 0, HSMP_SET}, + + /* + * HSMP_SET_PCI_RATE, num_args = 1, response_sz = 1 + * input: args[0] = link rate control value + * output: args[0] = previous link rate control value + */ + {1, 1, HSMP_SET}, + + /* + * HSMP_SET_POWER_MODE, num_args = 1, response_sz = 0 + * input: args[0] = power efficiency mode[2:0] + */ + {1, 0, HSMP_SET}, + + /* + * HSMP_SET_PSTATE_MAX_MIN, num_args = 1, response_sz = 0 + * input: args[0] = min df pstate[15:8] + max df pstate[7:0] + */ + {1, 0, HSMP_SET}, }; /* Reset to default packing */ -- cgit From 2e4ec02bbcc05b8905d65c763ebde6bc85508e90 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:03 -0700 Subject: mm: hugetlb_vmemmap: introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP The feature of minimizing overhead of struct page associated with each HugeTLB page is implemented on x86_64, however, the infrastructure of this feature is already there, we could easily enable it for other architectures. Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP for other architectures to be easily enabled. Just select this config if they want to enable this feature. Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Suggested-by: Andrew Morton Reviewed-by: Barry Song Tested-by: Barry Song Reviewed-by: Anshuman Khandual Cc: Bodeddula Balasubramaniam Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Fam Zheng Cc: James Morse Cc: Mark Rutland Cc: Mike Kravetz Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..ae4e1a79a1e2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -121,6 +121,7 @@ config X86 select ARCH_WANTS_NO_INSTR select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH -- cgit From e10cd4b00904db127b178859d81f6b5d05d16c67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: x86/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This defines and exports a platform specific custom vm_get_page_prot() via subscribing ARCH_HAS_VM_GET_PAGE_PROT. 
This also unsubscribes from config ARCH_HAS_FILTER_PGPROT, after dropping off arch_filter_pgprot() and arch_vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-6-anshuman.khandual@arm.com Signed-off-by: Christoph Hellwig Signed-off-by: Anshuman Khandual Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. Miller Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/include/uapi/asm/mman.h | 14 -------------- arch/x86/mm/Makefile | 2 +- arch/x86/mm/pgprot.c | 35 +++++++++++++++++++++++++++++++++++ 5 files changed, 37 insertions(+), 21 deletions(-) create mode 100644 arch/x86/mm/pgprot.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ae4e1a79a1e2..5b77b4301d36 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -76,7 +76,6 @@ config X86 select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER - select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 @@ -95,6 +94,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 62ab07e24aef..3563f4645fa1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -649,11 +649,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -static inline pgprot_t arch_filter_pgprot(pgprot_t prot) -{ - return canon_pgprot(prot); -} - static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index d4a8d0424bfb..775dbd3aff73 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -5,20 +5,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -/* - * Take the 4 protection key bits out of the vma->vm_flags - * value and turn them in to the bits that we can put in - * to a pte. - * - * Only override these if Protection Keys are available - * (which is only on 64-bit). - */ -#define arch_vm_get_page_prot(vm_flags) __pgprot( \ - ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \ - ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \ - ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \ - ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0)) - #define arch_calc_vm_prot_bits(prot, key) ( \ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ ((key) & 0x2 ? 
VM_PKEY_BIT1 : 0) | \ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fe3d3061fc11..fb6b41a48ae5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,7 +20,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ - pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o + pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o pgprot.o obj-y += pat/ diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c new file mode 100644 index 000000000000..763742782286 --- /dev/null +++ b/arch/x86/mm/pgprot.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long val = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* + * Take the 4 protection key bits out of the vma->vm_flags value and + * turn them in to the bits that we can put in to a pte. + * + * Only override these if Protection Keys are available (which is only + * on 64-bit). + */ + if (vm_flags & VM_PKEY_BIT0) + val |= _PAGE_PKEY_BIT0; + if (vm_flags & VM_PKEY_BIT1) + val |= _PAGE_PKEY_BIT1; + if (vm_flags & VM_PKEY_BIT2) + val |= _PAGE_PKEY_BIT2; + if (vm_flags & VM_PKEY_BIT3) + val |= _PAGE_PKEY_BIT3; +#endif + + val = __sme_set(val); + if (val & _PAGE_PRESENT) + val &= __supported_pte_mask; + return __pgprot(val); +} +EXPORT_SYMBOL(vm_get_page_prot); -- cgit From 47010c040dec8af6347ec6259104fc13f7e7e30a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP* The word "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keyword to "optimize". In this patch, clean up configs to make code more expressive. 
Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 2 +- arch/x86/mm/init_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5b77b4301d36..da07c8087e5a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -121,7 +121,7 @@ config X86 select ARCH_WANTS_NO_INSTR select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 96d34ebb20a9..af34dd1510d6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1269,7 +1269,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i; for_each_online_node(i) -- cgit From e999995c84c3abb6dcae83f8c35942a8d4ee0451 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 21 Apr 2022 00:00:05 +0800 Subject: ftrace: cleanup ftrace_graph_caller enable and disable The ftrace_[enable,disable]_ftrace_graph_caller() are used to do special hooks for graph tracer, which are not needed on some ARCHs that use graph_ops:func function to install return_hooker. So introduce the weak version in ftrace core code to cleanup in x86. Signed-off-by: Chengming Zhou Acked-by: Steven Rostedt (Google) Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20220420160006.17880-1-zhouchengming@bytedance.com Signed-off-by: Catalin Marinas --- arch/x86/kernel/ftrace.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1e31c7d21597..b09d73c2ba89 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -579,9 +579,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) extern void ftrace_graph_call(void); static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { @@ -610,18 +608,7 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, &ftrace_stub); } -#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -int ftrace_enable_ftrace_graph_caller(void) -{ - return 0; -} - -int ftrace_disable_ftrace_graph_caller(void) -{ - return 0; -} -#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -#endif /* !CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ /* * Hook the return address and push it in the stack of return addrs -- cgit From 5d8de293c224896a4da99763fce4f9794308caf4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: vmcore: convert copy_oldmem_page() to take an iov_iter Patch series "Convert vmcore to use an iov_iter", v5. For some reason several people have been sending bad patches to fix compiler warnings in vmcore recently. Here's how it should be done. Compile-tested only on x86. 
As noted in the first patch, s390 should take this conversion a bit further, but I'm not inclined to do that work myself. This patch (of 3): Instead of passing in a 'buf' and 'userbuf' argument, pass in an iov_iter. s390 needs more work to pass the iov_iter down further, or refactor, but I'd be more comfortable if someone who can test on s390 did that work. It's more convenient to convert the whole of read_from_oldmem() to take an iov_iter at the same time, so rename it to read_from_oldmem_iter() and add a temporary read_from_oldmem() wrapper that creates an iov_iter. Link: https://lkml.kernel.org/r/20220408090636.560886-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220408090636.560886-2-bhe@redhat.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Heiko Carstens Signed-off-by: Andrew Morton --- arch/x86/kernel/crash_dump_32.c | 29 ++++------------------------- arch/x86/kernel/crash_dump_64.c | 41 +++++++++++------------------------------ 2 files changed, 15 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 5fcac46aaf6b..5f4ae5476e19 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,8 +10,7 @@ #include #include #include - -#include +#include static inline bool is_crashed_pfn_valid(unsigned long pfn) { @@ -29,21 +28,8 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) #endif } -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there might be no pte mapped - * in the current kernel. 
- */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { void *vaddr; @@ -54,14 +40,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, return -EFAULT; vaddr = kmap_local_pfn(pfn); - - if (!userbuf) { - memcpy(buf, vaddr + offset, csize); - } else { - if (copy_to_user(buf, vaddr + offset, csize)) - csize = -EFAULT; - } - + csize = copy_to_iter(vaddr + offset, csize, iter); kunmap_local(vaddr); return csize; diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 97529552dd24..94fe4aff9694 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -8,12 +8,12 @@ #include #include -#include +#include #include #include -static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf, +static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset, bool encrypted) { void *vaddr; @@ -29,46 +29,27 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { - iounmap((void __iomem *)vaddr); - return -EFAULT; - } - } else - memcpy(buf, vaddr + offset, csize); + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap((void __iomem *)vaddr); return csize; } -/** - * copy_oldmem_page - copy one page of memory - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from the old kernel's memory. For this page, there is no pte - * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); + return __copy_oldmem_page(iter, pfn, csize, offset, false); } -/** +/* * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the * memory with the encryption mask set to accommodate kdump on SME-enabled * machines. */ -ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); + return __copy_oldmem_page(iter, pfn, csize, offset, true); } ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) -- cgit From e0690479917cbce740eef51fa3de92c69647a5ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: vmcore: convert read_from_oldmem() to take an iov_iter Remove the read_from_oldmem() wrapper introduced earlier and convert all the remaining callers to pass an iov_iter. 
Link: https://lkml.kernel.org/r/20220408090636.560886-4-bhe@redhat.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Tiezhu Yang Cc: Amit Daniel Kachhap Cc: Al Viro Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- arch/x86/kernel/crash_dump_64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 94fe4aff9694..e75bc2f217ff 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -54,6 +54,11 @@ ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } -- cgit From 6308499b5e99c0c903fde2c605e41d9a86c4be6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: net: unexport csum_and_copy_{from,to}_user csum_and_copy_from_user and csum_and_copy_to_user are exported by a few architectures, but not actually used in modular code. Drop the exports. Link: https://lkml.kernel.org/r/20220421070440.1282704-1-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Jakub Kicinski Acked-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Acked-by: Michael Ellerman (powerpc) Cc: David Miller Signed-off-by: Andrew Morton --- arch/x86/lib/csum-wrappers_64.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 189344924a2b..145f9a0bde29 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -32,7 +32,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) user_access_end(); return sum; } -EXPORT_SYMBOL(csum_and_copy_from_user); /** * csum_and_copy_to_user - Copy and checksum to user space. @@ -57,7 +56,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len) user_access_end(); return sum; } -EXPORT_SYMBOL(csum_and_copy_to_user); /** * csum_partial_copy_nocheck - Copy and checksum. -- cgit From 7a116a2dd32d96869f0f93bac00b900859ba0434 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Sat, 16 Apr 2022 02:45:32 +0000 Subject: x86/apic: Do apic driver probe for "nosmp" use case For the "nosmp" use case, the APIC initialization code selects "APIC_SYMMETRIC_IO_NO_ROUTING" as the default interrupt mode and avoids probing APIC drivers. This works well for the default APIC modes, but for the x2APIC case the probe function is required to allocate the cluster_hotplug mask. So in the APIC_SYMMETRIC_IO_NO_ROUTING case when the x2APIC is initialized it dereferences a NULL pointer and the kernel crashes. This was observed on a TDX platform where x2APIC is enabled and "nosmp" command line option is allowed. To fix this issue, probe APIC drivers via default_setup_apic_routing() for the APIC_SYMMETRIC_IO_NO_ROUTING interrupt mode too. Suggested-by: Kirill A. Shutemov Suggested-by: Rafael J. 
Wysocki Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/a64f864e1114bcd63593286aaf61142cfce384ea.1650076869.git.sathyanarayanan.kuppuswamy@intel.com --- arch/x86/kernel/apic/apic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 13819bfa8dde..25e92d7a7ae5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1428,22 +1428,21 @@ void __init apic_intr_mode_init(void) return; case APIC_VIRTUAL_WIRE: pr_info("APIC: Switch to virtual wire mode setup\n"); - default_setup_apic_routing(); break; case APIC_VIRTUAL_WIRE_NO_CONFIG: pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); upmode = true; - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO: pr_info("APIC: Switch to symmetric I/O mode setup\n"); - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO_NO_ROUTING: pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); break; } + default_setup_apic_routing(); + if (x86_platform.apic_post_init) x86_platform.apic_post_init(); -- cgit From 1ff2fb982c52ed6c3478adc944441d6ea065d8fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 May 2022 09:01:55 +0200 Subject: x86/aperfperf: Make it correct on 32bit and UP kernels The utilization of arch_scale_freq_tick() for CPU frequency readouts is incomplete as it failed to move the function prototype and the define out of the CONFIG_SMP && CONFIG_X86_64 #ifdef. Make them unconditionally available. Fixes: bb6e89df9028 ("x86/aperfmperf: Make parts of the frequency invariance code unconditional") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/lkml/202205010106.06xRBR2C-lkp@intel.com --- arch/x86/include/asm/topology.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 1b2553dd3c64..458c891a8273 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -212,9 +212,6 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity -extern void arch_scale_freq_tick(void); -#define arch_scale_freq_tick arch_scale_freq_tick - extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else @@ -222,6 +219,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif +extern void arch_scale_freq_tick(void); +#define arch_scale_freq_tick arch_scale_freq_tick + #ifdef CONFIG_ACPI_CPPC_LIB void init_freq_invariance_cppc(void); #define arch_init_invariance_cppc init_freq_invariance_cppc -- cgit From b91c0922bf1ed15b67a6faa404bc64e3ed532ec2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 May 2022 09:20:42 +0200 Subject: x86/fpu: Cleanup variable shadowing Addresses: warning: Local variable 'mask' shadows outer variable Remove extra variable declaration and switch the bit mask assignment to use BIT_ULL() while at it. 
Fixes: 522e92743b35 ("x86/fpu: Deduplicate copy_uabi_from_user/kernel_to_xstate()") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/lkml/202204262032.jFYKit5j-lkp@intel.com --- arch/x86/kernel/fpu/xstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 31c12f4d0770..81fcd04247de 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1233,7 +1233,7 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, } for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); -- cgit From fe06a0c09b47ea58ba2b01c8941af16bd45c02df Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 23 Jan 2022 10:38:37 -0800 Subject: KVM: x86: replace bitmap_weight with bitmap_empty where appropriate In some places kvm/hyperv.c code calls bitmap_weight() to check if any bit of a given bitmap is set. It's better to use bitmap_empty() in that case because bitmap_empty() stops traversing the bitmap as soon as it finds first set bit, while bitmap_weight() counts all bits unconditionally. Signed-off-by: Yury Norov Reviewed-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Reviewed-by: Andy Shevchenko --- arch/x86/kvm/hyperv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 123b677111c5..c964923a7684 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -90,7 +90,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); - int auto_eoi_old, auto_eoi_new; + bool auto_eoi_old, auto_eoi_new; if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@ -100,16 +100,16 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else __clear_bit(vector, synic->vec_bitmap); - auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256); + auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); - auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256); + auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256); - if (!!auto_eoi_old == !!auto_eoi_new) + if (auto_eoi_old == auto_eoi_new) return; if (!enable_apicv) -- cgit From ab65f49253ff706723ecbf87af74e9383b5e4582 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 2 May 2022 17:33:40 +0200 Subject: x86/sev: Fix address space sparse warning Fix: arch/x86/kernel/sev.c:605:16: warning: incorrect type in assignment (different address spaces) arch/x86/kernel/sev.c:605:16: expected struct snp_secrets_page_layout *layout arch/x86/kernel/sev.c:605:16: got void [noderef] __iomem *[assigned] mem Reported-by: kernel test robot Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/202205022233.XgNDR7WR-lkp@intel.com --- arch/x86/kernel/sev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 166375084b1f..c05f0124c410 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -589,20 +589,23 @@ static u64 __init get_secrets_page(void) static u64 __init get_snp_jump_table_addr(void) { struct snp_secrets_page_layout *layout; + 
void __iomem *mem; u64 pa, addr; pa = get_secrets_page(); if (!pa) return 0; - layout = (__force void *)ioremap_encrypted(pa, PAGE_SIZE); - if (!layout) { + mem = ioremap_encrypted(pa, PAGE_SIZE); + if (!mem) { pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); return 0; } + layout = (__force struct snp_secrets_page_layout *)mem; + addr = layout->os_area.ap_jump_table_pa; - iounmap(layout); + iounmap(mem); return addr; } -- cgit From 0aca53c6b522f8d6e2681ca875acbbe105f5fdcf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 21 Apr 2022 22:10:48 +0800 Subject: x86/traps: Use pt_regs directly in fixup_bad_iret() Always stash the address error_entry() is going to return to, in %r12 and get rid of the void *error_entry_ret; slot in struct bad_iret_stack which was supposed to account for it and pt_regs pushed on the stack. After this, both fixup_bad_iret() and sync_regs() can work on a struct pt_regs pointer directly. [ bp: Rewrite commit message, touch ups. ] Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220503032107.680190-2-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 5 ++++- arch/x86/include/asm/traps.h | 2 +- arch/x86/kernel/traps.c | 19 +++++++------------ 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 73d958522b6a..ecbfca3cc18c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1061,9 +1061,12 @@ SYM_CODE_START_LOCAL(error_entry) * Pretend that the exception came from user mode: set up pt_regs * as if we faulted immediately after IRET. */ - mov %rsp, %rdi + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ call fixup_bad_iret mov %rax, %rsp + ENCODE_FRAME_POINTER + pushq %r12 jmp .Lerror_entry_from_usermode_after_swapgs SYM_CODE_END(error_entry) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 35317c5c551d..47ecfff2c83d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,7 +13,7 @@ #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); +struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1563fb995005..4167215333fd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -892,14 +892,10 @@ sync: } #endif -struct bad_iret_stack { - void *error_entry_ret; - struct pt_regs regs; -}; - -asmlinkage __visible noinstr -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { + struct pt_regs tmp, *new_stack; + /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault @@ -908,19 +904,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. 
*/ - struct bad_iret_stack tmp, *new_stack = - (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ - __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); + __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); - BUG_ON(!user_mode(&new_stack->regs)); + BUG_ON(!user_mode(new_stack)); return new_stack; } #endif -- cgit From 520a7e80c96d655fbe4650d9cc985bd9d0443389 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 21 Apr 2022 22:10:49 +0800 Subject: x86/entry: Switch the stack after error_entry() returns error_entry() calls fixup_bad_iret() before sync_regs() if it is a fault from a bad IRET, to copy pt_regs to the kernel stack. It switches to the kernel stack directly after sync_regs(). But error_entry() itself is also a function call, so it has to stash the address it is going to return to, in %r12 which is unnecessarily complicated. Move the stack switching after error_entry() and get rid of the need to handle the return address. [ bp: Massage commit message. ] Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220503032107.680190-3-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ecbfca3cc18c..ca3e99e08a44 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -326,6 +326,8 @@ SYM_CODE_END(ret_from_fork) .macro idtentry_body cfunc has_error_code:req call error_entry + movq %rax, %rsp /* switch to the task stack if from userspace */ + ENCODE_FRAME_POINTER UNWIND_HINT_REGS movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ @@ -1002,14 +1004,10 @@ SYM_CODE_START_LOCAL(error_entry) /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ .Lerror_entry_from_usermode_after_swapgs: /* Put us onto the real thread stack. */ - popq %r12 /* save return addr in %12 */ - movq %rsp, %rdi /* arg0 = pt_regs pointer */ call sync_regs - movq %rax, %rsp /* switch stack */ - ENCODE_FRAME_POINTER - pushq %r12 RET /* @@ -1041,6 +1039,7 @@ SYM_CODE_START_LOCAL(error_entry) */ .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY + leaq 8(%rsp), %rax /* return pt_regs pointer */ RET .Lbstep_iret: @@ -1061,12 +1060,9 @@ SYM_CODE_START_LOCAL(error_entry) * Pretend that the exception came from user mode: set up pt_regs * as if we faulted immediately after IRET. */ - popq %r12 /* save return addr in %12 */ - movq %rsp, %rdi /* arg0 = pt_regs pointer */ + leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ call fixup_bad_iret - mov %rax, %rsp - ENCODE_FRAME_POINTER - pushq %r12 + mov %rax, %rdi jmp .Lerror_entry_from_usermode_after_swapgs SYM_CODE_END(error_entry) -- cgit From ee774dac0da1543376a69fd90840af6aa86879b3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 21 Apr 2022 22:10:50 +0800 Subject: x86/entry: Move PUSH_AND_CLEAR_REGS out of error_entry() The macro idtentry() (through idtentry_body()) calls error_entry() unconditionally even on XENPV. But XENPV needs to only push and clear regs. 
PUSH_AND_CLEAR_REGS in error_entry() makes the stack not return to its original place when the function returns, which means it is not possible to convert it to a C function. Carve out PUSH_AND_CLEAR_REGS out of error_entry() and into a separate function and call it before error_entry() in order to avoid calling error_entry() on XENPV. It will also allow for error_entry() to be converted to C code that can use inlined sync_regs() and save a function call. [ bp: Massage commit message. ] Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220503032107.680190-4-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ca3e99e08a44..b1cef3b0a7ab 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -318,6 +318,14 @@ SYM_CODE_END(ret_from_fork) #endif .endm +/* Save all registers in pt_regs */ +SYM_CODE_START_LOCAL(push_and_clear_regs) + UNWIND_HINT_FUNC + PUSH_AND_CLEAR_REGS save_ret=1 + ENCODE_FRAME_POINTER 8 + RET +SYM_CODE_END(push_and_clear_regs) + /** * idtentry_body - Macro to emit code calling the C function * @cfunc: C function to be called @@ -325,6 +333,9 @@ SYM_CODE_END(ret_from_fork) */ .macro idtentry_body cfunc has_error_code:req + call push_and_clear_regs + UNWIND_HINT_REGS + call error_entry movq %rax, %rsp /* switch to the task stack if from userspace */ ENCODE_FRAME_POINTER @@ -985,13 +996,11 @@ SYM_CODE_START_LOCAL(paranoid_exit) SYM_CODE_END(paranoid_exit) /* - * Save all registers in pt_regs, and switch GS if needed. + * Switch GS and CR3 if needed. */ SYM_CODE_START_LOCAL(error_entry) UNWIND_HINT_FUNC cld - PUSH_AND_CLEAR_REGS save_ret=1 - ENCODE_FRAME_POINTER 8 testb $3, CS+8(%rsp) jz .Lerror_kernelspace -- cgit From c64cc2802a784ecfd25d39945e57e7a147854a5b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 21 Apr 2022 22:10:51 +0800 Subject: x86/entry: Move CLD to the start of the idtentry macro Move it after CLAC. Suggested-by: Peter Zijlstra Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220503032107.680190-5-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b1cef3b0a7ab..ab6ab6d3dab5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -371,6 +371,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 ENDBR ASM_CLAC + cld .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -439,6 +440,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS ENDBR ASM_CLAC + cld pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -495,6 +497,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS ENDBR ASM_CLAC + cld /* * If the entry is from userspace, switch stacks and treat it as @@ -557,6 +560,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=8 ENDBR ASM_CLAC + cld /* paranoid_entry returns GS information for paranoid_exit in EBX. 
*/ call paranoid_entry @@ -882,7 +886,6 @@ SYM_CODE_END(xen_failsafe_callback) */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC - cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -1000,7 +1003,6 @@ SYM_CODE_END(paranoid_exit) */ SYM_CODE_START_LOCAL(error_entry) UNWIND_HINT_FUNC - cld testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1134,6 +1136,7 @@ SYM_CODE_START(asm_exc_nmi) */ ASM_CLAC + cld /* Use %rdx as our temp variable throughout */ pushq %rdx @@ -1153,7 +1156,6 @@ SYM_CODE_START(asm_exc_nmi) */ swapgs - cld FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx -- cgit From 64cbd0acb58203fb769ed2f4eab526d43e243847 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 3 May 2022 11:21:06 +0800 Subject: x86/entry: Don't call error_entry() for XENPV XENPV guests enter already on the task stack and they can't fault for native_iret() nor native_load_gs_index() since they use their own pvop for IRET and load_gs_index(). A CR3 switch is not needed either. So there is no reason to call error_entry() in XENPV. [ bp: Massage commit message. ] Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220503032107.680190-6-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ab6ab6d3dab5..062aa9d95961 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -336,8 +336,17 @@ SYM_CODE_END(push_and_clear_regs) call push_and_clear_regs UNWIND_HINT_REGS - call error_entry - movq %rax, %rsp /* switch to the task stack if from userspace */ + /* + * Call error_entry() and switch to the task stack if from userspace. + * + * When in XENPV, it is already in the task stack, and it can't fault + * for native_iret() nor native_load_gs_index() since XENPV uses its + * own pvops for IRET and load_gs_index(). And it doesn't need to + * switch the CR3. So it can skip invoking error_entry(). + */ + ALTERNATIVE "call error_entry; movq %rax, %rsp", \ + "", X86_FEATURE_XENPV + ENCODE_FRAME_POINTER UNWIND_HINT_REGS -- cgit From c89191ce67efa4e5353db6a67f7287c28e673740 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 3 May 2022 11:21:07 +0800 Subject: x86/entry: Convert SWAPGS to swapgs and remove the definition of SWAPGS XENPV doesn't use swapgs_restore_regs_and_return_to_usermode(), error_entry() and the code between entry_SYSENTER_compat() and entry_SYSENTER_compat_after_hwframe. Change the PV-compatible SWAPGS to the ASM instruction swapgs in these places. Also remove the definition of SWAPGS since no more users. Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220503032107.680190-7-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 6 +++--- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/include/asm/irqflags.h | 8 -------- 3 files changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 062aa9d95961..312186612f4e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1019,7 +1019,7 @@ SYM_CODE_START_LOCAL(error_entry) * We entered from user mode or we're pretending to have entered * from user mode due to an IRET fault. */ - SWAPGS + swapgs FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. 
*/ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax @@ -1051,7 +1051,7 @@ SYM_CODE_START_LOCAL(error_entry) * gsbase and proceed. We'll fix up the exception and land in * .Lgs_change's error handler with kernel gsbase. */ - SWAPGS + swapgs /* * Issue an LFENCE to prevent GS speculation, regardless of whether it is a @@ -1072,7 +1072,7 @@ SYM_CODE_START_LOCAL(error_entry) * We came from an IRET to user mode, so we have user * gsbase and CR3. Switch to kernel gsbase and CR3: */ - SWAPGS + swapgs FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4fdb007cddbd..c5aeb0819707 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -50,7 +50,7 @@ SYM_CODE_START(entry_SYSENTER_compat) UNWIND_HINT_EMPTY ENDBR /* Interrupts are off on entry. */ - SWAPGS + swapgs pushq %rax SWITCH_TO_KERNEL_CR3 scratch_reg=%rax diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 111104d1c2cd..7793e52d6237 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -137,14 +137,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags) if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } -#else -#ifdef CONFIG_X86_64 -#ifdef CONFIG_XEN_PV -#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV -#else -#define SWAPGS swapgs -#endif -#endif #endif /* !__ASSEMBLY__ */ #endif -- cgit From 3ba75c1316390b2bc39c19cb8f0f85922ab3f9ed Mon Sep 17 00:00:00 2001 From: Baskov Evgeniy Date: Thu, 3 Mar 2022 17:21:19 +0300 Subject: efi: libstub: declare DXE services table UEFI DXE services are not yet used in kernel code but are required to manipulate page table memory protection flags. Add required declarations to use DXE services functions. Signed-off-by: Baskov Evgeniy Link: https://lore.kernel.org/r/20220303142120.1975-2-baskov@ispras.ru [ardb: ignore absent DXE table but warn if the signature check fails] Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 98938a68251c..bed74a0f2932 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -357,6 +357,11 @@ static inline u32 efi64_convert_status(efi_status_t status) runtime), \ func, __VA_ARGS__)) +#define efi_dxe_call(func, ...) \ + (efi_is_native() \ + ? efi_dxe_table->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) + #else /* CONFIG_EFI_MIXED */ static inline bool efi_is_64bit(void) -- cgit From 24b72bb12e84c75e297a5a81f24b921d7a011575 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Tue, 29 Mar 2022 14:47:43 -0400 Subject: efi: x86: Set the NX-compatibility flag in the PE header Following Baskov Evgeniy's "Handle UEFI NX-restricted page tables" patches, it's safe to set this compatibility flag to let loaders know they don't need to make special accommodations for kernel to load if pre-boot NX is enabled. 
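Whether a given bzImage actually advertises the flag can be checked by reading the DllCharacteristics word of the PE optional header. A rough userspace sketch follows; the offsets are taken from the usual PE/COFF layout (e_lfanew at 0x3c, a 4-byte "PE\0\0" signature, a 20-byte COFF file header, DllCharacteristics at offset 70 of the optional header) and the 0x0100 bit is assumed to be IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, so both should be double-checked against include/linux/pe.h before relying on them.

#include <stdio.h>
#include <stdint.h>

int main(int argc, char **argv)
{
	uint32_t pe_off = 0;
	uint16_t dllchar = 0;
	FILE *f;

	if (argc < 2 || !(f = fopen(argv[1], "rb")))
		return 1;

	/* e_lfanew: file offset of the "PE\0\0" signature (little-endian host assumed). */
	fseek(f, 0x3c, SEEK_SET);
	if (fread(&pe_off, sizeof(pe_off), 1, f) != 1)
		return 1;

	/* signature (4) + COFF file header (20) + DllCharacteristics offset (70) */
	fseek(f, pe_off + 4 + 20 + 70, SEEK_SET);
	if (fread(&dllchar, sizeof(dllchar), 1, f) != 1)
		return 1;

	printf("DllCharacteristics: 0x%04x (NX_COMPAT %s)\n",
	       dllchar, (dllchar & 0x0100) ? "set" : "clear");
	fclose(f);
	return 0;
}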
Signed-off-by: Peter Jones Link: https://lore.kernel.org/all/20220329184743.798513-1-pjones@redhat.com/ Signed-off-by: Ard Biesheuvel --- arch/x86/boot/header.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6dbd7e9f74c9..0352e4589efa 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -163,7 +163,11 @@ extra_header_fields: .long 0x200 # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) +#ifdef CONFIG_DXE_MEM_ATTRIBUTES + .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics +#else .word 0 # DllCharacteristics +#endif #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve .long 0 # SizeOfStackCommit -- cgit From d6d0c7f681fda1d07e005c8f653e578b77a0eb40 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:53 +0530 Subject: x86/cpufeatures: Add PerfMonV2 feature bit CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 0 of EAX indicates support for Performance Monitoring Version 2 (PerfMonV2) features. If found to be set during PMU initialization, the EBX bits of the same CPUID function can be used to determine the number of available PMCs for different PMU types. Additionally, Core PMCs can be managed using new global control and status registers. For better utilization of feature words, PerfMonV2 is added as a scattered feature bit. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/c70e497e22f18e7f05b025bb64ca21cc12b17792.1650515382.git.sandipan.das@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0d62afd525e3..b50e0872ad1e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -/* FREE! ( 7*32+20) */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 4143b1e4c5c6..dbaa8326d6f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit From 089be16d5992dd0bc6df15ef12042fd1023ded9a Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:54 +0530 Subject: x86/msr: Add PerfCntrGlobal* registers Add MSR definitions that will be used to enable the new AMD Performance Monitoring Version 2 (PerfMonV2) features. 
These include: * Performance Counter Global Control (PerfCntrGlobalCtl) * Performance Counter Global Status (PerfCntrGlobalStatus) * Performance Counter Global Status Clear (PerfCntrGlobalStatusClr) The new Performance Counter Global Control and Status MSRs provide an interface for enabling or disabling multiple counters at the same time and for testing overflow without probing the individual registers for each PMC. The availability of these registers is indicated through the PerfMonV2 feature bit of CPUID leaf 0x80000022 EAX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/cdc0d8f75bd519848731b5c64d924f5a0619a573.1650515382.git.sandipan.das@amd.com --- arch/x86/include/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8179ea351bd8..58a44dceef9a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -524,6 +524,11 @@ #define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) #define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -- cgit From 21d59e3e2c403c83ba196a5857d517054124168e Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:55 +0530 Subject: perf/x86/amd/core: Detect PerfMonV2 support AMD Performance Monitoring Version 2 (PerfMonV2) introduces some new Core PMU features such as detection of the number of available PMCs and managing PMCs using global registers namely, PerfCntrGlobalCtl and PerfCntrGlobalStatus. Clearing PerfCntrGlobalCtl and PerfCntrGlobalStatus ensures that all PMCs are inactive and have no pending overflows when CPUs are onlined or offlined. The PMU version (x86_pmu.version) now indicates PerfMonV2 support and will be used to bypass the new features on unsupported processors. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dc8672ecbddff394e088ca8abf94b089b8ecc2e7.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 8e1e818f8195..b70dfa028ba5 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -19,6 +19,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -578,6 +581,18 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* Clear overflow bits i.e. 
PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -625,6 +640,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->refcnt++; amd_brs_reset(); + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -644,6 +660,8 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); } /* @@ -1185,6 +1203,15 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. -- cgit From 56e026a7ca3f92b8e44359e1f705febd1833f701 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:56 +0530 Subject: perf/x86/amd/core: Detect available counters If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use CPUID leaf 0x80000022 EBX to detect the number of Core PMCs. This offers more flexibility if the counts change in later processor families. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/68a6d9688df189267db26530378870edd34f7b06.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 6 ++++++ arch/x86/include/asm/perf_event.h | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b70dfa028ba5..52fd7941a724 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1186,6 +1186,7 @@ static const struct attribute_group *amd_attr_update[] = { static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -1206,9 +1207,14 @@ static int __init amd_core_pmu_init(void) /* Check for Performance Monitoring v2 support */ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + /* Update PMU version for later usage */ x86_pmu.version = 2; + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8199fc5a37ea..c6cc07f46556 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -186,6 +186,18 @@ union cpuid28_ecx { unsigned int full; }; +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -367,6 +379,11 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +/* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 + /* * IBS cpuid feature detection */ -- cgit From 9622e67e3980c01872490de0925e5c6c23247c94 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:57 +0530 Subject: 
perf/x86/amd/core: Add PerfMonV2 counter control If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to manage the Core PMCs using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). Currently, all PMCs have dedicated control (PERF_CTL) and counter (PERF_CTR) registers. For a given PMC, the enable (En) bit of its PERF_CTL register is used to start or stop counting. The Performance Counter Global Control (PerfCntrGlobalCtl) register has enable (PerfCntrEn) bits for each PMC. For a PMC to start counting, both PERF_CTL and PerfCntrGlobalCtl enable bits must be set. If either of those are cleared, the PMC stops counting. In x86_pmu_{en,dis}able_all(), the PERF_CTL registers of all active PMCs are written to in a loop. Ideally, PMCs counting the same event that were started and stopped at the same time should record the same counts. Due to delays in between writes to the PERF_CTL registers across loop iterations, the PMCs cannot be enabled or disabled at the same instant and hence, record slightly different counts. This is fixed by enabling or disabling all active PMCs at the same time with a single write to the PerfCntrGlobalCtl register. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dfe8e934074aaabc6ba748dfaccd0a77c974bb82.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 50 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 52fd7941a724..a339c3e0be33 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -664,6 +664,11 @@ static void amd_pmu_cpu_dead(int cpu) amd_pmu_cpu_reset(cpu); } +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -693,15 +698,11 @@ static void amd_pmu_wait_on_overflow(int idx) } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - amd_brs_disable_all(); - - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -748,6 +749,26 @@ static void amd_pmu_enable_all(int added) } } +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. 
+ */ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -765,6 +786,20 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static void amd_pmu_v2_disable_all(void) +{ + /* Disable all PMCs */ + amd_pmu_set_global_ctl(0); + amd_pmu_check_overflow(); +} + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -1216,6 +1251,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.num_counters = ebx.split.num_core_pmc; amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; } /* -- cgit From 7685665c390dc68c2d9a74e8445f41494cc8f6cf Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:58 +0530 Subject: perf/x86/amd/core: Add PerfMonV2 overflow handling If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to process Core PMC overflows in the NMI handler using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). In x86_pmu_handle_irq(), overflows are detected by testing the contents of the PERF_CTR register for each active PMC in a loop. The new scheme instead inspects the overflow bits of the global status register. The Performance Counter Global Status (PerfCntrGlobalStatus) register has overflow (PerfCntrOvfl) bits for each PMC. This is, however, a read-only MSR. To acknowledge that overflows have been processed, the NMI handler must clear the bits by writing to the PerfCntrGlobalStatusClr register. In x86_pmu_handle_irq(), PMCs counting the same event that are started and stopped at the same time record slightly different counts due to delays in between reads from the PERF_CTR registers. This is fixed by stopping and starting the PMCs at the same time with a single write to the Performance Counter Global Control (PerfCntrGlobalCtl) register upon entering and before exiting the NMI handler.
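In outline, the new flow is the one sketched below (simplified; it reuses the PerfCntrGlobal* MSR definitions added earlier in this series and the kernel's rdmsrl()/wrmsrl() accessors, but it is not the literal handler shown in the diff):

/* Simplified sketch of the PerfMonV2 NMI-time sequence */
static void perfmon_v2_nmi_sketch(u64 global_cntr_mask)
{
	u64 status;

	/* a single write stops all active PMCs at the same instant */
	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);

	/* the read-only status register reports which PMCs overflowed */
	rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status);
	status &= global_cntr_mask;

	/* ... process a perf event for each overflow bit set in status ... */

	/* acknowledge the handled overflows via the write-to-clear register */
	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status);

	/* a single write restarts all PMCs on the way out of the handler */
	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, global_cntr_mask);
}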
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/f20b7e4da0b0a83bdbe05857f354146623bc63ab.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 144 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 133 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index a339c3e0be33..262e39a85031 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "../perf_event.h" @@ -669,6 +670,45 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); } +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status & amd_pmu_global_cntr_mask; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ + status &= amd_pmu_global_cntr_mask; + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); +} + +DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit); + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -681,7 +721,6 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop @@ -689,8 +728,7 @@ static void amd_pmu_wait_on_overflow(int idx) * forever... */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!static_call(amd_pmu_test_overflow)(idx)) break; /* Might be in IRQ context, so can't sleep */ @@ -830,6 +868,24 @@ static void amd_pmu_del_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ +static inline int amd_pmu_adjust_nmi_window(int handled) +{ + /* + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. 
+ */ + if (handled) { + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + + return handled; + } + + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) + return NMI_DONE; + + return NMI_HANDLED; +} + static int amd_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -857,20 +913,84 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (pmu_enabled) amd_pmu_enable_all(0); + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + /* - * If a counter was handled, record a timestamp such that un-handled - * NMIs will be claimed if arriving within that window. + * Save the PMU state as it needs to be restored when leaving the + * handler */ - if (handled) { - this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; - return handled; + /* Stop counting */ + amd_pmu_v2_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + + status &= ~mask; } - if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) - return NMI_DONE; + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); - return NMI_HANDLED; + /* Clear overflow bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_v2_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); } static struct event_constraint * @@ -1256,6 +1376,8 @@ static int __init amd_core_pmu_init(void) x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status); } /* -- cgit From 0180a1e823d7c41d9a1c19f38e6069b38fe60c87 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 27 Apr 2022 16:10:59 -0700 Subject: x86/split_lock: Enable the split lock feature on Raptor Lake Raptor Lake supports the split lock detection feature. Add it to the split_lock_cpu_ids[] array. 
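For context, a split lock is a locked (atomic) operation whose operand straddles a cache-line boundary. A deliberately misaligned atomic such as the userspace sketch below (illustrative only) is the kind of access that split lock detection flags:

#include <stdint.h>
#include <stdlib.h>

int main(void)
{
	/* place a 4-byte value so it straddles a 64-byte cache-line boundary */
	char *buf = aligned_alloc(64, 128);
	uint32_t *p;

	if (!buf)
		return 1;

	p = (uint32_t *)(buf + 62);
	__atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);	/* locked RMW across the boundary */

	free(buf);
	return 0;
}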
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220427231059.293086-1-tony.luck@intel.com --- arch/x86/kernel/cpu/intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 672e253a58b9..e6c37f38c5ea 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1257,6 +1257,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1), {} }; -- cgit From 12441ccdf5e2f5a01a46e344976cbbd3d46845c9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 13 Mar 2022 18:27:25 -0700 Subject: x86: Fix return value of __setup handlers __setup() handlers should return 1 to obsolete_checksetup() in init/main.c to indicate that the boot option has been handled. A return of 0 causes the boot option/value to be listed as an Unknown kernel parameter and added to init's (limited) argument (no '=') or environment (with '=') strings. So return 1 from these x86 __setup handlers. Examples: Unknown kernel command line parameters "apicpmtimer BOOT_IMAGE=/boot/bzImage-517rc8 vdso=1 ring3mwait=disable", will be passed to user space. Run /sbin/init as init process with arguments: /sbin/init apicpmtimer with environment: HOME=/ TERM=linux BOOT_IMAGE=/boot/bzImage-517rc8 vdso=1 ring3mwait=disable Fixes: 2aae950b21e4 ("x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu") Fixes: 77b52b4c5c66 ("x86: add "debugpat" boot option") Fixes: e16fd002afe2 ("x86/cpufeature: Enable RING3MWAIT for Knights Landing") Fixes: b8ce33590687 ("x86_64: convert to clock events") Reported-by: Igor Zhbanov Signed-off-by: Randy Dunlap Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Link: https://lore.kernel.org/r/20220314012725.26661-1-rdunlap@infradead.org --- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/mm/pat/memtype.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 235a5794296a..1000d457c332 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -438,7 +438,7 @@ bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) static __init int vdso_setup(char *s) { vdso64_enabled = simple_strtoul(s, NULL, 0); - return 0; + return 1; } __setup("vdso=", vdso_setup); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..ed7d9cf71f68 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -170,7 +170,7 @@ static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return 0; + return 1; } __setup("apicpmtimer", setup_apicpmtimer); #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8321c43554a1..350c247def37 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -91,7 +91,7 @@ static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) { ring3mwait_disabled = true; - return 0; + return 1; } __setup("ring3mwait=disable", ring3mwait_disable); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 4ba2a3ee4bce..d5ef64ddd35e 
100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -101,7 +101,7 @@ int pat_debug_enable; static int __init pat_debug_setup(char *str) { pat_debug_enable = 1; - return 0; + return 1; } __setup("debugpat", pat_debug_setup); -- cgit From 1ef64b1e89e6d4018da46e08ffc32779a31160c7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 14 Mar 2022 17:10:45 -0700 Subject: x86/mm: Cleanup the control_va_addr_alignment() __setup handler Clean up control_va_addr_alignment(): a. Make '=' required instead of optional (as documented). b. Print a warning if an invalid option value is used. c. Return 1 from the __setup handler when an invalid option value is used. This prevents the kernel from polluting init's (limited) environment space with the entire string. Fixes: dfb09f9b7ab0 ("x86, amd: Avoid cache aliasing penalties on AMD family 15h") Reported-by: Igor Zhbanov Signed-off-by: Randy Dunlap Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Link: https://lore.kernel.org/r/20220315001045.7680-1-rdunlap@infradead.org --- arch/x86/kernel/sys_x86_64.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 660b78827638..8cc653ffdccd 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -68,9 +68,6 @@ static int __init control_va_addr_alignment(char *str) if (*str == 0) return 1; - if (*str == '=') - str++; - if (!strcmp(str, "32")) va_align.flags = ALIGN_VA_32; else if (!strcmp(str, "64")) @@ -80,11 +77,11 @@ static int __init control_va_addr_alignment(char *str) else if (!strcmp(str, "on")) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; else - return 0; + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); return 1; } -__setup("align_va_addr", control_va_addr_alignment); +__setup("align_va_addr=", control_va_addr_alignment); SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, -- cgit From a1e2c031ec3949b8c039b739c0b5bf9c30007b00 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 6 May 2022 14:14:32 +0200 Subject: x86/mm: Simplify RESERVE_BRK() RESERVE_BRK() reserves data in the .brk_reservation section. The data is initialized to zero, like BSS, so the macro specifies 'nobits' to prevent the data from taking up space in the vmlinux binary. The only way to get the compiler to do that (without putting the variable in .bss proper) is to use inline asm. The macro also has a hack which encloses the inline asm in a discarded function, which allows the size to be passed (global inline asm doesn't allow inputs). Remove the need for the discarded function hack by just stringifying the size rather than supplying it as an input to the inline asm. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lore.kernel.org/r/20220506121631.133110232@infradead.org --- arch/x86/include/asm/setup.h | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 896e48d45828..bec5ff4d6d98 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -109,27 +109,19 @@ extern unsigned long _brk_end; void *extend_brk(size_t size, size_t align); /* - * Reserve space in the brk section. 
The name must be unique within - * the file, and somewhat descriptive. The size is in bytes. Must be - * used at file scope. + * Reserve space in the brk section. The name must be unique within the file, + * and somewhat descriptive. The size is in bytes. * - * (This uses a temp function to wrap the asm so we can pass it the - * size parameter; otherwise we wouldn't be able to. We can't use a - * "section" attribute on a normal variable because it always ends up - * being @progbits, which ends up allocating space in the vmlinux - * executable.) + * The allocation is done using inline asm (rather than using a section + * attribute on a normal variable) in order to allow the use of @nobits, so + * that it doesn't take up any space in the vmlinux file. */ -#define RESERVE_BRK(name,sz) \ - static void __section(".discard.text") __noendbr __used notrace \ - __brk_reservation_fn_##name##__(void) { \ - asm volatile ( \ - ".pushsection .brk_reservation,\"aw\",@nobits;" \ - ".brk." #name ":" \ - " 1:.skip %c0;" \ - " .size .brk." #name ", . - 1b;" \ - " .popsection" \ - : : "i" (sz)); \ - } +#define RESERVE_BRK(name, size) \ + asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \ + ".brk." #name ":\n\t" \ + ".skip " __stringify(size) "\n\t" \ + ".size .brk." #name ", " __stringify(size) "\n\t" \ + ".popsection\n\t") extern void probe_roms(void); #ifdef __i386__ -- cgit From d205222eb6a8e5e70c21200beb81c6e19ec211d6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 6 May 2022 14:14:33 +0200 Subject: x86/entry: Simplify entry_INT80_compat() Instead of playing silly games with rdi, use rax for simpler and more consistent code. Signed-off-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lore.kernel.org/r/20220506121631.221072885@infradead.org --- arch/x86/entry/entry_64_compat.S | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index c5aeb0819707..d743eaa19d9b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -362,26 +362,25 @@ SYM_CODE_START(entry_INT80_compat) /* switch to thread stack expects orig_ax and rdi to be pushed */ pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* In the Xen PV case we already run on the thread stack. 
*/ ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV - movq %rsp, %rdi + movq %rsp, %rax movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - pushq 6*8(%rdi) /* regs->ss */ - pushq 5*8(%rdi) /* regs->rsp */ - pushq 4*8(%rdi) /* regs->eflags */ - pushq 3*8(%rdi) /* regs->cs */ - pushq 2*8(%rdi) /* regs->ip */ - pushq 1*8(%rdi) /* regs->orig_ax */ - pushq (%rdi) /* pt_regs->di */ + pushq 5*8(%rax) /* regs->ss */ + pushq 4*8(%rax) /* regs->rsp */ + pushq 3*8(%rax) /* regs->eflags */ + pushq 2*8(%rax) /* regs->cs */ + pushq 1*8(%rax) /* regs->ip */ + pushq 0*8(%rax) /* regs->orig_ax */ .Lint80_keep_stack: + pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ xorl %esi, %esi /* nospec si */ pushq %rdx /* pt_regs->dx */ -- cgit From 8c42819b61b8340cff0643e65b5ce6a4144ab155 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 May 2022 14:14:34 +0200 Subject: x86/entry: Use PUSH_AND_CLEAR_REGS for compat Since the upper regs don't exist for ia32 code, preserving them doesn't hurt and it simplifies the code. This doesn't add any attack surface that would not already be available through INT80. Notably: - 32bit SYSENTER: didn't clear si, dx, cx. - 32bit SYSCALL, INT80: *do* clear si since the C functions don't take a second argument. - 64bit: didn't clear si since the C functions take a second argument; except the error_entry path might have only one argument, so clearing si was missing here. 32b SYSENTER should be clearing all those 3 registers, nothing uses them and selftests pass. Unconditionally clear rsi since it simplifies code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lore.kernel.org/r/20220506121631.293889636@infradead.org --- arch/x86/entry/calling.h | 1 + arch/x86/entry/entry_64_compat.S | 87 ++-------------------------------------- 2 files changed, 4 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index a4c061fb7c6e..debbe94aa3db 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -99,6 +99,7 @@ For 32-bit we have the following conventions - kernel is built with * well before they could be put to use in a speculative execution * gadget. 
*/ + xorl %esi, %esi /* nospec si */ xorl %edx, %edx /* nospec dx */ xorl %ecx, %ecx /* nospec cx */ xorl %r8d, %r8d /* nospec r8 */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index d743eaa19d9b..ed2be3615b50 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -83,32 +83,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) movl %eax, %eax pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ - xorl %r8d, %r8d /* nospec r8 */ - pushq $0 /* pt_regs->r9 = 0 */ - xorl %r9d, %r9d /* nospec r9 */ - pushq $0 /* pt_regs->r10 = 0 */ - xorl %r10d, %r10d /* nospec r10 */ - pushq $0 /* pt_regs->r11 = 0 */ - xorl %r11d, %r11d /* nospec r11 */ - pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx */ - pushq %rbp /* pt_regs->rbp (will be overwritten) */ - xorl %ebp, %ebp /* nospec rbp */ - pushq $0 /* pt_regs->r12 = 0 */ - xorl %r12d, %r12d /* nospec r12 */ - pushq $0 /* pt_regs->r13 = 0 */ - xorl %r13d, %r13d /* nospec r13 */ - pushq $0 /* pt_regs->r14 = 0 */ - xorl %r14d, %r14d /* nospec r14 */ - pushq $0 /* pt_regs->r15 = 0 */ - xorl %r15d, %r15d /* nospec r15 */ - + PUSH_AND_CLEAR_REGS rax=$-ENOSYS UNWIND_HINT_REGS cld @@ -225,35 +200,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - xorl %esi, %esi /* nospec si */ - pushq %rdx /* pt_regs->dx */ - xorl %edx, %edx /* nospec dx */ - pushq %rbp /* pt_regs->cx (stashed in bp) */ - xorl %ecx, %ecx /* nospec cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ - xorl %r8d, %r8d /* nospec r8 */ - pushq $0 /* pt_regs->r9 = 0 */ - xorl %r9d, %r9d /* nospec r9 */ - pushq $0 /* pt_regs->r10 = 0 */ - xorl %r10d, %r10d /* nospec r10 */ - pushq $0 /* pt_regs->r11 = 0 */ - xorl %r11d, %r11d /* nospec r11 */ - pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx */ - pushq %rbp /* pt_regs->rbp (will be overwritten) */ - xorl %ebp, %ebp /* nospec rbp */ - pushq $0 /* pt_regs->r12 = 0 */ - xorl %r12d, %r12d /* nospec r12 */ - pushq $0 /* pt_regs->r13 = 0 */ - xorl %r13d, %r13d /* nospec r13 */ - pushq $0 /* pt_regs->r14 = 0 */ - xorl %r14d, %r14d /* nospec r14 */ - pushq $0 /* pt_regs->r15 = 0 */ - xorl %r15d, %r15d /* nospec r15 */ - + PUSH_AND_CLEAR_REGS rax=$-ENOSYS UNWIND_HINT_REGS movq %rsp, %rdi @@ -380,35 +327,7 @@ SYM_CODE_START(entry_INT80_compat) pushq 0*8(%rax) /* regs->orig_ax */ .Lint80_keep_stack: - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - xorl %esi, %esi /* nospec si */ - pushq %rdx /* pt_regs->dx */ - xorl %edx, %edx /* nospec dx */ - pushq %rcx /* pt_regs->cx */ - xorl %ecx, %ecx /* nospec cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - xorl %r8d, %r8d /* nospec r8 */ - pushq %r9 /* pt_regs->r9 */ - xorl %r9d, %r9d /* nospec r9 */ - pushq %r10 /* pt_regs->r10*/ - xorl %r10d, %r10d /* nospec r10 */ - pushq %r11 /* pt_regs->r11 */ - xorl %r11d, %r11d /* nospec r11 */ - pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx */ - pushq %rbp /* pt_regs->rbp */ - xorl %ebp, %ebp /* nospec rbp */ - pushq %r12 /* pt_regs->r12 */ - xorl %r12d, %r12d /* nospec r12 */ - pushq %r13 /* 
pt_regs->r13 */ - xorl %r13d, %r13d /* nospec r13 */ - pushq %r14 /* pt_regs->r14 */ - xorl %r14d, %r14d /* nospec r14 */ - pushq %r15 /* pt_regs->r15 */ - xorl %r15d, %r15d /* nospec r15 */ - + PUSH_AND_CLEAR_REGS rax=$-ENOSYS UNWIND_HINT_REGS cld -- cgit From 1b331eeea7b8676fc5dbdf80d0a07e41be226177 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 May 2022 14:14:35 +0200 Subject: x86/entry: Remove skip_r11rcx Yes, r11 and rcx have been restored previously, but since they're being popped anyway (into rsi) might as well pop them into their own regs -- setting them to the value they already are. Less magical code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220506121631.365070674@infradead.org --- arch/x86/entry/calling.h | 10 +--------- arch/x86/entry/entry_64.S | 3 +-- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index debbe94aa3db..a97cc78ecb92 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -120,27 +120,19 @@ For 32-bit we have the following conventions - kernel is built with CLEAR_REGS .endm -.macro POP_REGS pop_rdi=1 skip_r11rcx=0 +.macro POP_REGS pop_rdi=1 popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx - .if \skip_r11rcx - popq %rsi - .else popq %r11 - .endif popq %r10 popq %r9 popq %r8 popq %rax - .if \skip_r11rcx - popq %rsi - .else popq %rcx - .endif popq %rdx popq %rsi .if \pop_rdi diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 312186612f4e..3a1e3f215617 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,8 +191,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - POP_REGS pop_rdi=0 skip_r11rcx=1 + POP_REGS pop_rdi=0 /* * Now all regs are restored except RSP and RDI. -- cgit From c5febea0956fd3874e8fb59c6f84d68f128d68f8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Apr 2022 18:07:50 -0500 Subject: fork: Pass struct kernel_clone_args into copy_thread With io_uring we have started supporting tasks that are for most purposes user space tasks that exclusively run code in kernel mode. The kernel task that exec's init and tasks that exec user mode helpers are also user mode tasks that just run kernel code until they call kernel execve. Pass kernel_clone_args into copy_thread so these oddball tasks can be supported more cleanly and easily. v2: Fix spelling of kenrel_clone_args on h8300 Link: https://lkml.kernel.org/r/20220506141512.516114-2-ebiederm@xmission.com Signed-off-by: "Eric W. 
Biederman" --- arch/x86/kernel/process.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..0fce52b10dc4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -130,9 +130,12 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } -int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long arg = args->stack_size; + unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; -- cgit From 5bd2e97c868a8a44470950ed01846cab6328e540 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 12 Apr 2022 10:18:48 -0500 Subject: fork: Generalize PF_IO_WORKER handling Add fn and fn_arg members into struct kernel_clone_args and test for them in copy_thread (instead of testing for PF_KTHREAD | PF_IO_WORKER). This allows any task that wants to be a user space task that only runs in kernel mode to use this functionality. The code on x86 is an exception and still retains a PF_KTHREAD test because x86 unlikely everything else handles kthreads slightly differently than user space tasks that start with a function. The functions that created tasks that start with a function have been updated to set ".fn" and ".fn_arg" instead of ".stack" and ".stack_size". These functions are fork_idle(), create_io_thread(), kernel_thread(), and user_mode_thread(). Link: https://lkml.kernel.org/r/20220506141512.516114-4-ebiederm@xmission.com Signed-off-by: "Eric W. 
Biederman" --- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/include/asm/switch_to.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/process.c | 13 ++++++------- 4 files changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 99a8820e8cc4..b2486b2cbc6e 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal); extern void fpu_flush_thread(void); /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index b5f0d2ff47e4..c08eb0fdd11f 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -78,13 +78,13 @@ static inline void update_task_stack(struct task_struct *task) } static inline void kthread_frame_init(struct inactive_task_frame *frame, - unsigned long fun, unsigned long arg) + int (*fun)(void *), void *arg) { - frame->bx = fun; + frame->bx = (unsigned long)fun; #ifdef CONFIG_X86_32 - frame->di = arg; + frame->di = (unsigned long)arg; #else - frame->r12 = arg; + frame->r12 = (unsigned long)arg; #endif } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..fbade5a3975b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -556,7 +556,7 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) } /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -579,7 +579,7 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) * No FPU state inheritance for kernel threads and IO * worker threads. */ - if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0fce52b10dc4..d20eaad52a85 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -134,7 +134,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; @@ -172,13 +171,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags); + fpu_clone(p, clone_flags, args->fn); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@ -198,10 +197,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) task_user_gs(p) = get_user_gs(current_pt_regs()); #endif - if (unlikely(p->flags & PF_IO_WORKER)) { + if (unlikely(args->fn)) { /* - * An IO thread is a user space thread, but it doesn't - * return to ret_after_fork(). 
+ * A user space thread, but it doesn't return to + * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. @@ -211,7 +210,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ childregs->sp = 0; childregs->ip = 0; - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } -- cgit From 595b893e2087de306d0781795fb8ec47873596a6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 May 2022 13:55:00 -0700 Subject: randstruct: Reorganize Kconfigs and attribute macros In preparation for Clang supporting randstruct, reorganize the Kconfigs, move the attribute macros, and generalize the feature to be named CONFIG_RANDSTRUCT for on/off, CONFIG_RANDSTRUCT_FULL for the full randomization mode, and CONFIG_RANDSTRUCT_PERFORMANCE for the cache-line sized mode. Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220503205503.3054173-4-keescook@chromium.org --- arch/x86/mm/pti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 5d5c7bb50ce9..ffe3b3a087fe 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -540,7 +540,7 @@ static inline bool pti_kernel_image_global_ok(void) * cases where RANDSTRUCT is in use to help keep the layout a * secret. */ - if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT)) + if (IS_ENABLED(CONFIG_RANDSTRUCT)) return false; return true; -- cgit From 613f4b3ed7902d1dbbc6ade6401e452a63dfbc21 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 May 2022 13:55:01 -0700 Subject: randstruct: Split randstruct Makefile and CFLAGS To enable the new Clang randstruct implementation[1], move randstruct into its own Makefile and split the CFLAGS from GCC_PLUGINS_CFLAGS into RANDSTRUCT_CFLAGS. [1] https://reviews.llvm.org/D121556 Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220503205503.3054173-5-keescook@chromium.org --- arch/x86/entry/vdso/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 693f8b9031fb..c2a8b76ae0bc 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. 
@@ -148,6 +148,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) -- cgit From 6d97af487dee3176cb1342d4ab16637e495440ad Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 4 May 2022 08:23:50 +0200 Subject: entry: Rename arch_check_user_regs() to arch_enter_from_user_mode() arch_check_user_regs() is used at the moment to verify that struct pt_regs contains valid values when entering the kernel from userspace. s390 needs a place in the generic entry code to modify a cpu data structure when switching from userspace to kernel mode. As arch_check_user_regs() is exactly this, rename it to arch_enter_from_user_mode(). When entering the kernel from userspace, arch_check_user_regs() is used to verify that struct pt_regs contains valid values. Note that the NMI codepath doesn't call this function. s390 needs a place in the generic entry code to modify a cpu data structure when switching from userspace to kernel mode. As arch_check_user_regs() is exactly this, rename it to arch_enter_from_user_mode(). Signed-off-by: Sven Schnelle Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Link: https://lore.kernel.org/r/20220504062351.2954280-2-tmricht@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/x86/include/asm/entry-common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 43184640b579..674ed46d3ced 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -10,7 +10,7 @@ #include /* Check that the stack and regs on entry from user mode are sane. */ -static __always_inline void arch_check_user_regs(struct pt_regs *regs) +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { /* @@ -42,7 +42,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) WARN_ON_ONCE(regs != task_pt_regs(current)); } } -#define arch_check_user_regs arch_check_user_regs +#define arch_enter_from_user_mode arch_enter_from_user_mode static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) -- cgit From 3e20889cfbee1e9716544a14c5d23be598412ddf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: x86/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's use bit 3 to remember PG_anon_exclusive in swap ptes. [david@redhat.com: fix 32-bit swap layout] Link: https://lkml.kernel.org/r/d875c292-46b3-f281-65ae-71d0b0c6f592@redhat.com Link: https://lkml.kernel.org/r/20220329164329.208407-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 17 +++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 4 +++- arch/x86/include/asm/pgtable_64_types.h | 5 +++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3563f4645fa1..1e09519af143 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1286,6 +1286,23 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } +#ifdef _PAGE_SWP_EXCLUSIVE +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); +} +#endif /* _PAGE_SWP_EXCLUSIVE */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 56d0399a0cd1..e479491da8d5 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -186,7 +186,7 @@ static inline void native_pgd_clear(pgd_t *pgd) * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -203,6 +203,8 @@ static inline void native_pgd_clear(pgd_t *pgd) * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * + * E (3) in swp entry is used to rememeber PG_anon_exclusive. + * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 91ac10654570..70e360a2e5fb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -163,4 +163,9 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +/* + * We borrow bit 3 to remember PG_anon_exclusive. + */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PWT + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ -- cgit From 3d47083b9ff46863e8374ad3bb5edb5e464c75f8 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 29 Apr 2022 10:44:41 +0530 Subject: perf/amd/ibs: Use interrupt regs ip for stack unwinding IbsOpRip is recorded when IBS interrupt is triggered. But there is a skid from the time IBS interrupt gets triggered to the time the interrupt is presented to the core. Meanwhile processor would have moved ahead and thus IbsOpRip will be inconsistent with rsp and rbp recorded as part of the interrupt regs. This causes issues while unwinding stack using the ORC unwinder as it needs consistent rip, rsp and rbp. 
Fix this by using rip from interrupt regs instead of IbsOpRip for stack unwinding. Fixes: ee9f8fce99640 ("x86/unwind: Add the ORC unwinder") Reported-by: Dmitry Monakhov Suggested-by: Peter Zijlstra Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220429051441.14251-1-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 9739019d4b67..11e8b493e015 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -304,6 +304,16 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. Setting _EARLY flag + * makes sure we unwind call-stack before perf sample rip is set to + * IbsOpRip. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; + return 0; } @@ -687,6 +697,14 @@ fail: data.raw = &raw; } + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + data.callchain = perf_callchain(event, iregs); + throttle = perf_event_overflow(event, &data, &regs); out: if (throttle) { -- cgit From f7e0beaf39d3868dc700d4954b26cf8443c5d423 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:19 -0700 Subject: bpf, x86: Generate trampolines from bpf_tramp_links Replace struct bpf_tramp_progs with struct bpf_tramp_links to collect struct bpf_tramp_link(s) for a trampoline. struct bpf_tramp_link extends bpf_link to act as a linked list node. arch_prepare_bpf_trampoline() accepts a struct bpf_tramp_links to collect all bpf_tramp_link(s) that a trampoline should call. Change BPF trampoline and bpf_struct_ops to pass bpf_tramp_links instead of bpf_tramp_progs.
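Conceptually the new types look roughly like this (sketch only; the member names follow the usage visible in the diff below, while the array bound is a placeholder rather than the real kernel constant):

struct bpf_tramp_link {
	struct bpf_link link;	/* embeds the generic bpf_link; the prog is reached via link.prog */
	/* plus list linkage so a trampoline can collect all attached links */
};

struct bpf_tramp_links {
	struct bpf_tramp_link *links[MAX_TRAMP_LINKS];	/* placeholder bound */
	int nr_links;
};

The trampoline generator then walks links[0..nr_links) instead of the old progs array, as the invoke_bpf() changes below show.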
Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-2-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 16b6efacf7c6..38eb43159230 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1762,10 +1762,12 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool save_ret) + struct bpf_tramp_link *l, int stack_size, + bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; + struct bpf_prog *p = l->link.prog; /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1850,14 +1852,14 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, + struct bpf_tramp_links *tl, int stack_size, bool save_ret) { int i; u8 *prog = *pprog; - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, save_ret)) return -EINVAL; } @@ -1866,7 +1868,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, } static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, + struct bpf_tramp_links *tl, int stack_size, u8 **branches) { u8 *prog = *pprog; @@ -1877,8 +1879,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, */ emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, true)) return -EINVAL; /* mod_ret prog stored return value into [rbp - 8]. 
Emit: @@ -1980,14 +1982,14 @@ static bool is_valid_bpf_tramp_flags(unsigned int flags) */ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { int ret, i, nr_args = m->nr_args; int regs_off, ip_off, args_off, stack_size = nr_args * 8; - struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; - struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; - struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; bool save_ret; @@ -2078,13 +2080,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } } - if (fentry->nr_progs) + if (fentry->nr_links) if (invoke_bpf(m, &prog, fentry, regs_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; - if (fmod_ret->nr_progs) { - branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + if (fmod_ret->nr_links) { + branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -2111,7 +2113,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i prog += X86_PATCH_SIZE; } - if (fmod_ret->nr_progs) { + if (fmod_ret->nr_links) { /* From Intel 64 and IA-32 Architectures Optimization * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler * Coding Rule 11: All branch targets should be 16-byte @@ -2121,12 +2123,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit. */ - for (i = 0; i < fmod_ret->nr_progs; i++) + for (i = 0; i < fmod_ret->nr_links; i++) emit_cond_near_jump(&branches[i], prog, branches[i], X86_JNE); } - if (fexit->nr_progs) + if (fexit->nr_links) if (invoke_bpf(m, &prog, fexit, regs_off, false)) { ret = -EINVAL; goto cleanup; -- cgit From e384c7b7b46d0a5f4bf3c554f963e6e9622d0ab1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:20 -0700 Subject: bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on stacks and set/reset the current bpf_run_ctx before/after calling a bpf_prog. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-3-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 38eb43159230..1fbc5cf1c7a7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1763,14 +1763,30 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, - bool save_ret) + int run_ctx_off, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; + int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); struct bpf_prog *p = l->link.prog; + /* mov rdi, 0 */ + emit_mov_imm64(&prog, BPF_REG_1, 0, 0); + + /* Prepare struct bpf_tramp_run_ctx. 
+ * + * bpf_tramp_run_ctx is already preserved by + * arch_prepare_bpf_trampoline(). + * + * mov QWORD PTR [rbp - run_ctx_off + ctx_cookie_off], rdi + */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: lea rsi, [rbp - ctx_cookie_off] */ + EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); + if (emit_call(&prog, p->aux->sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter, prog)) @@ -1816,6 +1832,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: mov rsi, rbx <- start time in nsec */ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + /* arg3: lea rdx, [rbp - run_ctx_off] */ + EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); if (emit_call(&prog, p->aux->sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit, prog)) @@ -1853,14 +1871,14 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, - bool save_ret) + int run_ctx_off, bool save_ret) { int i; u8 *prog = *pprog; for (i = 0; i < tl->nr_links; i++) { if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, - save_ret)) + run_ctx_off, save_ret)) return -EINVAL; } *pprog = prog; @@ -1869,7 +1887,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, - u8 **branches) + int run_ctx_off, u8 **branches) { u8 *prog = *pprog; int i; @@ -1880,7 +1898,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); for (i = 0; i < tl->nr_links; i++) { - if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, true)) + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true)) return -EINVAL; /* mod_ret prog stored return value into [rbp - 8]. Emit: @@ -1986,7 +2004,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *orig_call) { int ret, i, nr_args = m->nr_args; - int regs_off, ip_off, args_off, stack_size = nr_args * 8; + int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; @@ -2016,6 +2034,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * RBP - args_off [ args count ] always * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + * + * RBP - run_ctx_off [ bpf_tramp_run_ctx ] */ /* room for return value of orig_call or fentry prog */ @@ -2034,6 +2054,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ip_off = stack_size; + stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7; + run_ctx_off = stack_size; + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip patched call instruction and point orig_call to actual * body of the kernel function. 
@@ -2081,7 +2104,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_links) - if (invoke_bpf(m, &prog, fentry, regs_off, + if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; @@ -2092,7 +2115,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i return -ENOMEM; if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off, - branches)) { + run_ctx_off, branches)) { ret = -EINVAL; goto cleanup; } @@ -2129,7 +2152,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_links) - if (invoke_bpf(m, &prog, fexit, regs_off, false)) { + if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) { ret = -EINVAL; goto cleanup; } -- cgit From 2fcc82411e74e5e6aba336561cf56fb899bfae4e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:21 -0700 Subject: bpf, x86: Attach a cookie to fentry/fexit/fmod_ret/lsm. Pass a cookie along with BPF_LINK_CREATE requests. Add a bpf_cookie field to struct bpf_tracing_link to attach a cookie. The cookie of a bpf_tracing_link is available by calling bpf_get_attach_cookie when running the BPF program of the attached link. The value of a cookie will be set at bpf_tramp_run_ctx by the trampoline of the link. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-4-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1fbc5cf1c7a7..a2b6d197c226 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1769,9 +1769,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); struct bpf_prog *p = l->link.prog; + u64 cookie = l->cookie; - /* mov rdi, 0 */ - emit_mov_imm64(&prog, BPF_REG_1, 0, 0); + /* mov rdi, cookie */ + emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie); /* Prepare struct bpf_tramp_run_ctx. * -- cgit From f774f5bb87d132b48bc4a99598c45f35121ac054 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 May 2022 11:47:16 +0900 Subject: kbuild: factor out the common installation code into scripts/install.sh Many architectures have similar install.sh scripts. The first half is really generic; it verifies that the kernel image and System.map exist, then executes ~/bin/${INSTALLKERNEL} or /sbin/${INSTALLKERNEL} if available. The second half is kind of arch-specific; it copies the kernel image and System.map to the destination, but the code is slightly different. Factor out the generic part into scripts/install.sh. 
Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- arch/x86/Makefile | 3 +-- arch/x86/boot/install.sh | 22 ---------------------- 2 files changed, 1 insertion(+), 24 deletions(-) mode change 100644 => 100755 arch/x86/boot/install.sh (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 63d50f65b828..5e1f21aae12b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -271,8 +271,7 @@ $(BOOT_TARGETS): vmlinux PHONY += install install: - $(CONFIG_SHELL) $(srctree)/$(boot)/install.sh $(KERNELRELEASE) \ - $(KBUILD_IMAGE) System.map "$(INSTALL_PATH)" + $(call cmd,install) PHONY += vdso_install vdso_install: diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh old mode 100644 new mode 100755 index d13ec1c38640..0849f4b42745 --- a/arch/x86/boot/install.sh +++ b/arch/x86/boot/install.sh @@ -15,28 +15,6 @@ # $2 - kernel image file # $3 - kernel map file # $4 - default install path (blank if root directory) -# - -verify () { - if [ ! -f "$1" ]; then - echo "" 1>&2 - echo " *** Missing file: $1" 1>&2 - echo ' *** You need to run "make" before "make install".' 1>&2 - echo "" 1>&2 - exit 1 - fi -} - -# Make sure the files actually exist -verify "$2" -verify "$3" - -# User may have a custom install script - -if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi -if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi - -# Default install - same as make zlilo if [ -f $4/vmlinuz ]; then mv $4/vmlinuz $4/vmlinuz.old -- cgit From c2a960f7c5741cc4f03b4e587afeb89ad53c32c5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:09 -0700 Subject: perf/x86: Add new Alder Lake and Raptor Lake support From PMU's perspective, there is no difference for the new Alder Lake N and Raptor Lake P. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fc7f458eb3de..955ae91c56dc 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6216,7 +6216,9 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: /* * Alder Lake has 2 types of CPU, core and atom. * -- cgit From d773a73366bd54d0c75c533269fe2f0765ce42ee Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:10 -0700 Subject: perf/x86/msr: Add new Alder Lake and Raptor Lake support The new Alder Lake N and Raptor Lake P also support PPERF and SMI_COUNT MSRs. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-2-kan.liang@linux.intel.com --- arch/x86/events/msr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 6d759f88315c..ac542f98c070 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -103,7 +103,9 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ROCKETLAKE: case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; -- cgit From cd971104ac7e41ff66082b9b584d319bb0688a1a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:11 -0700 Subject: perf/x86/cstate: Add new Alder Lake and Raptor Lake support From the perspective of Intel cstate residency counters, there is nothing changed for the new Alder Lake N and Raptor Lake P. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-3-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 48e5db21142c..8ec23f47fee9 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -682,7 +682,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -- cgit From e5ae168e8394dc3c6dce580690c87ff2cf16cdbb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:12 -0700 Subject: perf/x86/uncore: Clean up uncore_pci_ids[] The initialization code to assign PCI IDs for different platforms is similar. Add the new macros to reduce the redundant code. 
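To make the cleanup concrete, every IMC entry in the tables below collapses into a single macro invocation; for example, IMC_UNCORE_DEV(SNB), per the macro added in the patch, expands to exactly the initializer that used to be open-coded:

	/* IMC_UNCORE_DEV(SNB), after preprocessing: */
	{
		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
	},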
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-4-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snb.c | 402 ++++++++----------------------------- 1 file changed, 86 insertions(+), 316 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 4262351f52b6..b30890b91137 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -84,6 +84,13 @@ #define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706 #define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709 + +#define IMC_UNCORE_DEV(a) \ +{ \ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_##a##_IMC), \ + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), \ +} + /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff #define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 @@ -849,242 +856,80 @@ static struct intel_uncore_type *snb_pci_uncores[] = { }; static const struct pci_device_id snb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(SNB), { /* end: all zeroes */ }, }; static const struct pci_device_id ivb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(IVB), + IMC_UNCORE_DEV(IVB_E3), { /* end: all zeroes */ }, }; static const struct pci_device_id hsw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(HSW), + IMC_UNCORE_DEV(HSW_U), { /* end: all zeroes */ }, }; static const struct pci_device_id bdw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(BDW), { /* end: all zeroes */ }, }; static const struct pci_device_id skl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC), - .driver_data = 
UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC), - .driver_data = 
UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(SKL_Y), + IMC_UNCORE_DEV(SKL_U), + IMC_UNCORE_DEV(SKL_HD), + IMC_UNCORE_DEV(SKL_HQ), + IMC_UNCORE_DEV(SKL_SD), + IMC_UNCORE_DEV(SKL_SQ), + IMC_UNCORE_DEV(SKL_E3), + IMC_UNCORE_DEV(KBL_Y), + IMC_UNCORE_DEV(KBL_U), + IMC_UNCORE_DEV(KBL_UQ), + IMC_UNCORE_DEV(KBL_SD), + IMC_UNCORE_DEV(KBL_SQ), + IMC_UNCORE_DEV(KBL_HQ), + IMC_UNCORE_DEV(KBL_WQ), + IMC_UNCORE_DEV(CFL_2U), + IMC_UNCORE_DEV(CFL_4U), + IMC_UNCORE_DEV(CFL_4H), + IMC_UNCORE_DEV(CFL_6H), + IMC_UNCORE_DEV(CFL_2S_D), + IMC_UNCORE_DEV(CFL_4S_D), + IMC_UNCORE_DEV(CFL_6S_D), + IMC_UNCORE_DEV(CFL_8S_D), + IMC_UNCORE_DEV(CFL_4S_W), + IMC_UNCORE_DEV(CFL_6S_W), + IMC_UNCORE_DEV(CFL_8S_W), + IMC_UNCORE_DEV(CFL_4S_S), + IMC_UNCORE_DEV(CFL_6S_S), + IMC_UNCORE_DEV(CFL_8S_S), + IMC_UNCORE_DEV(AML_YD), + IMC_UNCORE_DEV(AML_YQ), + IMC_UNCORE_DEV(WHL_UQ), + IMC_UNCORE_DEV(WHL_4_UQ), + IMC_UNCORE_DEV(WHL_UD), + IMC_UNCORE_DEV(CML_H1), + IMC_UNCORE_DEV(CML_H2), + IMC_UNCORE_DEV(CML_H3), + IMC_UNCORE_DEV(CML_U1), + IMC_UNCORE_DEV(CML_U2), + IMC_UNCORE_DEV(CML_U3), + IMC_UNCORE_DEV(CML_S1), + IMC_UNCORE_DEV(CML_S2), + IMC_UNCORE_DEV(CML_S3), + IMC_UNCORE_DEV(CML_S4), + IMC_UNCORE_DEV(CML_S5), { /* end: all zeroes */ }, }; static const struct pci_device_id icl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL_RKL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(ICL_U), + IMC_UNCORE_DEV(ICL_U2), + IMC_UNCORE_DEV(RKL_1), + IMC_UNCORE_DEV(RKL_2), { /* end: all zeroes */ }, }; @@ -1326,106 +1171,31 @@ void nhm_uncore_cpu_init(void) /* Tiger Lake MMIO uncore support */ static const struct pci_device_id tgl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_1_IMC), - .driver_data = 
UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(TGL_U1), + IMC_UNCORE_DEV(TGL_U2), + IMC_UNCORE_DEV(TGL_U3), + IMC_UNCORE_DEV(TGL_U4), + IMC_UNCORE_DEV(TGL_H), + IMC_UNCORE_DEV(ADL_1), + IMC_UNCORE_DEV(ADL_2), + IMC_UNCORE_DEV(ADL_3), + IMC_UNCORE_DEV(ADL_4), + IMC_UNCORE_DEV(ADL_5), + IMC_UNCORE_DEV(ADL_6), + IMC_UNCORE_DEV(ADL_7), + IMC_UNCORE_DEV(ADL_8), + IMC_UNCORE_DEV(ADL_9), + IMC_UNCORE_DEV(ADL_10), + IMC_UNCORE_DEV(ADL_11), + IMC_UNCORE_DEV(ADL_12), + IMC_UNCORE_DEV(ADL_13), + IMC_UNCORE_DEV(ADL_14), + IMC_UNCORE_DEV(ADL_15), + IMC_UNCORE_DEV(ADL_16), + IMC_UNCORE_DEV(RPL_1), + IMC_UNCORE_DEV(RPL_2), + IMC_UNCORE_DEV(RPL_3), + IMC_UNCORE_DEV(RPL_4), { /* end: all zeroes */ } }; -- cgit From f758bc5a91233bb5b5b6994a8e72ba4eba0e9ab2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:13 -0700 Subject: perf/x86/uncore: Add new Alder Lake and Raptor Lake support From the perspective of the uncore PMU, there is nothing changed for the new Alder Lake N and Raptor Lake P. Add new PCIIDs of IMC. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-5-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 2 ++ arch/x86/events/intel/uncore_snb.c | 52 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7695dcae280e..db6c31bca809 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1828,7 +1828,9 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index b30890b91137..ce440011cc4e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -79,10 +79,36 @@ #define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650 #define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668 #define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670 +#define PCI_DEVICE_ID_INTEL_ADL_17_IMC 0x4614 +#define PCI_DEVICE_ID_INTEL_ADL_18_IMC 0x4617 +#define PCI_DEVICE_ID_INTEL_ADL_19_IMC 0x4618 +#define PCI_DEVICE_ID_INTEL_ADL_20_IMC 0x461B +#define PCI_DEVICE_ID_INTEL_ADL_21_IMC 0x461C #define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700 #define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702 #define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706 #define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_5_IMC 0xA701 +#define PCI_DEVICE_ID_INTEL_RPL_6_IMC 0xA703 +#define PCI_DEVICE_ID_INTEL_RPL_7_IMC 0xA704 +#define PCI_DEVICE_ID_INTEL_RPL_8_IMC 0xA705 +#define 
PCI_DEVICE_ID_INTEL_RPL_9_IMC 0xA706 +#define PCI_DEVICE_ID_INTEL_RPL_10_IMC 0xA707 +#define PCI_DEVICE_ID_INTEL_RPL_11_IMC 0xA708 +#define PCI_DEVICE_ID_INTEL_RPL_12_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_13_IMC 0xA70a +#define PCI_DEVICE_ID_INTEL_RPL_14_IMC 0xA70b +#define PCI_DEVICE_ID_INTEL_RPL_15_IMC 0xA715 +#define PCI_DEVICE_ID_INTEL_RPL_16_IMC 0xA716 +#define PCI_DEVICE_ID_INTEL_RPL_17_IMC 0xA717 +#define PCI_DEVICE_ID_INTEL_RPL_18_IMC 0xA718 +#define PCI_DEVICE_ID_INTEL_RPL_19_IMC 0xA719 +#define PCI_DEVICE_ID_INTEL_RPL_20_IMC 0xA71A +#define PCI_DEVICE_ID_INTEL_RPL_21_IMC 0xA71B +#define PCI_DEVICE_ID_INTEL_RPL_22_IMC 0xA71C +#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728 +#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729 +#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A #define IMC_UNCORE_DEV(a) \ @@ -1192,10 +1218,36 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = { IMC_UNCORE_DEV(ADL_14), IMC_UNCORE_DEV(ADL_15), IMC_UNCORE_DEV(ADL_16), + IMC_UNCORE_DEV(ADL_17), + IMC_UNCORE_DEV(ADL_18), + IMC_UNCORE_DEV(ADL_19), + IMC_UNCORE_DEV(ADL_20), + IMC_UNCORE_DEV(ADL_21), IMC_UNCORE_DEV(RPL_1), IMC_UNCORE_DEV(RPL_2), IMC_UNCORE_DEV(RPL_3), IMC_UNCORE_DEV(RPL_4), + IMC_UNCORE_DEV(RPL_5), + IMC_UNCORE_DEV(RPL_6), + IMC_UNCORE_DEV(RPL_7), + IMC_UNCORE_DEV(RPL_8), + IMC_UNCORE_DEV(RPL_9), + IMC_UNCORE_DEV(RPL_10), + IMC_UNCORE_DEV(RPL_11), + IMC_UNCORE_DEV(RPL_12), + IMC_UNCORE_DEV(RPL_13), + IMC_UNCORE_DEV(RPL_14), + IMC_UNCORE_DEV(RPL_15), + IMC_UNCORE_DEV(RPL_16), + IMC_UNCORE_DEV(RPL_17), + IMC_UNCORE_DEV(RPL_18), + IMC_UNCORE_DEV(RPL_19), + IMC_UNCORE_DEV(RPL_20), + IMC_UNCORE_DEV(RPL_21), + IMC_UNCORE_DEV(RPL_22), + IMC_UNCORE_DEV(RPL_23), + IMC_UNCORE_DEV(RPL_24), + IMC_UNCORE_DEV(RPL_25), { /* end: all zeroes */ } }; -- cgit From 39b2ca75eec8a33e2ffdb8aa0c4840ec3e3b472c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:07 +0530 Subject: perf/amd/ibs: Cascade pmu init functions' return value IBS pmu initialization code ignores return value provided by callee functions. Fix it. 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-2-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 11e8b493e015..2704ec1e42a3 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -777,9 +777,10 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init void perf_event_ibs_init(void) +static __init int perf_event_ibs_init(void) { struct attribute **attr = ibs_op_format_attrs; + int ret; /* * Some chips fail to reset the fetch count when it is written; instead @@ -791,7 +792,9 @@ static __init void perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ret) + return ret; if (ibs_caps & IBS_CAPS_OPCNT) { perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -804,15 +807,35 @@ static __init void perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + if (ret) + goto err_op; + + ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ret) + goto err_nmi; - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + return 0; + +err_nmi: + perf_pmu_unregister(&perf_ibs_op.pmu); + free_percpu(perf_ibs_op.pcpu); + perf_ibs_op.pcpu = NULL; +err_op: + perf_pmu_unregister(&perf_ibs_fetch.pmu); + free_percpu(perf_ibs_fetch.pcpu); + perf_ibs_fetch.pcpu = NULL; + + return ret; } #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ -static __init void perf_event_ibs_init(void) { } +static __init int perf_event_ibs_init(void) +{ + return 0; +} #endif @@ -1082,9 +1105,7 @@ static __init int amd_ibs_init(void) x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); - perf_event_ibs_init(); - - return 0; + return perf_event_ibs_init(); } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ -- cgit From 2a7a7e658682bfd7501dc6b4c9d365aa6c79788a Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:08 +0530 Subject: perf/amd/ibs: Use ->is_visible callback for dynamic attributes Currently, some attributes are added at build time whereas others at boot time depending on IBS pmu capabilities. Instead, we can just add all attribute groups at build time but hide individual group at boot time using more appropriate ->is_visible() callback. Also, struct perf_ibs has bunch of fields for pmu attributes which just pass on the pointer, does not do anything else. Remove them. 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-3-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 78 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2704ec1e42a3..ece4f6a7d24b 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -94,10 +94,6 @@ struct perf_ibs { unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - u64 (*get_count)(u64 config); }; @@ -528,16 +524,61 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +/* + * We need to initialize with empty group if all attributes in the + * group are dynamic. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group empty_format_group = { + .name = "format", + .attrs = attrs_empty, +}; + +static const struct attribute_group *empty_attr_groups[] = { + &empty_format_group, + NULL, +}; + PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); -static struct attribute *ibs_fetch_format_attrs[] = { +static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ +static struct attribute_group group_rand_en = { + .name = "format", + .attrs = rand_en_attrs, +}; + +static const struct attribute_group *fetch_attr_groups[] = { + &group_rand_en, + NULL, +}; + +static umode_t +cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPCNT ? 
attr->mode : 0; +} + +static struct attribute *cnt_ctl_attrs[] = { + &format_attr_cnt_ctl.attr, + NULL, +}; + +static struct attribute_group group_cnt_ctl = { + .name = "format", + .attrs = cnt_ctl_attrs, + .is_visible = cnt_ctl_is_visible, +}; + +static const struct attribute_group *op_attr_update[] = { + &group_cnt_ctl, NULL, }; @@ -561,7 +602,6 @@ static struct perf_ibs perf_ibs_fetch = { .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, .get_count = get_ibs_fetch_count, }; @@ -587,7 +627,6 @@ static struct perf_ibs perf_ibs_op = { .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, .get_count = get_ibs_op_count, }; @@ -757,17 +796,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) perf_ibs->pcpu = pcpu; - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); if (ret) { perf_ibs->pcpu = NULL; @@ -779,7 +807,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) static __init int perf_event_ibs_init(void) { - struct attribute **attr = ibs_op_format_attrs; int ret; /* @@ -792,14 +819,14 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ret) return ret; - if (ibs_caps & IBS_CAPS_OPCNT) { + if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; @@ -807,6 +834,9 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_update = op_attr_update; + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); if (ret) goto err_op; -- cgit From ba5d35b442c65f32d38ef61f732218274c6dcf4c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:09 +0530 Subject: perf/amd/ibs: Add support for L3 miss filtering IBS L3 miss filtering works by tagging an instruction on IBS counter overflow and generating an NMI if the tagged instruction causes an L3 miss. Samples without an L3 miss are discarded and counter is reset with random value (between 1-15 for fetch pmu and 1-127 for op pmu). This helps in reducing sampling overhead when user is interested only in such samples. One of the use case of such filtered samples is to feed data to page-migration daemon in tiered memory systems. Add support for L3 miss filtering in IBS driver via new pmu attribute "l3missonly". 
Example usage: # perf record -a -e ibs_op/l3missonly=1/ --raw-samples sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-4-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 67 +++++++++++++++++++++++++++++++++++---- arch/x86/include/asm/perf_event.h | 3 ++ 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ece4f6a7d24b..2dc8b7ec030a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -544,22 +544,46 @@ static const struct attribute_group *empty_attr_groups[] = { PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); + +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; +} static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, }; +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, NULL, }; +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + NULL, +}; + static umode_t cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) { @@ -571,14 +595,26 @@ static struct attribute *cnt_ctl_attrs[] = { NULL, }; +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, .is_visible = cnt_ctl_is_visible, }; +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, + &group_op_l3missonly, NULL, }; @@ -805,10 +841,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init int perf_event_ibs_init(void) +static __init int perf_ibs_fetch_init(void) { - int ret; - /* * Some chips fail to reset the fetch count when it is written; instead * they need a 0-1 transition of IbsFetchEn. 
@@ -819,12 +853,17 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - if (ret) - return ret; + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} +static __init int perf_ibs_op_init(void) +{ if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -834,10 +873,24 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + perf_ibs_op.pmu.attr_groups = empty_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); if (ret) goto err_op; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7aa1d420c779..409725e86f42 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -410,6 +410,7 @@ struct pebs_xmm { #define IBS_CAPS_OPBRNFUSE (1U<<8) #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) +#define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -423,6 +424,7 @@ struct pebs_xmm { #define IBSCTL_LVT_OFFSET_MASK 0x0F /* IBS fetch bits/masks */ +#define IBS_FETCH_L3MISSONLY (1ULL<<59) #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) @@ -439,6 +441,7 @@ struct pebs_xmm { #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_L3MISSONLY (1ULL<<16) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ -- cgit From 838de1d843fc9b6161e0e1c6308a8c027d08606d Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:10 +0530 Subject: perf/amd/ibs: Advertise zen4_ibs_extensions as pmu capability attribute PMU driver can advertise certain feature via capability attribute('caps' sysfs directory) which can be consumed by userspace tools like perf. Add zen4_ibs_extensions capability attribute for IBS pmus. This attribute will be enabled when CPUID_Fn8000001B_EAX[11] is set. 
With patch on Zen4: $ ls /sys/bus/event_source/devices/ibs_op/caps zen4_ibs_extensions Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-5-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2dc8b7ec030a..c251bc44c088 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -537,8 +537,14 @@ static struct attribute_group empty_format_group = { .attrs = attrs_empty, }; +static struct attribute_group empty_caps_group = { + .name = "caps", + .attrs = attrs_empty, +}; + static const struct attribute_group *empty_attr_groups[] = { &empty_format_group, + &empty_caps_group, NULL, }; @@ -546,6 +552,7 @@ PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -563,6 +570,11 @@ static struct attribute *fetch_l3missonly_attrs[] = { NULL, }; +static struct attribute *zen4_ibs_extensions_attrs[] = { + &zen4_ibs_extensions.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -574,13 +586,21 @@ static struct attribute_group group_fetch_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_zen4_ibs_extensions = { + .name = "caps", + .attrs = zen4_ibs_extensions_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, + &empty_caps_group, NULL, }; static const struct attribute_group *fetch_attr_update[] = { &group_fetch_l3missonly, + &group_zen4_ibs_extensions, NULL, }; @@ -615,6 +635,7 @@ static struct attribute_group group_op_l3missonly = { static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, + &group_zen4_ibs_extensions, NULL, }; -- cgit From 9cb23f598c641c1dcbe18defd219cdc439bc94a8 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:14 +0530 Subject: perf/ibs: Fix comment s/IBS Op Data 2/IBS Op Data 1/ for MSR 0xc0011035. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-9-ravi.bangoria@amd.com --- arch/x86/include/asm/amd-ibs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index 46e1df45efc0..aabdbb5ab920 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -49,7 +49,7 @@ union ibs_op_ctl { }; }; -/* MSR 0xc0011035: IBS Op Data 2 */ +/* MSR 0xc0011035: IBS Op Data 1 */ union ibs_op_data { __u64 val; struct { -- cgit From f1f8288d19d03af9d03db219c23d07a6e8ecd51b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 9 May 2022 08:44:23 -0700 Subject: x86/hyperv: Disable hardlockup detector by default in Hyper-V guests In newer versions of Hyper-V, the x86/x64 PMU can be virtualized into guest VMs by explicitly enabling it. Linux kernels are typically built to automatically enable the hardlockup detector if the PMU is found. 
To prevent the possibility of false positives due to the vagaries of VM scheduling, disable the PMU-based hardlockup detector by default in a VM on Hyper-V. The hardlockup detector can still be enabled by overriding the default with the nmi_watchdog=1 option on the kernel boot line or via sysctl at runtime. This change mimics the approach taken with KVM guests in commit 692297d8f968 ("watchdog: introduce the hardlockup_detector_disable() function"). Linux on ARM64 does not provide a PMU-based hardlockup detector, so there's no corresponding disable in the Hyper-V init code on ARM64. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1652111063-6535-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 4b67094215bb..16dad62c8155 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -465,6 +465,8 @@ static void __init ms_hyperv_init_platform(void) */ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) mark_tsc_unstable("running on Hyper-V"); + + hardlockup_detector_disable(); } static bool __init ms_hyperv_x2apic_available(void) -- cgit From 24773e6c7a27bfc724a8ed5523dc31bbfc9543d5 Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Wed, 11 May 2022 13:16:05 +0800 Subject: x86: ACPI: Make mp_config_acpi_gsi() a void function Because the return value of mp_config_acpi_gsi() is not used, change it into a void function. Signed-off-by: Li kunyu [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0d01e7f5078c..7e32e33d52fa 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -375,7 +375,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, isa_irq_to_gsi[bus_irq] = gsi; } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, +static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { #ifdef CONFIG_X86_MPPARSE @@ -387,9 +387,9 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, u8 pin; if (!acpi_ioapic) - return 0; + return; if (!dev || !dev_is_pci(dev)) - return 0; + return; pdev = to_pci_dev(dev); number = pdev->bus->number; @@ -408,7 +408,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_save_irq(&mp_irq); #endif - return 0; } static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, -- cgit From 566fb90e050dfa2132340bbdab9533b727dda6f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 Apr 2022 06:37:57 +0200 Subject: swiotlb-xen: fix DMA_ATTR_NO_KERNEL_MAPPING on arm swiotlb-xen uses very different ways to allocate coherent memory on x86 vs arm. On the former it allocates memory from the page allocator, while on the latter it reuses the dma-direct allocator that handles the complexities of non-coherent DMA on arm platforms. Unfortunately the complexities of trying to deal with the two cases in the swiotlb-xen.c code lead to a bug in the handling of DMA_ATTR_NO_KERNEL_MAPPING on arm.
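For context, that attribute is requested by drivers through dma_alloc_attrs(); a minimal, purely illustrative sketch (not taken from this patch) of such an allocation, where only the returned DMA address may be programmed into the device:

	/* Illustration only: allocate device-visible memory with no kernel mapping. */
	void *cookie;
	dma_addr_t dma;

	cookie = dma_alloc_attrs(dev, size, &dma, GFP_KERNEL,
				 DMA_ATTR_NO_KERNEL_MAPPING);
	if (!cookie)
		return -ENOMEM;	/* allocation failed */
	/* use only "dma" with the device; release with dma_free_attrs() */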
With the DMA_ATTR_NO_KERNEL_MAPPING flag the coherent memory allocator does not actually allocate coherent memory, but just a DMA handle for some memory that is DMA addressable by the device, but which does not have to have a kernel mapping. Thus dereferencing the return value will lead to kernel crashed and memory corruption. Fix this by using the dma-direct allocator directly for arm, which works perfectly fine because on arm swiotlb-xen is only used when the domain is 1:1 mapped, and then simplifying the remaining code to only cater for the x86 case with DMA coherent device. Reported-by: Rahul Singh Signed-off-by: Christoph Hellwig Reviewed-by: Rahul Singh Reviewed-by: Stefano Stabellini Tested-by: Rahul Singh --- arch/x86/include/asm/xen/page-coherent.h | 24 ------------------------ arch/x86/include/asm/xen/swiotlb-xen.h | 6 ++++++ arch/x86/xen/mmu_pv.c | 1 + 3 files changed, 7 insertions(+), 24 deletions(-) delete mode 100644 arch/x86/include/asm/xen/page-coherent.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h deleted file mode 100644 index 63cd41b2e17a..000000000000 --- a/arch/x86/include/asm/xen/page-coherent.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_XEN_PAGE_COHERENT_H -#define _ASM_X86_XEN_PAGE_COHERENT_H - -#include -#include - -static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) -{ - void *vstart = (void*)__get_free_pages(flags, get_order(size)); - *dma_handle = virt_to_phys(vstart); - return vstart; -} - -static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - unsigned long attrs) -{ - free_pages((unsigned long) cpu_addr, get_order(size)); -} - -#endif /* _ASM_X86_XEN_PAGE_COHERENT_H */ diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index e5a90b42e4dd..77a2d19cc990 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -8,4 +8,10 @@ extern int pci_xen_swiotlb_init_late(void); static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif +int xen_swiotlb_fixup(void *buf, unsigned long nslabs); +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle); +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); + #endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 00354866921b..ee29fb558f2e 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "multicalls.h" #include "mmu.h" -- cgit From bf00745e7791fe2ba7941aeead8528075a158bbe Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 11 May 2022 10:38:53 -0700 Subject: x86/vsyscall: Remove CONFIG_LEGACY_VSYSCALL_EMULATE CONFIG_LEGACY_VSYSCALL_EMULATE is, as far as I know, only needed for the combined use of exotic and outdated debugging mechanisms with outdated binaries. At this point, no one should be using it. Eventually, dynamic switching of vsyscalls will be implemented, but this is much more complicated to support in EMULATE mode than XONLY mode. So let's force all the distros off of EMULATE mode. 
If anyone actually needs it, they can set vsyscall=emulate, and the kernel can then get away with refusing to support newer security models if that option is set. [ bp: Remove "we"s. ] Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Reviewed-by: Kees Cook Acked-by: Florian Weimer Link: https://lore.kernel.org/r/898932fe61db6a9d61bc2458fa2f6049f1ca9f5c.1652290558.git.luto@kernel.org --- arch/x86/Kconfig | 18 +++--------------- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- 2 files changed, 4 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4bed3abf444d..68c669680c16 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2326,7 +2326,9 @@ choice it can be used to assist security vulnerability exploitation. This setting can be changed at boot time via the kernel command - line parameter vsyscall=[emulate|xonly|none]. + line parameter vsyscall=[emulate|xonly|none]. Emulate mode + is deprecated and can only be enabled using the kernel command + line. On a system with recent enough glibc (2.14 or newer) and no static binaries, you can say None without a performance penalty @@ -2334,20 +2336,6 @@ choice If unsure, select "Emulate execution only". - config LEGACY_VSYSCALL_EMULATE - bool "Full emulation" - help - The kernel traps and emulates calls into the fixed vsyscall - address mapping. This makes the mapping non-executable, but - it still contains readable known contents, which could be - used in certain rare security vulnerability exploits. This - configuration is recommended when using legacy userspace - that still uses vsyscalls along with legacy binary - instrumentation tools that require code to be readable. - - An example of this type of legacy userspace is running - Pin on an old binary that still uses vsyscalls. - config LEGACY_VSYSCALL_XONLY bool "Emulate execution only" help diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index fd2ee9408e91..4af81df133ee 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -48,7 +48,7 @@ static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) XONLY; #else - EMULATE; + #error VSYSCALL config is broken #endif static int __init vsyscall_setup(char *str) -- cgit From 6a2d90ba027adba528509ffa27097cffd3879257 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 09:23:55 -0500 Subject: ptrace: Reimplement PTRACE_KILL by always sending SIGKILL The current implementation of PTRACE_KILL is buggy and has been for many years as it assumes its target has stopped in ptrace_stop. At a quick skim it looks like this assumption has existed since ptrace support was added in linux v1.0. While PTRACE_KILL has been deprecated we cannot remove it as a quick search with google code search reveals many existing programs calling it. When the ptracee is not stopped at ptrace_stop some fields would be set that are ignored except in ptrace_stop, making the userspace-visible behavior of PTRACE_KILL a noop in those cases. As the usual rules are not obeyed it is not clear what the consequences are of calling PTRACE_KILL on a running process. Presumably userspace does not do this as it achieves nothing. Replace the implementation of PTRACE_KILL with a simple send_sig_info(SIGKILL) followed by a return 0. This changes the observable user space behavior only in that PTRACE_KILL on a process not stopped in ptrace_stop will also kill it. 
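For reference, the replacement described above amounts to a two-line handler. The placement in the generic ptrace request switch and the SEND_SIG_NOINFO argument are illustrative assumptions; the kernel/ptrace.c change itself is outside this arch/x86 extract:

	case PTRACE_KILL:
		/* Deliver SIGKILL unconditionally, whether or not the tracee is stopped in ptrace_stop(). */
		send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
		return 0;
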
As that has always been the intent of the code this seems like a reasonable change. Cc: stable@vger.kernel.org Reported-by: Al Viro Suggested-by: Al Viro Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-7-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/x86/kernel/step.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 0f3c307b37b3..8e2b2552b5ee 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -180,8 +180,7 @@ void set_task_blockstep(struct task_struct *task, bool on) * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race - * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but - * PTRACE_KILL is not safe. + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); -- cgit From d3287fb0d3c8afdfd4870a6cd4a852abc9008b3b Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Fri, 6 May 2022 15:53:59 -0700 Subject: x86/microcode/intel: Expose collect_cpu_info_early() for IFS IFS is a CPU feature that allows a binary blob, similar to microcode, to be loaded and consumed to perform low-level validation of CPU circuitry. In fact, it carries the same Processor Signature (family/model/stepping) details that are contained in Intel microcode blobs. In support of an IFS driver to trigger loading, validation, and running of these test blobs, make the functionality of cpu_signatures_match() and collect_cpu_info_early() available outside of the microcode driver. Add an "intel_" prefix and drop the "_early" suffix from collect_cpu_info_early() and EXPORT_SYMBOL_GPL() it. Add a declaration to the x86 <asm/cpu.h> header. Make cpu_signatures_match() an inline function in x86 <asm/cpu.h>, and also give it an "intel_" prefix. No functional change intended. Reviewed-by: Dan Williams Signed-off-by: Jithu Joseph Co-developed-by: Tony Luck Signed-off-by: Tony Luck Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220506225410.1652287-2-tony.luck@intel.com Signed-off-by: Hans de Goede --- arch/x86/include/asm/cpu.h | 18 +++++++++++ arch/x86/kernel/cpu/intel.c | 32 +++++++++++++++++++ arch/x86/kernel/cpu/microcode/intel.c | 59 +++++------------------------------ 3 files changed, 57 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..990167357c34 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -76,4 +76,22 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} extern __noendbr void cet_disable(void); +struct ucode_cpu_info; + +int intel_cpu_collect_info(struct ucode_cpu_info *uci); + +static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) +{ + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. 
*/ + return p1 & p2; +} + #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8321c43554a1..29d152219cb6 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,6 +181,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d28a9f8f3fec..025c8f0cd948 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - /* * Returns 1 if update has been found, 0 otherwise. */ @@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) struct extended_signature *ext_sig; int i; - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. 
headers: */ @@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -342,37 +328,6 @@ next: return patch; } -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - static void show_saved_mc(void) { #ifdef DEBUG @@ -386,7 +341,7 @@ static void show_saved_mc(void) return; } - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; @@ -502,7 +457,7 @@ void show_ucode_info_early(void) struct ucode_cpu_info uci; if (delay_ucode_info) { - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); print_ucode_info(&uci, current_mc_date); delay_ucode_info = 0; } @@ -604,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void) if (!(cp.data && cp.size)) return 0; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); @@ -637,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) if (!(cp.data && cp.size)) return NULL; - collect_cpu_info_early(uci); + intel_cpu_collect_info(uci); return scan_microcode(cp.data, cp.size, uci, false); } @@ -712,7 +667,7 @@ void reload_ucode_intel(void) struct microcode_intel *p; struct ucode_cpu_info uci; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); p = find_patch(&uci); if (!p) -- cgit From db1af12929c99d15fc04cc5c4447b87ab51eab0a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 May 2022 15:54:00 -0700 Subject: x86/msr-index: Define INTEGRITY_CAPABILITIES MSR The INTEGRITY_CAPABILITIES MSR is enumerated by bit 2 of the CORE_CAPABILITIES MSR. Add defines for the CORE_CAPS enumeration as well as for the integrity MSR. 
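To give a feel for how a consumer such as the IFS driver might use the defines added in the hunk below, here is a minimal probe sketch; the helper name and control flow are illustrative assumptions, not part of this patch:

	static bool have_periodic_bist(void)
	{
		u64 core_caps, integrity_caps;

		if (!cpu_feature_enabled(X86_FEATURE_CORE_CAPABILITIES))
			return false;

		/* CORE_CAPABILITIES bit 2 enumerates the integrity MSR */
		rdmsrl(MSR_IA32_CORE_CAPS, core_caps);
		if (!(core_caps & MSR_IA32_CORE_CAPS_INTEGRITY_CAPS))
			return false;

		rdmsrl(MSR_INTEGRITY_CAPS, integrity_caps);
		return !!(integrity_caps & MSR_INTEGRITY_CAPS_PERIODIC_BIST);
	}
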
Reviewed-by: Dan Williams Signed-off-by: Tony Luck Reviewed-by: Greg Kroah-Hartman Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220506225410.1652287-3-tony.luck@intel.com Signed-off-by: Hans de Goede --- arch/x86/include/asm/msr-index.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0eb90d21049e..6fc5de0c61a6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -76,6 +76,8 @@ /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ #define MSR_IA32_CORE_CAPS 0x000000cf +#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT 2 +#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS BIT(MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT) #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5 #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT) @@ -154,6 +156,11 @@ #define MSR_IA32_POWER_CTL 0x000001fc #define MSR_IA32_POWER_CTL_BIT_EE 19 +/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */ +#define MSR_INTEGRITY_CAPS 0x000002d9 +#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 +#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) + #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 #define MSR_LBR_CORE_FROM 0x00000040 -- cgit From b28cb0cd2c5e80a8c0feb408a0e4b0dbb6d132c5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 May 2022 14:51:22 +0000 Subject: KVM: x86/mmu: Update number of zapped pages even if page list is stable When zapping obsolete pages, update the running count of zapped pages regardless of whether or not the list has become unstable due to zapping a shadow page with its own child shadow pages. If the VM is backed by mostly 4kb pages, KVM can zap an absurd number of SPTEs without bumping the batch count and thus without yielding. In the worst case scenario, this can cause a soft lockup. 
[dirty_log_perf_:13020] RIP: 0010:workingset_activation+0x19/0x130 mark_page_accessed+0x266/0x2e0 kvm_set_pfn_accessed+0x31/0x40 mmu_spte_clear_track_bits+0x136/0x1c0 drop_spte+0x1a/0xc0 mmu_page_zap_pte+0xef/0x120 __kvm_mmu_prepare_zap_page+0x205/0x5e0 kvm_mmu_zap_all_fast+0xd7/0x190 kvm_mmu_invalidate_zap_pages_in_memslot+0xe/0x10 kvm_page_track_flush_slot+0x5c/0x80 kvm_arch_flush_shadow_memslot+0xe/0x10 kvm_set_memslot+0x1a8/0x5d0 __kvm_set_memory_region+0x337/0x590 kvm_vm_ioctl+0xb08/0x1040 Fixes: fbb158cb88b6 ("KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch""") Reported-by: David Matlack Reviewed-by: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220511145122.3133334-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 311e4e1d7870..56ebc4fb7f91 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5665,6 +5665,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + bool unstable; restart: list_for_each_entry_safe_reverse(sp, node, @@ -5696,11 +5697,12 @@ restart: goto restart; } - if (__kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { - batch += nr_zapped; + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages, &nr_zapped); + batch += nr_zapped; + + if (unstable) goto restart; - } } /* -- cgit From bc469ddf67154a4840267132e87ce0d8b72d4952 Mon Sep 17 00:00:00 2001 From: Zucheng Zheng Date: Thu, 21 Apr 2022 19:10:31 +0800 Subject: perf/x86/amd: Remove unused variable 'hwc' 'hwc' is never used in amd_pmu_enable_all(), so remove it. Signed-off-by: Zucheng Zheng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220421111031.174698-1-zhengzucheng@huawei.com --- arch/x86/events/amd/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 262e39a85031..d81eac2284ea 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -771,14 +771,11 @@ static void amd_pmu_enable_event(struct perf_event *event) static void amd_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc; int idx; amd_brs_enable_all(); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - hwc = &cpuc->events[idx]->hw; - /* only activate events which are marked as active */ if (!test_bit(idx, cpuc->active_mask)) continue; -- cgit From f5c0b4f30416c670408a77be94703d04d22b57df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 May 2022 14:04:08 +0200 Subject: x86/prctl: Remove pointless task argument The functions invoked via do_arch_prctl_common() can only operate on the current task and none of these function uses the task argument. 
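As a userspace-side illustration of why the task argument is pointless: arch_prctl() has no way to name another task, so these operations always apply to the caller. A minimal, hypothetical example:

	#include <asm/prctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Query the CPUID-faulting state of the calling task; note there is no pid/task parameter. */
	static long get_cpuid_state(void)
	{
		return syscall(SYS_arch_prctl, ARCH_GET_CPUID, 0);
	}
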
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/87lev7vtxj.ffs@tglx --- arch/x86/include/asm/fpu/api.h | 3 +-- arch/x86/include/asm/proto.h | 3 +-- arch/x86/kernel/fpu/xstate.c | 5 +---- arch/x86/kernel/process.c | 9 ++++----- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 4 ++-- 6 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index c83b3020350a..6b0f31fb53f7 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -162,7 +162,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) } /* prctl */ -struct task_struct; -extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); +extern long fpu_xstate_prctl(int option, unsigned long arg2); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index feed36d44d04..80be803b96d2 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -39,7 +39,6 @@ void x86_report_nx(void); extern int reboot_force; -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2); +long do_arch_prctl_common(int option, unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 39e1c8626ab9..1b016a186c11 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1687,16 +1687,13 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ -long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; - if (tsk != current) - return -EPERM; - switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b3d2d414cfad..e86d09a6aac7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -334,7 +334,7 @@ static int get_cpuid_mode(void) return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@ -985,20 +985,19 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, arg2); + return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: - return fpu_xstate_prctl(task, option, arg2); + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26edb1cd07a4..0faa5e28dd64 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -222,5 +222,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) SYSCALL_DEFINE2(arch_prctl, int, option, unsigned 
long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e459253649be..1962008fe743 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -844,7 +844,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) - ret = do_arch_prctl_common(current, option, arg2); + ret = do_arch_prctl_common(option, arg2); return ret; } @@ -852,7 +852,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } #endif -- cgit From c9fe66560bf2dc7d109754414e309888cb8c9ba9 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm/mprotect: do not flush when not required architecturally Currently, using mprotect() to unprotect a memory region or uffd to unprotect a memory region causes a TLB flush. However, in such cases the PTE is often not modified (i.e., remains RO) and therefore no TLB flush is needed. Add an arch-specific pte_needs_flush() which tells whether a TLB flush is needed based on the old PTE and the new one. Implement an x86 pte_needs_flush(). Always flush the TLB when it is architecturally needed, even in cases where skipping the flush would only result in spurious page faults. Even with such a conservative approach, we can in the future further refine the checks to test whether a PTE is present by only considering the architectural _PAGE_PRESENT flag instead of {pte|pmd}_present(). For now, be careful and use the latter. Link: https://lkml.kernel.org/r/20220401180821.1986781-3-namit@vmware.com Signed-off-by: Nadav Amit Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Cc: Andrew Cooper Cc: Peter Xu Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable_types.h | 2 + arch/x86/include/asm/tlbflush.h | 97 ++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 40497a9020c6..8668bc661026 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -110,9 +110,11 @@ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) +#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4) #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_DEVMAP (_AT(pteval_t, 0)) +#define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 98fa0a114074..4af5579c7ef7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -259,6 +259,103 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); +static inline bool pte_flags_need_flush(unsigned long oldflags, + unsigned long newflags, + bool ignore_access) +{ + /* + * Flags that require a flush when cleared but not when they are set. 
+ * Only include flags that would not trigger spurious page-faults. + * Non-present entries are not cached. Hardware would set the + * dirty/access bit if needed without a fault. + */ + const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | + _PAGE_ACCESSED; + const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | + _PAGE_SOFTW3 | _PAGE_SOFTW4; + const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | + _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | + _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | + _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX; + unsigned long diff = oldflags ^ newflags; + + BUILD_BUG_ON(flush_on_clear & software_flags); + BUILD_BUG_ON(flush_on_clear & flush_on_change); + BUILD_BUG_ON(flush_on_change & software_flags); + + /* Ignore software flags */ + diff &= ~software_flags; + + if (ignore_access) + diff &= ~_PAGE_ACCESSED; + + /* + * Did any of the 'flush_on_clear' flags was clleared set from between + * 'oldflags' and 'newflags'? + */ + if (diff & oldflags & flush_on_clear) + return true; + + /* Flush on modified flags. */ + if (diff & flush_on_change) + return true; + + /* Ensure there are no flags that were left behind */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + (diff & ~(flush_on_clear | software_flags | flush_on_change))) { + VM_WARN_ON_ONCE(1); + return true; + } + + return false; +} + +/* + * pte_needs_flush() checks whether permissions were demoted and require a + * flush. It should only be used for userspace PTEs. + */ +static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) +{ + /* !PRESENT -> * ; no need for flush */ + if (!(pte_flags(oldpte) & _PAGE_PRESENT)) + return false; + + /* PFN changed ; needs flush */ + if (pte_pfn(oldpte) != pte_pfn(newpte)) + return true; + + /* + * check PTE flags; ignore access-bit; see comment in + * ptep_clear_flush_young(). + */ + return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte), + true); +} +#define pte_needs_flush pte_needs_flush + +/* + * huge_pmd_needs_flush() checks whether permissions were demoted and require a + * flush. It should only be used for userspace huge PMDs. + */ +static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) +{ + /* !PRESENT -> * ; no need for flush */ + if (!(pmd_flags(oldpmd) & _PAGE_PRESENT)) + return false; + + /* PFN changed ; needs flush */ + if (pmd_pfn(oldpmd) != pmd_pfn(newpmd)) + return true; + + /* + * check PMD flags; do not ignore access-bit; see + * pmdp_clear_flush_young(). + */ + return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd), + false); +} +#define huge_pmd_needs_flush huge_pmd_needs_flush + #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) -- cgit From 4f83145721f362c2f4d312edc4755269a2069488 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm: avoid unnecessary flush on change_huge_pmd() Calls to change_protection_range() on THP can trigger, at least on x86, two TLB flushes for one page: one immediately, when pmdp_invalidate() is called by change_huge_pmd(), and then another one later (that can be batched) when change_protection_range() finishes. The first TLB flush is only necessary to prevent the dirty bit (and with a lesser importance the access bit) from changing while the PTE is modified. However, this is not necessary as the x86 CPUs set the dirty-bit atomically with an additional check that the PTE is (still) present. One caveat is Intel's Knights Landing that has a bug and does not do so. 
Leverage this behavior to eliminate the unnecessary TLB flush in change_huge_pmd(). Introduce a new arch specific pmdp_invalidate_ad() that only invalidates the access and dirty bit from further changes. Link: https://lkml.kernel.org/r/20220401180821.1986781-4-namit@vmware.com Signed-off-by: Nadav Amit Cc: Andrea Arcangeli Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Xu Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 5 +++++ arch/x86/mm/pgtable.c | 10 ++++++++++ 2 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1e09519af143..0821f87d495f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1168,6 +1168,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } } #endif + +#define __HAVE_ARCH_PMDP_INVALIDATE_AD +extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3481b35cb4ec..f16059e9a85e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -608,6 +608,16 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, return young; } + +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + /* + * No flush is necessary. Once an invalid PTE is established, the PTE's + * access and dirty bits cannot be updated. + */ + return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); +} #endif /** -- cgit From e5a554014618308f046af99ab9c950165ed6cb11 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: move pxx_user_accessible_page into x86 The pxx_user_accessible_page() checks the PTE bit, it's architecture-specific code, move them into x86's pgtable.h. These helpers are being moved out to make the page table check framework platform independent. Link: https://lkml.kernel.org/r/20220507110114.4128854-3-tongtiangen@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Tong Tiangen Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 17 +++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0821f87d495f..46fa65d818bd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1447,6 +1447,23 @@ static inline bool arch_faults_on_old_pte(void) return false; } +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER); +} +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ -- cgit From de8c8e52836d0082188508548d0b939f49f7f0e6 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: add hooks to public helpers Move ptep_clear() to include/linux/pgtable.h and add page table check related hooks to some helpers; this prepares for supporting the page table check feature on new architectures. Optimize the implementation of ptep_clear(): the page table check stubs already provide the interface control, so there is no rationale for doing an IS_ENABLED() check here. For architectures that do not enable CONFIG_PAGE_TABLE_CHECK, they will call fallback page table check stubs[1] when getting their page table helpers[2] in include/linux/pgtable.h. [1] page table check stubs defined in include/linux/page_table_check.h [2] ptep_clear() ptep_get_and_clear() pmdp_huge_get_and_clear() pudp_huge_get_and_clear() Link: https://lkml.kernel.org/r/20220507110114.4128854-4-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Acked-by: Pasha Tatashin Cc: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kefeng Wang Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 46fa65d818bd..44e2d6f1dbaa 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1072,16 +1072,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, return pte; } -#define __HAVE_ARCH_PTEP_CLEAR -static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) - ptep_get_and_clear(mm, addr, ptep); - else - pte_clear(mm, addr, ptep); -} - #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -- cgit From c8db8c2628afc7088a43de3f7cfbcc2ef1f182f7 Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Thu, 12 May 2022 20:23:07 -0700 Subject: mm: functions may simplify the use of return values p4d_clear_huge() may be optimized to have a void return type, which simplifies its usage; the vunmap_p4d_range() caller saves a few steps as a result. 
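The "few steps" saved in the caller can be sketched roughly as follows; the exact mm/vmalloc.c code is outside this arch/x86 extract, so the shape below is an approximation:

	/* before: the caller had to track a return value that is always 0 on x86 */
	cleared = p4d_clear_huge(p4d);
	if (cleared || p4d_bad(*p4d))
		continue;

	/* after: with a void return type the bookkeeping disappears */
	p4d_clear_huge(p4d);
	if (p4d_bad(*p4d))
		continue;
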
Link: https://lkml.kernel.org/r/20220507150630.90399-1-kunyu@nfschina.com Signed-off-by: Li kunyu Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/mm/pgtable.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f16059e9a85e..a932d7712d85 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -686,9 +686,8 @@ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) * * No 512GB pages yet -- always return 0 */ -int p4d_clear_huge(p4d_t *p4d) +void p4d_clear_huge(p4d_t *p4d) { - return 0; } #endif -- cgit From 3bd4abc07a267e6a8b33d7f8717136e18f921c53 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 8 Apr 2022 18:03:13 +0200 Subject: x86/tsc: Use fallback for random_get_entropy() instead of zero In the event that random_get_entropy() can't access a cycle counter or similar, falling back to returning 0 is suboptimal. Instead, fallback to calling random_get_entropy_fallback(), which isn't extremely high precision or guaranteed to be entropic, but is certainly better than returning zero all the time. If CONFIG_X86_TSC=n, then it's possible for the kernel to run on systems without RDTSC, such as 486 and certain 586, so the fallback code is only required for that case. As well, fix up both the new function and the get_cycles() function from which it was derived to use cpu_feature_enabled() rather than boot_cpu_has(), and use !IS_ENABLED() instead of #ifndef. Signed-off-by: Jason A. Donenfeld Reviewed-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Borislav Petkov Cc: x86@kernel.org --- arch/x86/include/asm/timex.h | 9 +++++++++ arch/x86/include/asm/tsc.h | 7 +++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index a4a8b1b16c0c..956e4145311b 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -5,6 +5,15 @@ #include #include +static inline unsigned long random_get_entropy(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) + return random_get_entropy_fallback(); + return rdtsc(); +} +#define random_get_entropy random_get_entropy + /* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 01a300a9700b..fbdc3d951494 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -20,13 +20,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { -#ifndef CONFIG_X86_TSC - if (!boot_cpu_has(X86_FEATURE_TSC)) + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) return 0; -#endif - return rdtsc(); } +#define get_cycles get_cycles extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); -- cgit From b3fdf9398a16f01dc013967a4ab25e99c3f4fc12 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 16 May 2022 11:21:46 -0700 Subject: x86/mce: relocate set{clear}_mce_nospec() functions Relocate the twin mce functions to arch/x86/mm/pat/set_memory.c file where they belong. While at it, fixup a function name in a comment. 
Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Signed-off-by: Jane Chu Acked-by: Borislav Petkov Cc: Stephen Rothwell [sfr: gate {set,clear}_mce_nospec() by CONFIG_X86_64] Link: https://lore.kernel.org/r/165272527328.90175.8336008202048685278.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- arch/x86/include/asm/set_memory.h | 52 --------------------------------------- arch/x86/mm/pat/set_memory.c | 50 +++++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 78ca53512486..b45c4d27fd46 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -86,56 +86,4 @@ bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; -#ifdef CONFIG_X86_64 -/* - * Prevent speculative access to the page by either unmapping - * it (if we do not require access to any part of the page) or - * marking it uncacheable (if we want to try to retrieve data - * from non-poisoned lines in the page). - */ -static inline int set_mce_nospec(unsigned long pfn, bool unmap) -{ - unsigned long decoy_addr; - int rc; - - /* SGX pages are not in the 1:1 map */ - if (arch_is_platform_page(pfn << PAGE_SHIFT)) - return 0; - /* - * We would like to just call: - * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); - * but doing that would radically increase the odds of a - * speculative access to the poison page because we'd have - * the virtual address of the kernel 1:1 mapping sitting - * around in registers. - * Instead we get tricky. We create a non-canonical address - * that looks just like the one we want, but has bit 63 flipped. - * This relies on set_memory_XX() properly sanitizing any __pa() - * results with __PHYSICAL_MASK or PTE_PFN_MASK. - */ - decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); - - if (unmap) - rc = set_memory_np(decoy_addr, 1); - else - rc = set_memory_uc(decoy_addr, 1); - if (rc) - pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); - return rc; -} -#define set_mce_nospec set_mce_nospec - -/* Restore full speculative operation to the pfn. */ -static inline int clear_mce_nospec(unsigned long pfn) -{ - return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); -} -#define clear_mce_nospec clear_mce_nospec -#else -/* - * Few people would run a 32-bit kernel on a machine that supports - * recoverable errors because they have too much memory to boot 32-bit. - */ -#endif - #endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index abf5ed76e4b7..0caf4b0edcbc 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include #include @@ -1816,7 +1816,7 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages, } /* - * _set_memory_prot is an internal helper for callers that have been passed + * __set_memory_prot is an internal helper for callers that have been passed * a pgprot_t value from upper layers and a reservation has already been taken. * If you want to set the pgprot to a specific page protocol, use the * set_memory_xx() functions. 
@@ -1925,6 +1925,52 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); +/* + * Prevent speculative access to the page by either unmapping + * it (if we do not require access to any part of the page) or + * marking it uncacheable (if we want to try to retrieve data + * from non-poisoned lines in the page). + */ +#ifdef CONFIG_X86_64 +int set_mce_nospec(unsigned long pfn, bool unmap) +{ + unsigned long decoy_addr; + int rc; + + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; + /* + * We would like to just call: + * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the poison page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_XX() properly sanitizing any __pa() + * results with __PHYSICAL_MASK or PTE_PFN_MASK. + */ + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); + + if (unmap) + rc = set_memory_np(decoy_addr, 1); + else + rc = set_memory_uc(decoy_addr, 1); + if (rc) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + return rc; +} + +/* Restore full speculative operation to the pfn. */ +int clear_mce_nospec(unsigned long pfn) +{ + return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); +} +EXPORT_SYMBOL_GPL(clear_mce_nospec); +#endif /* CONFIG_X86_64 */ + int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) -- cgit From 5898b43af954b83c4a4ee4ab85c4dbafa395822a Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 16 May 2022 11:38:10 -0700 Subject: mce: fix set_mce_nospec to always unmap the whole page The set_memory_uc() approach doesn't work well in all cases. As Dan pointed out when "The VMM unmapped the bad page from guest physical space and passed the machine check to the guest." "The guest gets virtual #MC on an access to that page. When the guest tries to do set_memory_uc() and instructs cpa_flush() to do clean caches that results in taking another fault / exception perhaps because the VMM unmapped the page from the guest." Since the driver has special knowledge to handle NP or UC, mark the poisoned page with NP and let driver handle it when it comes down to repair. Please refer to discussions here for more details. https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/ Now since poisoned page is marked as not-present, in order to avoid writing to a not-present page and trigger kernel Oops, also fix pmem_do_write(). 
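A rough sketch of the pmem_do_write() ordering the last sentence refers to; the nvdimm driver is outside this arch/x86 extract, and the names and signatures here are approximations rather than the actual driver code:

	/* With the poisoned page now mapped not-present, clear known-bad blocks first ... */
	if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
		rc = pmem_clear_poison(pmem, pmem_off, len);
		if (rc != BLK_STS_OK)
			return rc;
	}
	/* ... and only then copy the data, so the driver never stores into an unmapped page. */
	write_pmem(pmem_addr, page, page_off, len);
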
Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()") Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Signed-off-by: Jane Chu Acked-by: Tony Luck Link: https://lore.kernel.org/r/165272615484.103830.2563950688772226611.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- arch/x86/kernel/cpu/mce/core.c | 6 +++--- arch/x86/mm/pat/set_memory.c | 23 +++++++++++------------ 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 981496e6bc0e..fa67bb9d1afe 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, pfn = mce->addr >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { - set_mce_nospec(pfn, whole_page(mce)); + set_mce_nospec(pfn); mce->kflags |= MCE_HANDLED_UC; } @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb) ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); if (!ret) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); sync_core(); return; } @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb) p->mce_count = 0; pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); } static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 0caf4b0edcbc..44f0d4260bd8 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1925,14 +1925,9 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); -/* - * Prevent speculative access to the page by either unmapping - * it (if we do not require access to any part of the page) or - * marking it uncacheable (if we want to try to retrieve data - * from non-poisoned lines in the page). - */ +/* Prevent speculative access to a page by marking it not-present */ #ifdef CONFIG_X86_64 -int set_mce_nospec(unsigned long pfn, bool unmap) +int set_mce_nospec(unsigned long pfn) { unsigned long decoy_addr; int rc; @@ -1954,19 +1949,23 @@ int set_mce_nospec(unsigned long pfn, bool unmap) */ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); - if (unmap) - rc = set_memory_np(decoy_addr, 1); - else - rc = set_memory_uc(decoy_addr, 1); + rc = set_memory_np(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } +static int set_memory_present(unsigned long *addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + /* Restore full speculative operation to the pfn. 
*/ int clear_mce_nospec(unsigned long pfn) { - return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); + unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); + + return set_memory_present(&addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ -- cgit From 6bd429643cc265e94a9d19839c771bcc5d008fa8 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:50:57 -0700 Subject: x86/sgx: Disconnect backing page references from dirty status SGX uses shmem backing storage to store encrypted enclave pages and their crypto metadata when enclave pages are moved out of enclave memory. Two shmem backing storage pages are associated with each enclave page - one backing page to contain the encrypted enclave page data and one backing page (shared by a few enclave pages) to contain the crypto metadata used by the processor to verify the enclave page when it is loaded back into the enclave. sgx_encl_put_backing() is used to release references to the backing storage and, optionally, mark both backing store pages as dirty. Managing references and dirty status together in this way results in both backing store pages marked as dirty, even if only one of the backing store pages are changed. Additionally, waiting until the page reference is dropped to set the page dirty risks a race with the page fault handler that may load outdated data into the enclave when a page is faulted right after it is reclaimed. Consider what happens if the reclaimer writes a page to the backing store and the page is immediately faulted back, before the reclaimer is able to set the dirty bit of the page: sgx_reclaim_pages() { sgx_vma_fault() { ... sgx_encl_get_backing(); ... ... sgx_reclaimer_write() { mutex_lock(&encl->lock); /* Write data to backing store */ mutex_unlock(&encl->lock); } mutex_lock(&encl->lock); __sgx_encl_eldu() { ... /* * Enclave backing store * page not released * nor marked dirty - * contents may not be * up to date. */ sgx_encl_get_backing(); ... /* * Enclave data restored * from backing store * and PCMD pages that * are not up to date. * ENCLS[ELDU] faults * because of MAC or PCMD * checking failure. */ sgx_encl_put_backing(); } ... /* set page dirty */ sgx_encl_put_backing(); ... mutex_unlock(&encl->lock); } } Remove the option to sgx_encl_put_backing() to set the backing pages as dirty and set the needed pages as dirty right after receiving important data while enclave mutex is held. This ensures that the page fault handler can get up to date data from a page and prepares the code for a following change where only one of the backing pages need to be marked as dirty. 
Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Suggested-by: Dave Hansen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lore.kernel.org/linux-sgx/8922e48f-6646-c7cc-6393-7c78dcf23d23@intel.com/ Link: https://lkml.kernel.org/r/fa9f98986923f43e72ef4c6702a50b2a0b3c42e3.1652389823.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 10 ++-------- arch/x86/kernel/cpu/sgx/encl.h | 2 +- arch/x86/kernel/cpu/sgx/main.c | 6 ++++-- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7c63a1911fae..398695a20605 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -94,7 +94,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); - sgx_encl_put_backing(&b, false); + sgx_encl_put_backing(&b); sgx_encl_truncate_backing_page(encl, page_index); @@ -645,15 +645,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, /** * sgx_encl_put_backing() - Unpin the backing storage * @backing: data for accessing backing storage for the page - * @do_write: mark pages dirty */ -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +void sgx_encl_put_backing(struct sgx_backing *backing) { - if (do_write) { - set_page_dirty(backing->pcmd); - set_page_dirty(backing->contents); - } - put_page(backing->pcmd); put_page(backing->contents); } diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index fec43ca65065..d44e7372151f 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -107,7 +107,7 @@ void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8e4bc6453d26..e71df40a4f38 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -191,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); kunmap_atomic((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); @@ -320,7 +322,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; - sgx_encl_put_backing(&secs_backing, true); + sgx_encl_put_backing(&secs_backing); } out: @@ -411,7 +413,7 @@ skip: encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i], true); + sgx_encl_put_backing(&backing[i]); kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; -- cgit From 2154e1c11b7080aa19f47160bd26b6f39bbd7824 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:50:58 -0700 Subject: x86/sgx: Mark PCMD page as dirty when modifying contents Recent commit 08999b2489b4 ("x86/sgx: Free 
backing memory after faulting the enclave page") expanded __sgx_encl_eldu() to clear an enclave page's PCMD (Paging Crypto MetaData) from the PCMD page in the backing store after the enclave page is restored to the enclave. Since the PCMD page in the backing store is modified the page should be marked as dirty to ensure the modified data is retained. Cc: stable@vger.kernel.org Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lkml.kernel.org/r/00cd2ac480db01058d112e347b32599c1a806bc4.1652389823.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 398695a20605..5104a428b72c 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -84,6 +84,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, } memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); /* * The area for the PCMD in the page was zeroed above. Check if the -- cgit From 0e4e729a830c1e7f31d3b3fbf8feb355a402b117 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:50:59 -0700 Subject: x86/sgx: Obtain backing storage page with enclave mutex held Haitao reported encountering a WARN triggered by the ENCLS[ELDU] instruction faulting with a #GP. The WARN is encountered when the reclaimer evicts a range of pages from the enclave when the same pages are faulted back right away. The SGX backing storage is accessed on two paths: when there are insufficient free pages in the EPC the reclaimer works to move enclave pages to the backing storage and as enclaves access pages that have been moved to the backing storage they are retrieved from there as part of page fault handling. An oversubscribed SGX system will often run the reclaimer and page fault handler concurrently and needs to ensure that the backing store is accessed safely between the reclaimer and the page fault handler. This is not the case because the reclaimer accesses the backing store without the enclave mutex while the page fault handler accesses the backing store with the enclave mutex. Consider the scenario where a page is faulted while a page sharing a PCMD page with the faulted page is being reclaimed. The consequence is a race between the reclaimer and page fault handler, the reclaimer attempting to access a PCMD at the same time it is truncated by the page fault handler. This could result in lost PCMD data. Data may still be lost if the reclaimer wins the race, this is addressed in the following patch. The reclaimer accesses pages from the backing storage without holding the enclave mutex and runs the risk of concurrently accessing the backing storage with the page fault handler that does access the backing storage with the enclave mutex held. In the scenario below a PCMD page is truncated from the backing store after all its pages have been loaded in to the enclave at the same time the PCMD page is loaded from the backing store when one of its pages are reclaimed: sgx_reclaim_pages() { sgx_vma_fault() { ... mutex_lock(&encl->lock); ... __sgx_encl_eldu() { ... if (pcmd_page_empty) { /* * EPC page being reclaimed /* * shares a PCMD page with an * PCMD page truncated * enclave page that is being * while requested from * faulted in. * reclaimer. 
*/ */ sgx_encl_get_backing() <----------> sgx_encl_truncate_backing_page() } mutex_unlock(&encl->lock); } } In this scenario there is a race between the reclaimer and the page fault handler when the reclaimer attempts to get access to the same PCMD page that is being truncated. This could result in the reclaimer writing to the PCMD page that is then truncated, causing the PCMD data to be lost, or in a new PCMD page being allocated. The lost PCMD data may still occur after protecting the backing store access with the mutex - this is fixed in the next patch. By ensuring the backing store is accessed with the mutex held the enclave page state can be made accurate with the SGX_ENCL_PAGE_BEING_RECLAIMED flag accurately reflecting that a page is in the process of being reclaimed. Consistently protect the reclaimer's backing store access with the enclave's mutex to ensure that it can safely run concurrently with the page fault handler. Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Haitao Huang Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lkml.kernel.org/r/fa2e04c561a8555bfe1f4e7adc37d60efc77387b.1652389823.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index e71df40a4f38..ab4ec54bbdd9 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -310,6 +310,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--; + sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), @@ -381,11 +382,14 @@ static void sgx_reclaim_pages(void) goto skip; page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + + mutex_lock(&encl_page->encl->lock); ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); - if (ret) + if (ret) { + mutex_unlock(&encl_page->encl->lock); goto skip; + } - mutex_lock(&encl_page->encl->lock); encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl_page->encl->lock); continue; @@ -413,7 +417,6 @@ skip: encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i]); kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; -- cgit From af117837ceb9a78e995804ade4726ad2c2c8981f Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:51:00 -0700 Subject: x86/sgx: Fix race between reclaimer and page fault handler Haitao reported encountering a WARN triggered by the ENCLS[ELDU] instruction faulting with a #GP. The WARN is encountered when the reclaimer evicts a range of pages from the enclave when the same pages are faulted back right away. Consider two enclave pages (ENCLAVE_A and ENCLAVE_B) sharing a PCMD page (PCMD_AB). ENCLAVE_A is in the enclave memory and ENCLAVE_B is in the backing store. PCMD_AB contains just one entry, that of ENCLAVE_B. Scenario proceeds where ENCLAVE_A is being evicted from the enclave while ENCLAVE_B is faulted in. sgx_reclaim_pages() { ... 
/* * Reclaim ENCLAVE_A */ mutex_lock(&encl->lock); /* * Get a reference to ENCLAVE_A's * shmem page where enclave page * encrypted data will be stored * as well as a reference to the * enclave page's PCMD data page, * PCMD_AB. * Release mutex before writing * any data to the shmem pages. */ sgx_encl_get_backing(...); encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl->lock); /* * Fault ENCLAVE_B */ sgx_vma_fault() { mutex_lock(&encl->lock); /* * Get reference to * ENCLAVE_B's shmem page * as well as PCMD_AB. */ sgx_encl_get_backing(...) /* * Load page back into * enclave via ELDU. */ /* * Release reference to * ENCLAVE_B' shmem page and * PCMD_AB. */ sgx_encl_put_backing(...); /* * PCMD_AB is found empty so * it and ENCLAVE_B's shmem page * are truncated. */ /* Truncate ENCLAVE_B backing page */ sgx_encl_truncate_backing_page(); /* Truncate PCMD_AB */ sgx_encl_truncate_backing_page(); mutex_unlock(&encl->lock); ... } mutex_lock(&encl->lock); encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; /* * Write encrypted contents of * ENCLAVE_A to ENCLAVE_A shmem * page and its PCMD data to * PCMD_AB. */ sgx_encl_put_backing(...) /* * Reference to PCMD_AB is * dropped and it is truncated. * ENCLAVE_A's PCMD data is lost. */ mutex_unlock(&encl->lock); } What happens next depends on whether it is ENCLAVE_A being faulted in or ENCLAVE_B being evicted - but both end up with ENCLS[ELDU] faulting with a #GP. If ENCLAVE_A is faulted then at the time sgx_encl_get_backing() is called a new PCMD page is allocated and providing the empty PCMD data for ENCLAVE_A would cause ENCLS[ELDU] to #GP If ENCLAVE_B is evicted first then a new PCMD_AB would be allocated by the reclaimer but later when ENCLAVE_A is faulted the ENCLS[ELDU] instruction would #GP during its checks of the PCMD value and the WARN would be encountered. Noting that the reclaimer sets SGX_ENCL_PAGE_BEING_RECLAIMED at the time it obtains a reference to the backing store pages of an enclave page it is in the process of reclaiming, fix the race by only truncating the PCMD page after ensuring that no page sharing the PCMD page is in the process of being reclaimed. Cc: stable@vger.kernel.org Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") Reported-by: Haitao Huang Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lkml.kernel.org/r/ed20a5db516aa813873268e125680041ae11dfcf.1652389823.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 94 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 5104a428b72c..243f3bd78145 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,92 @@ #include "encls.h" #include "sgx.h" +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. 
The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). (b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. + * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. + * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + /* * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's * follow right after the EPC data in the backing storage. In addition to the @@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; struct sgx_pageinfo pginfo; struct sgx_backing b; bool pcmd_page_empty; @@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); + /* + * Address of enclave page using the first entry within the PCMD page. 
+ */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); ret = sgx_encl_get_backing(encl, page_index, &b); @@ -99,7 +191,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, sgx_encl_truncate_backing_page(encl, page_index); - if (pcmd_page_empty) + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); return ret; -- cgit From e3a3bbe3e99de73043a1d32d36cf4d211dc58c7e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 12 May 2022 14:51:01 -0700 Subject: x86/sgx: Ensure no data in PCMD page after truncate A PCMD (Paging Crypto MetaData) page contains the PCMD structures of enclave pages that have been encrypted and moved to the shmem backing store. When all enclave pages sharing a PCMD page are loaded in the enclave, there is no need for the PCMD page and it can be truncated from the backing store. A few issues appeared around the truncation of PCMD pages. The known issues have been addressed but the PCMD handling code could be made more robust by loudly complaining if any new issue appears in this area. Add a check that will complain with a warning if the PCMD page is not actually empty after it has been truncated. There should never be data in the PCMD page at this point since it is was just checked to be empty and truncated with enclave mutex held and is updated with the enclave mutex held. Suggested-by: Dave Hansen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Haitao Huang Link: https://lkml.kernel.org/r/6495120fed43fafc1496d09dd23df922b9a32709.1652389823.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 243f3bd78145..3c24e6124d95 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -187,12 +187,20 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); + get_page(b.pcmd); sgx_encl_put_backing(&b); sgx_encl_truncate_backing_page(encl, page_index); - if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + pcmd_page = kmap_atomic(b.pcmd); + if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) + pr_warn("PCMD page not empty after truncate.\n"); + kunmap_atomic(pcmd_page); + } + + put_page(b.pcmd); return ret; } -- cgit From a7fed5c0431dbfa707037848830f980e0f93cfb3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 May 2022 13:39:34 +0200 Subject: x86/nmi: Make register_nmi_handler() more robust register_nmi_handler() has no sanity check whether a handler has been registered already. Such an unintended double-add leads to list corruption and hard to diagnose problems during the next NMI handling. Init the list head in the static NMI action struct and check it for being empty in register_nmi_handler(). [ bp: Fixups. 
] Reported-by: Sean Christopherson Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/lkml/20220511234332.3654455-1-seanjc@google.com --- arch/x86/include/asm/nmi.h | 1 + arch/x86/kernel/nmi.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 1cb9c17a4cb4..5c5f1e56c404 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -47,6 +47,7 @@ struct nmiaction { #define register_nmi_handler(t, fn, fg, n, init...) \ ({ \ static struct nmiaction init fn##_na = { \ + .list = LIST_HEAD_INIT(fn##_na.list), \ .handler = (fn), \ .name = (n), \ .flags = (fg), \ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e73f7df362f5..cec0bfa3bc04 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; - if (!action->handler) + if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list))) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); @@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) list_add_rcu(&action->list, &desc->head); else list_add_tail_rcu(&action->list, &desc->head); - + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler); void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); - struct nmiaction *n; + struct nmiaction *n, *found = NULL; unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); @@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name) WARN(in_nmi(), "Trying to free NMI (%s) from NMI context!\n", n->name); list_del_rcu(&n->list); + found = n; break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); - synchronize_rcu(); + if (found) { + synchronize_rcu(); + INIT_LIST_HEAD(&found->list); + } } EXPORT_SYMBOL_GPL(unregister_nmi_handler); -- cgit From 0621210ab7693e6d50585ca689d95d57df617455 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 16 May 2022 19:42:15 +0100 Subject: x86/sev: Remove duplicated assignment to variable info Variable info is being assigned the same value twice, remove the redundant assignment. Also assign variable v in the declaration. Cleans up clang scan warning: warning: Value stored to 'info' during its initialization is never read [deadcode.DeadStores] No code changed: # arch/x86/kernel/sev.o: text data bss dec hex filename 19878 4487 4112 28477 6f3d sev.o.before 19878 4487 4112 28477 6f3d sev.o.after md5: bfbaa515af818615fd01fea91e7eba1b sev.o.before.asm bfbaa515af818615fd01fea91e7eba1b sev.o.after.asm [ bp: Running the before/after check on sev.c because sev-shared.c gets included into it. 
] Fixes: 597cfe48212a ("x86/boot/compressed/64: Setup a GHCB-based VC Exception handler") Signed-off-by: Colin Ian King Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220516184215.51841-1-colin.i.king@gmail.com --- arch/x86/kernel/sev-shared.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 2b4270d5559e..b478edf43bec 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -201,10 +201,7 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt if (ret == 1) { u64 info = ghcb->save.sw_exit_info_2; - unsigned long v; - - info = ghcb->save.sw_exit_info_2; - v = info & SVM_EVTINJ_VEC_MASK; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; /* Check if exception information from hypervisor is sane. */ if ((info & SVM_EVTINJ_VALID) && -- cgit From 3c27b0c6ea48bc61492a138c410e262735d660ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:22:04 +0200 Subject: perf/x86/amd: Fix AMD BRS period adjustment There's two problems with the current amd_brs_adjust_period() code: - it isn't in fact AMD specific and wil always adjust the period; - it adjusts the period, while it should only adjust the event count, resulting in repoting a short period. Fix this by using x86_pmu.limit_period, this makes it specific to the AMD BRS case and ensures only the event count is adjusted while the reported period is unmodified. Fixes: ba2fe7500845 ("perf/x86/amd: Add AMD branch sampling period adjustment") Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/events/amd/core.c | 13 +++++++++++++ arch/x86/events/core.c | 7 ------- arch/x86/events/perf_event.h | 18 ------------------ 3 files changed, 13 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d81eac2284ea..3eee59c64daa 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1255,6 +1255,18 @@ static void amd_pmu_sched_task(struct perf_event_context *ctx, amd_pmu_brs_sched_task(ctx, sched_in); } +static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) +{ + /* + * Decrease period by the depth of the BRS feature to get the last N + * taken branches and approximate the desired period + */ + if (has_branch_stack(event) && left > x86_pmu.lbr_nr) + left -= x86_pmu.lbr_nr; + + return left; +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, @@ -1415,6 +1427,7 @@ static int __init amd_core_pmu_init(void) if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; x86_pmu.sched_task = amd_pmu_sched_task; + x86_pmu.limit_period = amd_pmu_limit_period; /* * put_event_constraints callback same as Fam17h, set above */ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b08052b05db6..30788894124f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1374,13 +1374,6 @@ int x86_perf_event_set_period(struct perf_event *event) x86_pmu.set_topdown_event_period) return x86_pmu.set_topdown_event_period(event); - /* - * decrease period by the depth of the BRS feature to get - * the last N taken branches and approximate the desired period - */ - if (has_branch_stack(event)) - period = amd_brs_adjust_period(period); - /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/perf_event.h 
b/arch/x86/events/perf_event.h index 3b0324584da3..21a5482bcf84 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1254,14 +1254,6 @@ static inline void amd_pmu_brs_del(struct perf_event *event) } void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); - -static inline s64 amd_brs_adjust_period(s64 period) -{ - if (period > x86_pmu.lbr_nr) - return period - x86_pmu.lbr_nr; - - return period; -} #else static inline int amd_brs_init(void) { @@ -1290,11 +1282,6 @@ static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool s { } -static inline s64 amd_brs_adjust_period(s64 period) -{ - return period; -} - static inline void amd_brs_enable_all(void) { } @@ -1324,11 +1311,6 @@ static inline void amd_brs_enable_all(void) static inline void amd_brs_disable_all(void) { } - -static inline s64 amd_brs_adjust_period(s64 period) -{ - return period; -} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) -- cgit From 841b51e4a3590866d17fa2663c64688c25b891b1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 May 2022 17:48:38 +0200 Subject: perf/x86/amd: Run AMD BRS code only on supported hw This fires on a Fam16h machine here: unchecked MSR access error: WRMSR to 0xc000010f (tried to write 0x0000000000000018) \ at rIP: 0xffffffff81007db1 (amd_brs_reset+0x11/0x50) Call Trace: amd_pmu_cpu_starting ? x86_pmu_dead_cpu x86_pmu_starting_cpu cpuhp_invoke_callback ? x86_pmu_starting_cpu ? x86_pmu_dead_cpu cpuhp_issue_call ? x86_pmu_starting_cpu __cpuhp_setup_state_cpuslocked ? x86_pmu_dead_cpu ? x86_pmu_starting_cpu __cpuhp_setup_state ? map_vsyscall init_hw_perf_events ? map_vsyscall do_one_initcall ? _raw_spin_unlock_irqrestore ? try_to_wake_up kernel_init_freeable ? rest_init kernel_init ret_from_fork because that CPU hotplug callback gets executed on any AMD CPU - not only on the BRS-enabled ones. Check the BRS feature bit properly. Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Acked-By: Stephane Eranian Link: https://lkml.kernel.org/r/20220516154838.7044-1-bp@alien8.de --- arch/x86/events/amd/brs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 895c82165d85..bee8765a1e9b 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -57,7 +57,7 @@ static inline u64 get_debug_extn_cfg(void) static bool __init amd_brs_detect(void) { - if (!boot_cpu_has(X86_FEATURE_BRS)) + if (!cpu_feature_enabled(X86_FEATURE_BRS)) return false; switch (boot_cpu_data.x86) { @@ -112,6 +112,9 @@ static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) */ void amd_brs_reset(void) { + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return; + /* * Reset config */ -- cgit From c2df0a6af177b6c06a859806a876f92b072dc624 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 15 May 2022 20:42:04 +0200 Subject: locking/atomic/x86: Introduce arch_try_cmpxchg64 Introduce arch_try_cmpxchg64 for 64-bit and 32-bit targets to improve code using cmpxchg64. 
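The caller-visible win is easiest to see in the usual compare-and-swap retry loop, which no longer has to re-read and re-compare the old value by hand. A minimal sketch, assuming the generic try_cmpxchg64() wrapper that fronts arch_try_cmpxchg64(); the function and variable names below are illustrative and not taken from this patch:

	#include <linux/atomic.h>
	#include <linux/compiler.h>
	#include <linux/types.h>

	/* Open-coded loop around cmpxchg64(): the caller compares the returned value. */
	static void add_delta_cmpxchg(u64 *slot, u64 delta)
	{
		u64 old = READ_ONCE(*slot);

		for (;;) {
			u64 cur = cmpxchg64(slot, old, old + delta);

			if (cur == old)		/* extra compare after the cmpxchg */
				break;
			old = cur;
		}
	}

	/* try_cmpxchg64() returns a bool and refreshes 'old' on failure. */
	static void add_delta_try_cmpxchg(u64 *slot, u64 delta)
	{
		u64 old = READ_ONCE(*slot);

		while (!try_cmpxchg64(slot, &old, old + delta))
			;	/* 'old' now holds the current value, so just retry */
	}

Because the boolean result can come straight from the flags set by the cmpxchg instruction, the compiler drops the separate move and compare, which is exactly the difference visible in the listings that follow.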
On 64-bit targets, the generated assembly improves from: ab: 89 c8 mov %ecx,%eax ad: 48 89 4c 24 60 mov %rcx,0x60(%rsp) b2: 83 e0 fd and $0xfffffffd,%eax b5: 89 54 24 64 mov %edx,0x64(%rsp) b9: 88 44 24 60 mov %al,0x60(%rsp) bd: 48 89 c8 mov %rcx,%rax c0: c6 44 24 62 f2 movb $0xf2,0x62(%rsp) c5: 48 8b 74 24 60 mov 0x60(%rsp),%rsi ca: f0 49 0f b1 34 24 lock cmpxchg %rsi,(%r12) d0: 48 39 c1 cmp %rax,%rcx d3: 75 cf jne a4 to: b3: 89 c2 mov %eax,%edx b5: 48 89 44 24 60 mov %rax,0x60(%rsp) ba: 83 e2 fd and $0xfffffffd,%edx bd: 89 4c 24 64 mov %ecx,0x64(%rsp) c1: 88 54 24 60 mov %dl,0x60(%rsp) c5: c6 44 24 62 f2 movb $0xf2,0x62(%rsp) ca: 48 8b 54 24 60 mov 0x60(%rsp),%rdx cf: f0 48 0f b1 13 lock cmpxchg %rdx,(%rbx) d4: 75 d5 jne ab where a move and a compare after cmpxchg is saved. The improvements for 32-bit targets are even more noticeable, because dual-word compare after cmpxchg8b gets eliminated. Signed-off-by: Uros Bizjak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220515184205.103089-3-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 21 +++++++++++++++++++++ arch/x86/include/asm/cmpxchg_64.h | 6 ++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 0a7fe0321613..215f5a65790f 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) #define arch_cmpxchg64_local(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ (unsigned long long)(n))) +#define arch_try_cmpxchg64(ptr, po, n) \ + __try_cmpxchg64((ptr), (unsigned long long *)(po), \ + (unsigned long long)(n)) #endif static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) @@ -70,6 +73,24 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) return prev; } +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) +{ + bool success; + u64 old = *pold; + asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" + CC_SET(z) + : CC_OUT(z) (success), + [ptr] "+m" (*ptr), + "+A" (old) + : "b" ((u32)new), + "c" ((u32)(new >> 32)) + : "memory"); + + if (unlikely(!success)) + *pold = old; + return success; +} + #ifndef CONFIG_X86_CMPXCHG64 /* * Building a kernel capable running on 80386 and 80486. It may be necessary diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 072e5459fe2f..250187ac8248 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -19,6 +19,12 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) arch_cmpxchg_local((ptr), (o), (n)); \ }) +#define arch_try_cmpxchg64(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg((ptr), (po), (n)); \ +}) + #define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ -- cgit From c42b145181aafd59ed31ccd879493389e3ea5a08 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 16 Mar 2022 12:16:12 +0800 Subject: x86/sev: Annotate stack change in the #VC handler In idtentry_vc(), vc_switch_off_ist() determines a safe stack to switch to, off of the IST stack. Annotate the new stack switch with ENCODE_FRAME_POINTER in case UNWINDER_FRAME_POINTER is used. 
A stack walk before looks like this: CPU: 0 PID: 0 Comm: swapper Not tainted 5.18.0-rc7+ #2 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl dump_stack kernel_exc_vmm_communication asm_exc_vmm_communication ? native_read_msr ? __x2apic_disable.part.0 ? x2apic_setup ? cpu_init ? trap_init ? start_kernel ? x86_64_start_reservations ? x86_64_start_kernel ? secondary_startup_64_no_verify and with the fix, the stack dump is exact: CPU: 0 PID: 0 Comm: swapper Not tainted 5.18.0-rc7+ #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl dump_stack kernel_exc_vmm_communication asm_exc_vmm_communication RIP: 0010:native_read_msr Code: ... < snipped regs > ? __x2apic_disable.part.0 x2apic_setup cpu_init trap_init start_kernel x86_64_start_reservations x86_64_start_kernel secondary_startup_64_no_verify [ bp: Test in a SEV-ES guest and rewrite the commit message to explain what exactly this does. ] Fixes: a13644f3a53d ("x86/entry/64: Add entry code for #VC handler") Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220316041612.71357-1-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4faac48ebec5..f7bd8001bf07 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -505,6 +505,7 @@ SYM_CODE_START(\asmsym) call vc_switch_off_ist movq %rax, %rsp /* Switch to new stack */ + ENCODE_FRAME_POINTER UNWIND_HINT_REGS /* Update pt_regs */ -- cgit From 47f33de4aafb2f5e43d480d590a939d0f1d566a9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 12 Apr 2022 20:49:08 +0800 Subject: x86/sev: Mark the code returning to user space as syscall gap When returning to user space, %rsp is user-controlled value. If it is a SNP-guest and the hypervisor decides to mess with the code-page for this path while a CPU is executing it, a potential #VC could hit in the syscall return path and mislead the #VC handler. So make ip_within_syscall_gap() return true in this case. Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20220412124909.10467-1-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 2 ++ arch/x86/entry/entry_64_compat.S | 2 ++ arch/x86/include/asm/proto.h | 4 ++++ arch/x86/include/asm/ptrace.h | 4 ++++ 4 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7bd8001bf07..2fd8a5cad853 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -215,8 +215,10 @@ syscall_return_via_sysret: popq %rdi popq %rsp +SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) swapgs sysretq +SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) SYM_CODE_END(entry_SYSCALL_64) /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4fdb007cddbd..3c0e14960e2b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -297,6 +297,7 @@ sysret32_from_system_call: * code. We zero R8-R10 to avoid info leaks. 
*/ movq RSP-ORIG_RAX(%rsp), %rsp +SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) /* * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored @@ -314,6 +315,7 @@ sysret32_from_system_call: xorl %r10d, %r10d swapgs sysretl +SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) SYM_CODE_END(entry_SYSCALL_compat) /* diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index feed36d44d04..f042cfc9938f 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -13,6 +13,8 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); void entry_SYSCALL_64_safe_stack(void); +void entry_SYSRETQ_unsafe_stack(void); +void entry_SYSRETQ_end(void); long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif @@ -28,6 +30,8 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); +void entry_SYSRETL_compat_unsafe_stack(void); +void entry_SYSRETL_compat_end(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 4357e0f2cd5f..f4db78b09c8f 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -186,9 +186,13 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack && + regs->ip < (unsigned long)entry_SYSRETQ_end); #ifdef CONFIG_IA32_EMULATION ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack && + regs->ip < (unsigned long)entry_SYSRETL_compat_end); #endif return ret; -- cgit From aeb84412037b89e06f45e382f044da6f200e12f8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 27 Feb 2022 11:59:18 -0800 Subject: x86/boot: Wrap literal addresses in absolute_pointer() GCC 11 (incorrectly[1]) assumes that literal values cast to (void *) should be treated like a NULL pointer with an offset, and raises diagnostics when doing bounds checking under -Warray-bounds. GCC 12 got "smarter" about finding these: In function 'rdfs8', inlined from 'vga_recalc_vertical' at /srv/code/arch/x86/boot/video-mode.c:124:29, inlined from 'set_mode' at /srv/code/arch/x86/boot/video-mode.c:163:3: /srv/code/arch/x86/boot/boot.h:114:9: warning: array subscript 0 is outside array bounds of 'u8[0]' {aka 'unsigned char[]'} [-Warray-bounds] 114 | asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); | ^~~ This has been solved in other places[2] already by using the recently added absolute_pointer() macro. Do the same here. 
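The reason this silences the diagnostic is that absolute_pointer() launders the literal address through RELOC_HIDE(), so the optimizer can no longer treat the fixed address as a null pointer plus an offset into a zero-sized object. As a sketch for context only (not part of this patch), the helper in include/linux/compiler.h boils down to:

	/* include/linux/compiler.h (abridged sketch) */
	#define absolute_pointer(val)	RELOC_HIDE((void *)(val), 0)

RELOC_HIDE() is the long-standing barrier the kernel uses whenever a pointer's provenance has to be hidden from the compiler's value tracking.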
[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578 [2] https://lore.kernel.org/all/20210912160149.2227137-1-linux@roeck-us.net/ Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220227195918.705219-1-keescook@chromium.org --- arch/x86/boot/boot.h | 36 ++++++++++++++++++++++++------------ arch/x86/boot/main.c | 2 +- 2 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 34c9dbb6a47d..686a9d75a0e4 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -110,66 +110,78 @@ typedef unsigned int addr_t; static inline u8 rdfs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdfs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdfs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline void wrfs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrfs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrfs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v)); } static inline u8 rdgs8(addr_t addr) { + u8 *ptr = (u8 *)absolute_pointer(addr); u8 v; - asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr)); return v; } static inline u16 rdgs16(addr_t addr) { + u16 *ptr = (u16 *)absolute_pointer(addr); u16 v; - asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline u32 rdgs32(addr_t addr) { + u32 *ptr = (u32 *)absolute_pointer(addr); u32 v; - asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr)); return v; } static inline void wrgs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); + u8 *ptr = (u8 *)absolute_pointer(addr); + asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v)); } static inline void wrgs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); + u16 *ptr = (u16 *)absolute_pointer(addr); + asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } static inline void wrgs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); + u32 *ptr = (u32 *)absolute_pointer(addr); + asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v)); } /* Note: these only return true/false, not a signed return value! 
*/ diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index e3add857c2c9..c421af5a3cdc 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -33,7 +33,7 @@ static void copy_boot_params(void) u16 cl_offset; }; const struct old_cmdline * const oldcmd = - (const struct old_cmdline *)OLD_CL_ADDRESS; + absolute_pointer(OLD_CL_ADDRESS); BUILD_BUG_ON(sizeof(boot_params) != 4096); memcpy(&boot_params.hdr, &hdr, sizeof(hdr)); -- cgit From 1591a65f55bca5f7e14f9fbca4bd082dc8f4680f Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Tue, 17 May 2022 16:24:25 +0000 Subject: x86: xen: remove STACK_FRAME_NON_STANDARD from xen_cpuid Since commit 4d65adfcd119 ("x86: xen: insn: Decode Xen and KVM emulate-prefix signature"), objtool is able to correctly parse the prefixed instruction in xen_cpuid and emit correct orc unwind information. Hence, marking the function as STACKFRAME_NON_STANDARD is no longer needed. This commit is basically a revert of commit 983bb6d254c7 ("x86/xen: Mark xen_cpuid() stack frame as non-standard"). Signed-off-by: Maximilian Heyne Reviewed-by: Juergen Gross CC: Josh Poimboeuf Link: https://lore.kernel.org/r/20220517162425.100567-1-mheyne@amazon.de Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pv.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5038edb79ad5..ca85d1409917 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -165,7 +164,6 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *bx &= maskebx; } -STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ static bool __init xen_check_mwait(void) { -- cgit From f089ab674cea43561fee8e3de8d15d864e486286 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:22 +0300 Subject: xen/x86: Use do_kernel_power_off() Kernel now supports chained power-off handlers. Use do_kernel_power_off() that invokes chained power-off handlers. It also invokes legacy pm_power_off() for now, which will be removed once all drivers will be converted to the new sys-off API. Acked-by: Juergen Gross Reviewed-by: Michał Mirosław Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- arch/x86/xen/enlighten_pv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5038edb79ad5..af1f6e886225 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -1071,8 +1072,7 @@ static void xen_machine_halt(void) static void xen_machine_power_off(void) { - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); xen_reboot(SHUTDOWN_poweroff); } -- cgit From d35773499329300d837818417f6d1b5a7987317d Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:26 +0300 Subject: x86: Use do_kernel_power_off() Kernel now supports chained power-off handlers. Use do_kernel_power_off() that invokes chained power-off handlers. It also invokes legacy pm_power_off() for now, which will be removed once all drivers will be converted to the new sys-off API.
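To make the "chained power-off handlers" mentioned above concrete: rather than overwriting the single global pm_power_off hook, a driver now registers one handler among many, and do_kernel_power_off() invokes them in priority order. A minimal sketch, assuming the register_sys_off_handler() interface that was added alongside do_kernel_power_off(); the handler name and the choice of default priority are illustrative only:

	#include <linux/err.h>
	#include <linux/init.h>
	#include <linux/notifier.h>
	#include <linux/reboot.h>

	/* Illustrative handler: called from do_kernel_power_off() at shutdown. */
	static int example_power_off(struct sys_off_data *data)
	{
		/* Device-specific "cut the power" action would go here. */
		return NOTIFY_DONE;
	}

	static int __init example_power_off_register(void)
	{
		struct sys_off_handler *h;

		h = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
					     SYS_OFF_PRIO_DEFAULT,
					     example_power_off, NULL);
		return PTR_ERR_OR_ZERO(h);
	}

With handlers chained this way, the Xen and native machine power-off paths touched by these two patches only have to call do_kernel_power_off() and no longer care which driver performs the final power-off.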
Reviewed-by: Michał Mirosław Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/reboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index fa700b46588e..c3636ea4aa71 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -739,10 +739,10 @@ static void native_machine_halt(void) static void native_machine_power_off(void) { - if (pm_power_off) { + if (kernel_can_power_off()) { if (!reboot_force) machine_shutdown(); - pm_power_off(); + do_kernel_power_off(); } /* A fallback in case there is no PM info available */ tboot_shutdown(TB_SHUTDOWN_HALT); -- cgit From fa6dae5d82081e8d9f8e6a2baf7149442a6c1ba5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 19 May 2022 17:21:48 +0200 Subject: x86/PCI: Add kernel cmdline options to use/ignore E820 reserved regions Some firmware supplies PCI host bridge _CRS that includes address space unusable by PCI devices, e.g., space occupied by host bridge registers or used by hidden PCI devices. To avoid this unusable space, Linux currently excludes E820 reserved regions from _CRS windows; see 4dc2287c1805 ("x86: avoid E820 regions when allocating address space"). However, this use of E820 reserved regions to clip things out of _CRS is not supported by ACPI, UEFI, or PCI Firmware specs, and some systems have E820 reserved regions that cover the entire memory window from _CRS. 4dc2287c1805 clips the entire window, leaving no space for hot-added or uninitialized PCI devices. For example, from a Lenovo IdeaPad 3 15IIL 81WE: BIOS-e820: [mem 0x4bc50000-0xcfffffff] reserved pci_bus 0000:00: root bus resource [mem 0x65400000-0xbfffffff window] pci 0000:00:15.0: BAR 0: [mem 0x00000000-0x00000fff 64bit] pci 0000:00:15.0: BAR 0: no space for [mem size 0x00001000 64bit] Future patches will add quirks to enable/disable E820 clipping automatically. Add a "pci=no_e820" kernel command line option to disable clipping with E820 reserved regions. Also add a matching "pci=use_e820" option to enable clipping with E820 reserved regions if that has been disabled by default by further patches in this patch-set. Both options taint the kernel because they are intended for debugging and workaround purposes until a quirk can set them automatically. [bhelgaas: commit log, add printk] Link: https://bugzilla.redhat.com/show_bug.cgi?id=1868899 Lenovo IdeaPad 3 Link: https://lore.kernel.org/r/20220519152150.6135-2-hdegoede@redhat.com Signed-off-by: Hans de Goede Signed-off-by: Bjorn Helgaas Acked-by: Rafael J.
Wysocki Cc: Benoit Grégoire Cc: Hui Wang --- arch/x86/include/asm/pci_x86.h | 2 ++ arch/x86/pci/acpi.c | 18 ++++++++++++++++-- arch/x86/pci/common.c | 8 ++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index a0627dfae541..ce3fd3311772 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -42,6 +42,8 @@ do { \ #define PCI_ROOT_NO_CRS 0x100000 #define PCI_NOASSIGN_BARS 0x200000 #define PCI_BIG_ROOT_WINDOW 0x400000 +#define PCI_USE_E820 0x800000 +#define PCI_NO_E820 0x1000000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 562c81a51ea0..c61c815efedb 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -20,6 +20,7 @@ struct pci_root_info { #endif }; +static bool pci_use_e820 = true; static bool pci_use_crs = true; static bool pci_ignore_seg; @@ -161,6 +162,17 @@ void __init pci_acpi_crs_quirks(void) "if necessary, use \"pci=%s\" and report a bug\n", pci_use_crs ? "Using" : "Ignoring", pci_use_crs ? "nocrs" : "use_crs"); + + /* "pci=use_e820"/"pci=no_e820" on the kernel cmdline takes precedence */ + if (pci_probe & PCI_NO_E820) + pci_use_e820 = false; + else if (pci_probe & PCI_USE_E820) + pci_use_e820 = true; + + printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n", + pci_use_e820 ? "Using" : "Ignoring"); + if (pci_probe & (PCI_NO_E820 | PCI_USE_E820)) + printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can this automatically\n"); } #ifdef CONFIG_PCI_MMCONFIG @@ -301,8 +313,10 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) status = acpi_pci_probe_root_resources(ci); - resource_list_for_each_entry(entry, &ci->resources) - remove_e820_regions(&device->dev, entry->res); + if (pci_use_e820) { + resource_list_for_each_entry(entry, &ci->resources) + remove_e820_regions(&device->dev, entry->res); + } if (pci_use_crs) { resource_list_for_each_entry_safe(entry, tmp, &ci->resources) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 9e1e6b8d8876..ddb798603201 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -595,6 +595,14 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "nocrs")) { pci_probe |= PCI_ROOT_NO_CRS; return NULL; + } else if (!strcmp(str, "use_e820")) { + pci_probe |= PCI_USE_E820; + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return NULL; + } else if (!strcmp(str, "no_e820")) { + pci_probe |= PCI_NO_E820; + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return NULL; #ifdef CONFIG_PHYS_ADDR_T_64BIT } else if (!strcmp(str, "big_root_window")) { pci_probe |= PCI_BIG_ROOT_WINDOW; -- cgit From 036c07c0c3b8a57d5c96e1f2aab62da0056f8f21 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 19 May 2022 11:46:58 -0700 Subject: x86/entry: Fix register corruption in compat syscall A panic was reported in the init process on AMD: Run /sbin/init as init process init[1]: segfault at f7fd5ca0 ip 00000000f7f5bbc7 sp 00000000ffa06aa0 error 7 in libc.so[f7f51000+4e000] Code: 8a 44 24 10 88 41 ff 8b 44 24 10 83 c4 2c 5b 5e 5f 5d c3 53 83 ec 08 8b 5c 24 10 81 fb 00 f0 ff ff 76 0c e8 ba dc ff ff f7 db <89> 18 83 cb ff 83 c4 08 89 d8 5b c3 e8 81 60 ff ff 05 28 84 07 00 Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0000000b CPU: 1 PID: 1 Comm: init Tainted: G W 5.18.0-rc7-next-20220519 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x57/0x7d panic+0x10f/0x28d do_exit.cold+0x18/0x48 do_group_exit+0x2e/0xb0 get_signal+0xb6d/0xb80 arch_do_signal_or_restart+0x31/0x760 ? show_opcodes.cold+0x1c/0x21 ? force_sig_fault+0x49/0x70 exit_to_user_mode_prepare+0x131/0x1a0 irqentry_exit_to_user_mode+0x5/0x30 asm_exc_page_fault+0x27/0x30 RIP: 0023:0xf7f5bbc7 Code: 8a 44 24 10 88 41 ff 8b 44 24 10 83 c4 2c 5b 5e 5f 5d c3 53 83 ec 08 8b 5c 24 10 81 fb 00 f0 ff ff 76 0c e8 ba dc ff ff f7 db <89> 18 83 cb ff 83 c4 08 89 d8 5b c3 e8 81 60 ff ff 05 28 84 07 00 RSP: 002b:00000000ffa06aa0 EFLAGS: 00000217 RAX: 00000000f7fd5ca0 RBX: 000000000000000c RCX: 0000000000001000 RDX: 0000000000000001 RSI: 00000000f7fd5b60 RDI: 00000000f7fd5b60 RBP: 00000000f7fd1c1c R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 The task's CX register got corrupted by commit 8c42819b61b8 ("x86/entry: Use PUSH_AND_CLEAR_REGS for compat"), which overlooked the fact that compat SYSCALL apparently stores the user's CX value in BP. Before that commit, CX was saved from its stashed value in BP: pushq %rbp /* pt_regs->cx (stashed in bp) */ But then it got changed to: pushq %rcx /* pt_regs->cx */ So the wrong value got saved and later restored back to the user. Fix it by pushing the correct value again (BP) for regs->cx. Fixes: 8c42819b61b8 ("x86/entry: Use PUSH_AND_CLEAR_REGS for compat") Reported-by: Guenter Roeck Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Tested-by: Guenter Roeck Link: https://lkml.kernel.org/r/b5a26592c9dd60bbacdf97974a7433fd802a5593.1652985970.git.jpoimboe@kernel.org --- arch/x86/entry/calling.h | 8 ++++---- arch/x86/entry/entry_64_compat.S | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index a97cc78ecb92..29b36e9e4e74 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -63,7 +63,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -73,7 +73,7 @@ For 32-bit we have the following conventions - kernel is built with pushq %rsi /* pt_regs->si */ .endif pushq \rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ + pushq \rcx /* pt_regs->cx */ pushq \rax /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 */ pushq %r9 /* pt_regs->r9 */ @@ -115,8 +115,8 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rax=\rax, save_ret=\save_ret +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret CLEAR_REGS .endm diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index ed2be3615b50..f76e674d22c4 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -200,7 +200,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movl 
%eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ - PUSH_AND_CLEAR_REGS rax=$-ENOSYS + PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS UNWIND_HINT_REGS movq %rsp, %rdi -- cgit From 69505e3d9a39a988aaed9b58aa6b3482238f6516 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 12 May 2022 06:56:23 -0700 Subject: bug: Use normal relative pointers in 'struct bug_entry' With CONFIG_GENERIC_BUG_RELATIVE_POINTERS, the addr/file relative pointers are calculated weirdly: based on the beginning of the bug_entry struct address, rather than their respective pointer addresses. Make the relative pointers less surprising to both humans and tools by calculating them the normal way. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Acked-by: Peter Zijlstra (Intel) Acked-by: Sven Schnelle # s390 Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Tested-by: Mark Rutland [arm64] Link: https://lkml.kernel.org/r/f0e05be797a16f4fc2401eeb88c8450dcbe61df6.1652362951.git.jpoimboe@kernel.org --- arch/x86/include/asm/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 4d20a293c6fd..76fbe24cccf9 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -18,7 +18,7 @@ #ifdef CONFIG_X86_32 # define __BUG_REL(val) ".long " __stringify(val) #else -# define __BUG_REL(val) ".long " __stringify(val) " - 2b" +# define __BUG_REL(val) ".long " __stringify(val) " - ." #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -- cgit From bae19fdd7e9e759580ac4693d2df3bc23ab415d7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 18 May 2022 14:13:27 +0530 Subject: perf/x86/amd/core: Fix reloading events for SVM Commit 1018faa6cf23 ("perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled") addresses an issue in which the Host-Only bit in the counter control registers needs to be masked off when SVM is not enabled. The events need to be reloaded whenever SVM is enabled or disabled for a CPU and this requires the PERF_CTL registers to be reprogrammed using {enable,disable}_all(). However, PerfMonV2 variants of these functions do not reprogram the PERF_CTL registers. Hence, the legacy enable_all() function should also be called. 
Fixes: 9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control") Reported-by: Like Xu Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220518084327.464005-1-sandipan.das@amd.com --- arch/x86/events/amd/core.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 3eee59c64daa..9ac3718410ce 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1472,6 +1472,24 @@ __init int amd_pmu_init(void) return 0; } +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + void amd_pmu_enable_virt(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1479,8 +1497,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -1497,7 +1514,6 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); -- cgit From d341838d776abadb3ac48abdd2f1f40df5a4fc10 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 19 May 2022 17:21:49 +0200 Subject: x86/PCI: Disable E820 reserved region clipping via quirks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid unusable space that some firmware includes in PCI host bridge _CRS, Linux currently excludes E820 reserved regions from _CRS windows; see 4dc2287c1805 ("x86: avoid E820 regions when allocating address space"). However, some systems supply E820 reserved regions that cover the entire memory window from _CRS, so clipping them out leaves no space for hot-added or uninitialized PCI devices. For example, from a Lenovo IdeaPad 3 15IIL 81WE: BIOS-e820: [mem 0x4bc50000-0xcfffffff] reserved pci_bus 0000:00: root bus resource [mem 0x65400000-0xbfffffff window] pci 0000:00:15.0: BAR 0: [mem 0x00000000-0x00000fff 64bit] pci 0000:00:15.0: BAR 0: no space for [mem size 0x00001000 64bit] Add quirks to disable the E820 clipping for machines known to do this. 
A single DMI_PRODUCT_VERSION "IIL" quirk matches all the below: Lenovo IdeaPad 3 14IIL05 Lenovo IdeaPad 3 15IIL05 Lenovo IdeaPad 3 17IIL05 Lenovo IdeaPad 5 14IIL05 Lenovo IdeaPad 5 15IIL05 Lenovo IdeaPad Slim 7 14IIL05 Lenovo IdeaPad Slim 7 15IIL05 Lenovo IdeaPad S145-15IIL Lenovo IdeaPad S340-14IIL Lenovo IdeaPad S340-15IIL Lenovo IdeaPad C340-15IIL Lenovo BS145-15IIL Lenovo V14-IIL Lenovo V15-IIL Lenovo V17-IIL Lenovo Yoga C940-14IIL Lenovo Yoga S740-14IIL Lenovo Yoga Slim 7 14IIL05 Lenovo Yoga Slim 7 15IIL05 in addition to the following that don't actually need it because they have no E820 reserved regions that overlap _CRS windows: Lenovo IdeaPad Flex 5 14IIL05 Lenovo IdeaPad Flex 5 15IIL05 Lenovo ThinkBook 14-IIL Lenovo ThinkBook 15-IIL Lenovo Yoga S940-14IIL Other quirks match these: Acer Spin 5 (SP513-54N) Clevo X170KM-G Barebone Link: https://bugzilla.kernel.org/show_bug.cgi?id=206459 Lenovo Yoga C940-14IIL Link: https://bugzilla.kernel.org/show_bug.cgi?id=214259 Clevo X170KM Barebone Link: https://bugzilla.redhat.com/show_bug.cgi?id=1868899 Lenovo IdeaPad 3 15IIL05 Link: https://bugzilla.redhat.com/show_bug.cgi?id=1871793 Lenovo IdeaPad 5 14IIL05 Link: https://bugs.launchpad.net/bugs/1878279 Lenovo IdeaPad 5 14IIL05 Link: https://bugs.launchpad.net/bugs/1880172 Lenovo IdeaPad 3 14IIL05 Link: https://bugs.launchpad.net/bugs/1884232 Acer Spin SP513-54N Link: https://bugs.launchpad.net/bugs/1921649 Lenovo IdeaPad S145 Link: https://bugs.launchpad.net/bugs/1931715 Lenovo IdeaPad S145 Link: https://bugs.launchpad.net/bugs/1932069 Lenovo BS145-15IIL Link: https://lore.kernel.org/r/20220519152150.6135-3-hdegoede@redhat.com Signed-off-by: Hans de Goede Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki Cc: Benoit Grégoire Cc: Hui Wang --- arch/x86/pci/acpi.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c61c815efedb..cd8f47c0d97e 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -43,6 +43,14 @@ static int __init set_ignore_seg(const struct dmi_system_id *id) return 0; } +static int __init set_no_e820(const struct dmi_system_id *id) +{ + printk(KERN_INFO "PCI: %s detected: not clipping E820 regions from _CRS\n", + id->ident); + pci_use_e820 = false; + return 0; +} + static const struct dmi_system_id pci_crs_quirks[] __initconst = { /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ { @@ -137,6 +145,51 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "HP xw9300 Workstation"), }, }, + + /* + * Many Lenovo models with "IIL" in their DMI_PRODUCT_VERSION have + * an E820 reserved region that covers the entire 32-bit host + * bridge memory window from _CRS. Using the E820 region to clip + * _CRS means no space is available for hot-added or uninitialized + * PCI devices. This typically breaks I2C controllers for touchpads + * and hot-added Thunderbolt devices. See the commit log for + * models known to require this quirk and related bug reports. + */ + { + .callback = set_no_e820, + .ident = "Lenovo *IIL* product version", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "IIL"), + }, + }, + + /* + * The Acer Spin 5 (SP513-54N) has the same E820 reservation covering + * the entire _CRS 32-bit window issue as the Lenovo *IIL* models. 
+ * See https://bugs.launchpad.net/bugs/1884232 + */ + { + .callback = set_no_e820, + .ident = "Acer Spin 5 (SP513-54N)", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Spin SP513-54N"), + }, + }, + + /* + * Clevo X170KM-G barebones have the same E820 reservation covering + * the entire _CRS 32-bit window issue as the Lenovo *IIL* models. + * See https://bugzilla.kernel.org/show_bug.cgi?id=214259 + */ + { + .callback = set_no_e820, + .ident = "Clevo X170KM-G Barebone", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "X170KM-G"), + }, + }, {} }; -- cgit From 0ae084d5a6744b1318407d8e20fb88ac0fd85d47 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 19 May 2022 17:21:50 +0200 Subject: x86/PCI: Disable E820 reserved region clipping starting in 2023 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some firmware includes unusable space (host bridge registers, hidden PCI device BARs, etc) in PCI host bridge _CRS. As far as we know, there's nothing in the ACPI, UEFI, or PCI Firmware spec that requires the OS to remove E820 reserved regions from _CRS, so this seems like a firmware defect. As a workaround, 4dc2287c1805 ("x86: avoid E820 regions when allocating address space") has clipped out the unusable space in the past. This is required for machines like the following: - Dell Precision T3500 (the original motivator for 4dc2287c1805); see https://bugzilla.kernel.org/show_bug.cgi?id=16228 - Asus C523NA (Coral) Chromebook; see https://lore.kernel.org/all/4e9fca2f-0af1-3684-6c97-4c35befd5019@redhat.com/ - Lenovo ThinkPad X1 Gen 2; see: https://bugzilla.redhat.com/show_bug.cgi?id=2029207 But other firmware supplies E820 reserved regions that cover entire _CRS windows, and clipping throws away the entire window, leaving none for hot-added or uninitialized devices. This clipping breaks a whole range of Lenovo IdeaPads, Yogas, Yoga Slims, and notebooks, as well as Acer Spin 5 and Clevo X170KM-G Barebone machines. E820 reserved entries that cover a memory-mapped PCI host bridge, including its registers and memory/IO windows, are probably *not* a firmware defect. Per ACPI v5.4, sec 15.2, the E820 memory map may include: Address ranges defined for baseboard memory-mapped I/O devices, such as APICs, are returned as reserved. Disable the E820 clipping by default for all post-2022 machines. We already have quirks to disable clipping for pre-2023 machines, and we'll likely need quirks to *enable* clipping for post-2022 machines that incorrectly include unusable space in _CRS, including Chromebooks and Lenovo ThinkPads. Here's the rationale for doing this. If we do nothing, and continue clipping by default: - Future systems like the Lenovo IdeaPads, Yogas, etc, Acer Spin, and Clevo Barebones will require new quirks to disable clipping. - The problem here is E820 entries that cover entire _CRS windows that should not be clipped out. - I think these E820 entries are legal per spec, and it would be hard to get BIOS vendors to change them. - We will discover new systems that need clipping disabled piecemeal as they are released. - Future systems like Lenovo X1 Carbon and the Chromebooks (probably anything using coreboot) will just work, even though their _CRS is incorrect, so we will not notice new ones that rely on the clipping. - BIOS updates will not require new quirks unless they change the DMI model string. 
If we add the date check in this commit that disables clipping, e.g., "no clipping when date >= 2023": - Future systems like Lenovo *IIL*, Acer Spin, and Clevo Barebones will just work without new quirks. - Future systems like Lenovo X1 Carbon and the Chromebooks will require new quirks to *enable* clipping. - The problem here is that _CRS contains regions that are not usable by PCI devices, and we rely on the E820 kludge to clip them out. - I think this use of E820 is clearly a firmware bug, so we have a fighting chance of getting it changed eventually. - BIOS updates after the cutoff date *will* require quirks, but only for systems like Lenovo X1 Carbon and Chromebooks that we already think have broken firmware. It seems to me like it's better to add quirks for firmware that we think is broken than for firmware that seems unusual but correct. [bhelgaas: comment and commit log] Link: https://lore.kernel.org/linux-pci/20220518220754.GA7911@bhelgaas/ Link: https://lore.kernel.org/r/20220519152150.6135-4-hdegoede@redhat.com Signed-off-by: Hans de Goede Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki Cc: Benoit Grégoire Cc: Hui Wang --- arch/x86/pci/acpi.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index cd8f47c0d97e..a4f43054bc79 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -200,6 +200,27 @@ void __init pci_acpi_crs_quirks(void) if (year >= 0 && year < 2008 && iomem_resource.end <= 0xffffffff) pci_use_crs = false; + /* + * Some firmware includes unusable space (host bridge registers, + * hidden PCI device BARs, etc) in PCI host bridge _CRS. This is a + * firmware defect, and 4dc2287c1805 ("x86: avoid E820 regions when + * allocating address space") has clipped out the unusable space in + * the past. + * + * But other firmware supplies E820 reserved regions that cover + * entire _CRS windows, so clipping throws away the entire window, + * leaving none for hot-added or uninitialized devices. These E820 + * entries are probably *not* a firmware defect, so disable the + * clipping by default for post-2022 machines. + * + * We already have quirks to disable clipping for pre-2023 + * machines, and we'll likely need quirks to *enable* clipping for + * post-2022 machines that incorrectly include unusable space in + * _CRS. + */ + if (year >= 2023) + pci_use_e820 = false; + dmi_check_system(pci_crs_quirks); /* -- cgit From 9c55d99e099bd7aa6b91fce8718505c35d5dfc65 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 May 2022 16:59:13 +0200 Subject: x86/microcode: Add explicit CPU vendor dependency Add an explicit dependency to the respective CPU vendor so that the respective microcode support for it gets built only when that support is enabled. 
Reported-by: Randy Dunlap Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/8ead0da9-9545-b10d-e3db-7df1a1f219e4@infradead.org --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..7ba627c60504 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1313,7 +1313,7 @@ config MICROCODE config MICROCODE_INTEL bool "Intel microcode loading support" - depends on MICROCODE + depends on CPU_SUP_INTEL && MICROCODE default MICROCODE help This options enables microcode patch loading support for Intel @@ -1325,7 +1325,7 @@ config MICROCODE_INTEL config MICROCODE_AMD bool "AMD microcode loading support" - depends on MICROCODE + depends on CPU_SUP_AMD && MICROCODE help If you select this option, microcode patch loading support for AMD processors will be enabled. -- cgit From ce6565282b3b16fd850c6a676f78c6bc76d0c235 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 May 2022 10:26:04 +0200 Subject: x86/entry: Fixup objtool/ibt validation Commit 47f33de4aafb ("x86/sev: Mark the code returning to user space as syscall gap") added a bunch of text references without annotating them, resulting in a spree of objtool complaints: vmlinux.o: warning: objtool: vc_switch_off_ist+0x77: relocation to !ENDBR: entry_SYSCALL_64+0x15c vmlinux.o: warning: objtool: vc_switch_off_ist+0x8f: relocation to !ENDBR: entry_SYSCALL_compat+0xa5 vmlinux.o: warning: objtool: vc_switch_off_ist+0x97: relocation to !ENDBR: .entry.text+0x21ea vmlinux.o: warning: objtool: vc_switch_off_ist+0xef: relocation to !ENDBR: .entry.text+0x162 vmlinux.o: warning: objtool: __sev_es_ist_enter+0x60: relocation to !ENDBR: entry_SYSCALL_64+0x15c vmlinux.o: warning: objtool: __sev_es_ist_enter+0x6c: relocation to !ENDBR: .entry.text+0x162 vmlinux.o: warning: objtool: __sev_es_ist_enter+0x8a: relocation to !ENDBR: entry_SYSCALL_compat+0xa5 vmlinux.o: warning: objtool: __sev_es_ist_enter+0xc1: relocation to !ENDBR: .entry.text+0x21ea Since these text references are used to compare against IP, and are not an indirect call target, they don't need ENDBR so annotate them away. 
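A hedged C-level sketch of the pattern behind these warnings (not the actual SEV helper; the entry_SYSRETQ_* labels match the ones annotated in the patch below): an address that is only compared against a saved instruction pointer is never an indirect-branch target, so it gets ANNOTATE_NOENDBR instead of an ENDBR landing pad.

	#include <linux/ptrace.h>
	#include <linux/types.h>

	extern unsigned char entry_SYSRETQ_unsafe_stack[], entry_SYSRETQ_end[];

	/* Illustrative only: is the trapped IP inside the SYSRET "gap"? */
	static bool ip_in_sysretq_gap(struct pt_regs *regs)
	{
		return regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
		       regs->ip <  (unsigned long)entry_SYSRETQ_end;
	}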
Fixes: 47f33de4aafb ("x86/sev: Mark the code returning to user space as syscall gap") Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220520082604.GQ2578@worktop.programming.kicks-ass.net --- arch/x86/entry/entry_64.S | 3 +++ arch/x86/entry/entry_64_compat.S | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2fd8a5cad853..58a2d764fa39 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -216,9 +216,12 @@ syscall_return_via_sysret: popq %rdi popq %rsp SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) + ANNOTATE_NOENDBR swapgs sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + int3 SYM_CODE_END(entry_SYSCALL_64) /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 3c0e14960e2b..8011021d7ce8 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -298,6 +298,7 @@ sysret32_from_system_call: */ movq RSP-ORIG_RAX(%rsp), %rsp SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) + ANNOTATE_NOENDBR /* * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored @@ -316,6 +317,8 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) swapgs sysretl SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + int3 SYM_CODE_END(entry_SYSCALL_compat) /* -- cgit From d936411dc9caeb3edb992e39c33d4d1d81ca8c08 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 May 2022 12:12:30 +0200 Subject: x86: Remove empty files Remove empty files which were supposed to get removed with the respective commits removing the functionality in them: $ find arch/x86/ -empty arch/x86/lib/mmx_32.c arch/x86/include/asm/fpu/internal.h arch/x86/include/asm/mmx.h Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220520101723.12006-1-bp@alien8.de --- arch/x86/include/asm/fpu/internal.h | 0 arch/x86/include/asm/mmx.h | 0 arch/x86/lib/mmx_32.c | 0 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 arch/x86/include/asm/fpu/internal.h delete mode 100644 arch/x86/include/asm/mmx.h delete mode 100644 arch/x86/lib/mmx_32.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c deleted file mode 100644 index e69de29bb2d1..000000000000 -- cgit From c796f02162e428b595ff70196dca161ee46b163b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 May 2022 10:38:39 +0200 Subject: x86/tdx: Fix RETs in TDX asm Because build-testing is over-rated, fix a few trivial objtool complaints: vmlinux.o: warning: objtool: __tdx_module_call+0x3e: missing int3 after ret vmlinux.o: warning: objtool: __tdx_hypercall+0x6e: missing int3 after ret Fixes: eb94f1b6a70a ("x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220520083839.GR2578@worktop.programming.kicks-ass.net --- arch/x86/coco/tdx/tdcall.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index 
eeb4511dc414..f9eb1134f22d 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -73,7 +73,7 @@ SYM_FUNC_START(__tdx_module_call) FRAME_BEGIN TDX_MODULE_CALL host=0 FRAME_END - ret + RET SYM_FUNC_END(__tdx_module_call) /* @@ -196,7 +196,7 @@ SYM_FUNC_START(__tdx_hypercall) FRAME_END - retq + RET .Lpanic: call __tdx_hypercall_failed /* __tdx_hypercall_failed never returns */ -- cgit From 4ac19ead0dfbabd8e0bfc731f507cfb0b95d6c99 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Tue, 17 May 2022 05:12:36 +0000 Subject: kvm: x86/pmu: Fix the compare function used by the pmu event filter When returning from the compare function the u64 is truncated to an int. This results in a loss of the high nybble[1] in the event select and its sign if that nybble is in use. Switch from using a result that can end up being truncated to a result that can only be: 1, 0, -1. [1] bits 35:32 in the event select register and bits 11:8 in the event select. Fixes: 7ff775aca48ad ("KVM: x86/pmu: Use binary search to check filtered events") Signed-off-by: Aaron Lewis Reviewed-by: Sean Christopherson Message-Id: <20220517051238.2566934-1-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eca39f56c231..0604bc29f0b8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -171,9 +171,12 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *a, const void *b) +static int cmp_u64(const void *pa, const void *pb) { - return *(__u64 *)a - *(__u64 *)b; + u64 a = *(u64 *)pa; + u64 b = *(u64 *)pb; + + return (a > b) - (a < b); } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) -- cgit From ea8c66fe8d8f4f93df941e52120a3512d7bf5128 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 19 May 2022 10:15:04 -0700 Subject: KVM: x86: hyper-v: fix type of valid_bank_mask In kvm_hv_flush_tlb(), valid_bank_mask is declared as unsigned long, but is used as u64, which is wrong for i386, and has been spotted by LKP after applying "KVM: x86: hyper-v: replace bitmap_weight() with hweight64()" https://lore.kernel.org/lkml/20220510154750.212913-12-yury.norov@gmail.com/ But it's wrong even without that patch because now bitmap_weight() dereferences a word after valid_bank_mask on i386. >> include/asm-generic/bitops/const_hweight.h:21:76: warning: right shift count >= width of type +[-Wshift-count-overflow] 21 | #define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32)) | ^~ include/asm-generic/bitops/const_hweight.h:10:16: note: in definition of macro '__const_hweight8' 10 | ((!!((w) & (1ULL << 0))) + \ | ^ include/asm-generic/bitops/const_hweight.h:20:31: note: in expansion of macro '__const_hweight16' 20 | #define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16)) | ^~~~~~~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:21:54: note: in expansion of macro '__const_hweight32' 21 | #define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32)) | ^~~~~~~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:29:49: note: in expansion of macro '__const_hweight64' 29 | #define hweight64(w) (__builtin_constant_p(w) ? 
__const_hweight64(w) : __arch_hweight64(w)) | ^~~~~~~~~~~~~~~~~ arch/x86/kvm/hyperv.c:1983:36: note: in expansion of macro 'hweight64' 1983 | if (hc->var_cnt != hweight64(valid_bank_mask)) | ^~~~~~~~~ CC: Borislav Petkov CC: Dave Hansen CC: H. Peter Anvin CC: Ingo Molnar CC: Jim Mattson CC: Joerg Roedel CC: Paolo Bonzini CC: Sean Christopherson CC: Thomas Gleixner CC: Vitaly Kuznetsov CC: Wanpeng Li CC: kvm@vger.kernel.org CC: linux-kernel@vger.kernel.org CC: x86@kernel.org Reported-by: kernel test robot Signed-off-by: Yury Norov Message-Id: <20220519171504.1238724-1-yury.norov@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 46f9dfb60469..a0702b6be3e8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1914,7 +1914,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); - unsigned long valid_bank_mask; + u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; @@ -1956,7 +1956,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; - if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64)) + if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) -- cgit From 9f46c187e2e680ecd9de7983e4d081c3391acc76 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 May 2022 13:48:11 -0400 Subject: KVM: x86/mmu: fix NULL pointer dereference on guest INVPCID With shadow paging enabled, the INVPCID instruction results in a call to kvm_mmu_invpcid_gva. If INVPCID is executed with CR0.PG=0, the invlpg callback is not set and the result is a NULL pointer dereference. Fix it trivially by checking for mmu->invlpg before every call. There are other possibilities: - check for CR0.PG, because KVM (like all Intel processors after P5) flushes guest TLB on CR0.PG changes so that INVPCID/INVLPG are a nop with paging disabled - check for EFER.LMA, because KVM syncs and flushes when switching MMU contexts outside of 64-bit mode All of these are tricky, go for the simple solution. This is CVE-2022-1789. 
Reported-by: Yongkang Jia Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 56ebc4fb7f91..45e1573f8f1d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5470,14 +5470,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) uint i; if (pcid == kvm_get_active_pcid(vcpu)) { - mmu->invlpg(vcpu, gva, mmu->root.hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->root.hpa); tlb_flush = true; } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); tlb_flush = true; } } -- cgit From aadd1b678ebef81cc3ed23663253ae3749cdc673 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 20 May 2022 16:57:52 -0700 Subject: x86/alternative: Introduce text_poke_set Introduce a memset like API for text_poke. This will be used to fill the unused RX memory with illegal instructions. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20220520235758.1858153-3-song@kernel.org --- arch/x86/include/asm/text-patching.h | 1 + arch/x86/kernel/alternative.c | 67 ++++++++++++++++++++++++++++++------ 2 files changed, 58 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index d20ab0921480..1cc15528ce29 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d374cb3cf024..7563b5bc8328 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -994,7 +994,21 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; -static void *__text_poke(void *addr, const void *opcode, size_t len) +static void text_poke_memcpy(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void text_poke_memset(void *dst, const void *src, size_t len) +{ + int c = *(const int *)src; + + memset(dst, c, len); +} + +typedef void text_poke_f(void *dst, const void *src, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; @@ -1059,7 +1073,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) prev = use_temporary_mm(poking_mm); kasan_disable_current(); - memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -1087,11 +1101,13 @@ static void 
*__text_poke(void *addr, const void *opcode, size_t len) (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); - /* - * If the text does not match what we just wrote then something is - * fundamentally screwy; there's nothing we can really do about that. - */ - BUG_ON(memcmp(addr, opcode, len)); + if (func == text_poke_memcpy) { + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, src, len)); + } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); @@ -1118,7 +1134,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -1137,7 +1153,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -1167,7 +1183,38 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); - __text_poke((void *)ptr, opcode + patched, s); + __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + +/** + * text_poke_set - memset into (an unused part of) RX memory + * @addr: address to modify + * @c: the byte to fill the area with + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * This is useful to overwrite unused regions of RX memory with illegal + * instructions. + */ +void *text_poke_set(void *addr, int c, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); patched += s; } mutex_unlock(&text_mutex); -- cgit From fe736565efb775620dbcf3c459c1cd80d3e868da Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 20 May 2022 16:57:53 -0700 Subject: bpf: Introduce bpf_arch_text_invalidate for bpf_prog_pack Introduce bpf_arch_text_invalidate and use it to fill unused part of the bpf_prog_pack with illegal instructions when a BPF program is freed. 
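For context, a hedged sketch of the generic weak fallback that the x86 override below shadows; architectures without a text-poke facility presumably just report that invalidation is unsupported:

	#include <linux/bpf.h>
	#include <linux/errno.h>

	/* Sketch only: the real default lives in core BPF code, outside arch/x86. */
	int __weak bpf_arch_text_invalidate(void *dst, size_t len)
	{
		return -ENOTSUPP;
	}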
Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator") Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]") Reported-by: Linus Torvalds Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220520235758.1858153-4-song@kernel.org --- arch/x86/net/bpf_jit_comp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a2b6d197c226..f298b18a9a3d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -228,6 +228,11 @@ static void jit_fill_hole(void *area, unsigned int size) memset(area, 0xcc, size); } +int bpf_arch_text_invalidate(void *dst, size_t len) +{ + return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len)); +} + struct jit_context { int cleanup_addr; /* Epilogue code offset */ -- cgit From 5d7c854593a460706dacf8e1b16c9bdcb1c2d7bb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 28 Mar 2022 08:26:48 +0200 Subject: livepatch: Remove klp_arch_set_pc() and asm/livepatch.h All three versions of klp_arch_set_pc() do exactly the same: they call ftrace_instruction_pointer_set(). Call ftrace_instruction_pointer_set() directly and remove klp_arch_set_pc(). As klp_arch_set_pc() was the only thing remaining in asm/livepatch.h on x86 and s390, remove asm/livepatch.h livepatch.h remains on powerpc but its content is exclusively used by powerpc specific code. Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Petr Mladek --- arch/x86/include/asm/livepatch.h | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 arch/x86/include/asm/livepatch.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h deleted file mode 100644 index 7c5cc6660e4b..000000000000 --- a/arch/x86/include/asm/livepatch.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * livepatch.h - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings - * Copyright (C) 2014 SUSE - */ - -#ifndef _ASM_X86_LIVEPATCH_H -#define _ASM_X86_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif /* _ASM_X86_LIVEPATCH_H */ -- cgit From 8a33d96bd178d5f49cc5c1898e4cda08e221d2db Mon Sep 17 00:00:00 2001 From: XueBing Chen Date: Wed, 25 May 2022 16:51:01 +0800 Subject: x86/setup: Use strscpy() to replace deprecated strlcpy() strlcpy() is marked deprecated and should not be used, because it doesn't limit the source length. The preferred interface for when strlcpy()'s return value is not checked (truncation) is strscpy(). 
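A minimal sketch of the difference (illustrative only, not from the patch): strlcpy() always walks the whole source to compute its return value even when it copies less, while strscpy() stops at the destination size and reports truncation directly.

	#include <linux/string.h>	/* strscpy() */
	#include <linux/printk.h>
	#include <asm/setup.h>		/* COMMAND_LINE_SIZE */

	static void copy_cmdline(const char *src)
	{
		char buf[COMMAND_LINE_SIZE];
		ssize_t ret;

		ret = strscpy(buf, src, sizeof(buf));
		if (ret == -E2BIG)	/* truncated: buf still holds a NUL-terminated prefix */
			pr_warn("command line truncated\n");
		/*
		 * strlcpy() would instead return strlen(src) -- always walking
		 * the whole source -- and leave the truncation check to the caller.
		 */
	}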
[ mingo: Tweaked the changelog ] Signed-off-by: XueBing Chen Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/730f0fef.a33.180fa69880f.Coremail.chenxuebing@jari.cn --- arch/x86/kernel/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 249981bf3d8a..3ebb85327edb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -903,18 +903,18 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); #else if (builtin_cmdline[0]) { /* append boot loader cmdline to builtin */ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); } #endif #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; /* -- cgit From 758cd94a0e16ddbc5607428f28170897f67c2964 Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Wed, 25 May 2022 15:32:02 +0200 Subject: x86/Kconfig: Fix indentation and add endif comments to arch/x86/Kconfig The convention for indentation seems to be a single tab. Help text is further indented by an additional two whitespaces. Fix the lines that violate these rules. While add it, add missing trailing endif comments and squeeze multiple empty lines. Signed-off-by: Juerg Haefliger Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220525133203.52463-2-juerg.haefliger@canonical.com --- arch/x86/Kconfig | 101 +++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 762a0b6ab8b6..4d610ae216ce 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,11 +41,11 @@ config FORCE_DYNAMIC_FTRACE depends on FUNCTION_TRACER select DYNAMIC_FTRACE help - We keep the static function tracing (!DYNAMIC_FTRACE) around - in order to test the non static function tracing in the - generic code, as other architectures still use it. But we - only need to keep it around for x86_64. No need to keep it - for x86_32. For x86_32, force DYNAMIC_FTRACE. + We keep the static function tracing (!DYNAMIC_FTRACE) around + in order to test the non static function tracing in the + generic code, as other architectures still use it. But we + only need to keep it around for x86_64. No need to keep it + for x86_32. For x86_32, force DYNAMIC_FTRACE. # # Arch settings # @@ -392,9 +392,9 @@ config CC_HAS_SANE_STACKPROTECTOR default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC)) if 64BIT default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC)) help - We have to make sure stack protector is unconditionally disabled if - the compiler produces broken code or if it does not let us control - the segment on 32-bit kernels. + We have to make sure stack protector is unconditionally disabled if + the compiler produces broken code or if it does not let us control + the segment on 32-bit kernels. 
menu "Processor type and features" @@ -530,7 +530,7 @@ config X86_EXTENDED_PLATFORM If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. -endif +endif # X86_32 if X86_64 config X86_EXTENDED_PLATFORM @@ -549,7 +549,7 @@ config X86_EXTENDED_PLATFORM If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. -endif +endif # X86_64 # This is an alphabetically sorted list of 64 bit extended platforms # Please maintain the alphabetic order if and when there are additions config X86_NUMACHIP @@ -597,9 +597,9 @@ config X86_GOLDFISH bool "Goldfish (Virtual Platform)" depends on X86_EXTENDED_PLATFORM help - Enable support for the Goldfish virtual platform used primarily - for Android development. Unless you are building for the Android - Goldfish emulator say N here. + Enable support for the Goldfish virtual platform used primarily + for Android development. Unless you are building for the Android + Goldfish emulator say N here. config X86_INTEL_CE bool "CE4100 TV platform" @@ -898,7 +898,7 @@ config INTEL_TDX_GUEST memory contents and CPU state. TDX guests are protected from some attacks from the VMM. -endif #HYPERVISOR_GUEST +endif # HYPERVISOR_GUEST source "arch/x86/Kconfig.cpu" @@ -1159,16 +1159,16 @@ config X86_MCE_INTEL prompt "Intel MCE features" depends on X86_MCE && X86_LOCAL_APIC help - Additional support for intel specific MCE features such as - the thermal monitor. + Additional support for intel specific MCE features such as + the thermal monitor. config X86_MCE_AMD def_bool y prompt "AMD MCE features" depends on X86_MCE && X86_LOCAL_APIC && AMD_NB help - Additional support for AMD specific MCE features such as - the DRAM Error Threshold. + Additional support for AMD specific MCE features such as + the DRAM Error Threshold. config X86_ANCIENT_MCE bool "Support for old Pentium 5 / WinChip machine checks" @@ -1246,18 +1246,18 @@ config X86_VSYSCALL_EMULATION default y depends on X86_64 help - This enables emulation of the legacy vsyscall page. Disabling - it is roughly equivalent to booting with vsyscall=none, except - that it will also disable the helpful warning if a program - tries to use a vsyscall. With this option set to N, offending - programs will just segfault, citing addresses of the form - 0xffffffffff600?00. + This enables emulation of the legacy vsyscall page. Disabling + it is roughly equivalent to booting with vsyscall=none, except + that it will also disable the helpful warning if a program + tries to use a vsyscall. With this option set to N, offending + programs will just segfault, citing addresses of the form + 0xffffffffff600?00. - This option is required by many programs built before 2013, and - care should be used even with newer programs if set to N. + This option is required by many programs built before 2013, and + care should be used even with newer programs if set to N. - Disabling this option saves about 7K of kernel size and - possibly 4K of additional runtime pagetable memory. + Disabling this option saves about 7K of kernel size and + possibly 4K of additional runtime pagetable memory. config X86_IOPL_IOPERM bool "IOPERM and IOPL Emulation" @@ -1994,15 +1994,15 @@ config EFI_MIXED bool "EFI mixed-mode support" depends on EFI_STUB && X86_64 help - Enabling this feature allows a 64-bit kernel to be booted - on a 32-bit firmware, provided that your CPU supports 64-bit - mode. 
+ Enabling this feature allows a 64-bit kernel to be booted + on a 32-bit firmware, provided that your CPU supports 64-bit + mode. - Note that it is not possible to boot a mixed-mode enabled - kernel via the EFI boot stub - a bootloader that supports - the EFI handover protocol must be used. + Note that it is not possible to boot a mixed-mode enabled + kernel via the EFI boot stub - a bootloader that supports + the EFI handover protocol must be used. - If unsure, say N. + If unsure, say N. source "kernel/Kconfig.hz" @@ -2227,16 +2227,16 @@ config RANDOMIZE_MEMORY select DYNAMIC_MEMORY_LAYOUT default RANDOMIZE_BASE help - Randomizes the base virtual address of kernel memory sections - (physical memory mapping, vmalloc & vmemmap). This security feature - makes exploits relying on predictable memory locations less reliable. + Randomizes the base virtual address of kernel memory sections + (physical memory mapping, vmalloc & vmemmap). This security feature + makes exploits relying on predictable memory locations less reliable. - The order of allocations remains unchanged. Entropy is generated in - the same way as RANDOMIZE_BASE. Current implementation in the optimal - configuration have in average 30,000 different possible virtual - addresses for each memory section. + The order of allocations remains unchanged. Entropy is generated in + the same way as RANDOMIZE_BASE. Current implementation in the optimal + configuration have in average 30,000 different possible virtual + addresses for each memory section. - If unsure, say Y. + If unsure, say Y. config RANDOMIZE_MEMORY_PHYSICAL_PADDING hex "Physical memory mapping padding" if EXPERT @@ -2246,12 +2246,12 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING range 0x1 0x40 if MEMORY_HOTPLUG range 0x0 0x40 help - Define the padding in terabytes added to the existing physical - memory size during kernel memory randomization. It is useful - for memory hotplug support but reduces the entropy available for - address randomization. + Define the padding in terabytes added to the existing physical + memory size during kernel memory randomization. It is useful + for memory hotplug support but reduces the entropy available for + address randomization. - If unsure, leave at the default value. + If unsure, leave at the default value. config HOTPLUG_CPU def_bool y @@ -2598,7 +2598,6 @@ source "drivers/idle/Kconfig" endmenu - menu "Bus options (PCI etc.)" choice @@ -2822,7 +2821,6 @@ config AMD_NB endmenu - menu "Binary Emulations" config IA32_EMULATION @@ -2867,11 +2865,10 @@ config COMPAT_FOR_U64_ALIGNMENT config SYSVIPC_COMPAT def_bool y depends on SYSVIPC -endif +endif # COMPAT endmenu - config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -- cgit From 0ecfacb4c50ae2fd32ee1898562fe55f045b56d2 Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Wed, 25 May 2022 15:32:03 +0200 Subject: x86/Kconfig: Fix indentation of arch/x86/Kconfig.debug The convention for indentation seems to be a single tab. Help text is further indented by an additional two whitespaces. Fix the lines that violate these rules. 
Signed-off-by: Juerg Haefliger Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220525133203.52463-3-juerg.haefliger@canonical.com --- arch/x86/Kconfig.debug | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d872a7522e55..340399f69954 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -73,20 +73,19 @@ config DEBUG_TLBFLUSH bool "Set upper limit of TLB entries to flush one-by-one" depends on DEBUG_KERNEL help + X86-only for now. - X86-only for now. + This option allows the user to tune the amount of TLB entries the + kernel flushes one-by-one instead of doing a full TLB flush. In + certain situations, the former is cheaper. This is controlled by the + tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it + to -1, the code flushes the whole TLB unconditionally. Otherwise, + for positive values of it, the kernel will use single TLB entry + invalidating instructions according to the following formula: - This option allows the user to tune the amount of TLB entries the - kernel flushes one-by-one instead of doing a full TLB flush. In - certain situations, the former is cheaper. This is controlled by the - tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it - to -1, the code flushes the whole TLB unconditionally. Otherwise, - for positive values of it, the kernel will use single TLB entry - invalidating instructions according to the following formula: + flush_entries <= active_tlb_entries / 2^tlb_flushall_shift - flush_entries <= active_tlb_entries / 2^tlb_flushall_shift - - If in doubt, say "N". + If in doubt, say "N". config IOMMU_DEBUG bool "Enable IOMMU debugging" @@ -119,10 +118,10 @@ config X86_DECODER_SELFTEST depends on DEBUG_KERNEL && INSTRUCTION_DECODER depends on !COMPILE_TEST help - Perform x86 instruction decoder selftests at build time. - This option is useful for checking the sanity of x86 instruction - decoder code. - If unsure, say "N". + Perform x86 instruction decoder selftests at build time. + This option is useful for checking the sanity of x86 instruction + decoder code. + If unsure, say "N". choice prompt "IO delay type" -- cgit From 20eb48885b62d5de4ab6be7e08f9f55aa33333fd Mon Sep 17 00:00:00 2001 From: sunliming Date: Wed, 25 May 2022 09:28:27 +0800 Subject: x86/idt: Remove unused headers Commit: 4b9a8dca0e58 ("x86/idt: Remove the tracing IDT completely") removed the 'tracing IDT' from arch/x86/kernel/tracepoint.c, but left related headers included - remove them. [ mingo: Tweak changelog. ] Signed-off-by: sunliming Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220525012827.93464-1-sunliming@kylinos.cn --- arch/x86/kernel/tracepoint.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index fcfc077afe2d..f39aad69fb64 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -8,10 +8,7 @@ #include #include -#include -#include #include -#include DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); -- cgit From 108ea7eb3e754be735a1c33d2713a19527fb2065 Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Wed, 25 May 2022 15:39:49 +0200 Subject: perf/x86/Kconfig: Fix indentation in the Kconfig file The convention for indentation seems to be a single tab. Help text is further indented by an additional two whitespaces. Fix the lines that violate these rules. 
Signed-off-by: Juerg Haefliger Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220525133949.53730-1-juerg.haefliger@canonical.com --- arch/x86/events/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 09c56965750a..dabdf3d7bf84 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -6,24 +6,24 @@ config PERF_EVENTS_INTEL_UNCORE depends on PERF_EVENTS && CPU_SUP_INTEL && PCI default y help - Include support for Intel uncore performance events. These are - available on NehalemEX and more modern processors. + Include support for Intel uncore performance events. These are + available on NehalemEX and more modern processors. config PERF_EVENTS_INTEL_RAPL tristate "Intel/AMD rapl performance events" depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI default y help - Include support for Intel and AMD rapl performance events for power - monitoring on modern processors. + Include support for Intel and AMD rapl performance events for power + monitoring on modern processors. config PERF_EVENTS_INTEL_CSTATE tristate "Intel cstate performance events" depends on PERF_EVENTS && CPU_SUP_INTEL && PCI default y help - Include support for Intel cstate performance events for power - monitoring on modern processors. + Include support for Intel cstate performance events for power + monitoring on modern processors. config PERF_EVENTS_AMD_POWER depends on PERF_EVENTS && CPU_SUP_AMD -- cgit From 86dca369075b3e310c3c0adb0f81e513c562b5e4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 May 2022 06:39:52 -0700 Subject: perf/x86/intel: Fix event constraints for ICL According to the latest event list, the event encoding 0x55 INST_DECODED.DECODERS and 0x56 UOPS_DECODED.DEC0 are only available on the first 4 counters. Add them into the event constraints table. Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support") Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220525133952.1660658-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 955ae91c56dc..45024abd929f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -276,7 +276,7 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ - INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x56, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */ -- cgit From 3a2bfec0b02f2226ff3376a5d2ff604d799bd7ea Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Wed, 18 May 2022 10:36:40 +0800 Subject: ftrace: Remove return value of ftrace_arch_modify_*() All instances of the function ftrace_arch_modify_prepare() and ftrace_arch_modify_post_process() return zero. There's no point in checking their return value. Just have them be void functions. 
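For reference, a hedged sketch of what the matching generic fallbacks presumably become with this change (the weak defaults live in the core ftrace code, not under arch/x86):

	#include <linux/ftrace.h>

	void __weak ftrace_arch_code_modify_prepare(void)
	{
	}

	void __weak ftrace_arch_code_modify_post_process(void)
	{
	}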
Link: https://lkml.kernel.org/r/20220518023639.4065-1-kunyu@nfschina.com Signed-off-by: Li kunyu Signed-off-by: Steven Rostedt (Google) --- arch/x86/kernel/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1e31c7d21597..73d2719ed12c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -37,7 +37,7 @@ static int ftrace_poke_late = 0; -int ftrace_arch_code_modify_prepare(void) +void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { /* @@ -47,10 +47,9 @@ int ftrace_arch_code_modify_prepare(void) */ mutex_lock(&text_mutex); ftrace_poke_late = 1; - return 0; } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { /* @@ -61,7 +60,6 @@ int ftrace_arch_code_modify_post_process(void) text_poke_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); - return 0; } static const char *ftrace_nop_replace(void) -- cgit From feccde2a49ff88e3e197adf1f16ea852dd05d059 Mon Sep 17 00:00:00 2001 From: sunliming Date: Wed, 25 May 2022 09:28:27 +0800 Subject: x86,tracing: Remove unused headers Commit 4b9a8dca0e58 ("x86/idt: Remove the tracing IDT completely") removed the tracing IDT from the file arch/x86/kernel/tracepoint.c, but left the related headers unused, remove it. Link: https://lkml.kernel.org/r/20220525012827.93464-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- arch/x86/kernel/tracepoint.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index fcfc077afe2d..f39aad69fb64 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -8,10 +8,7 @@ #include #include -#include -#include #include -#include DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); -- cgit From aef54851bf1d3e31f9c15e8134e859c2f343ceab Mon Sep 17 00:00:00 2001 From: sunliming Date: Thu, 26 May 2022 19:08:31 +0800 Subject: x86/traceponit: Fix comment about irq vector tracepoints Commit: 4b9a8dca0e58 ("x86/idt: Remove the tracing IDT completely") removed the 'tracing IDT' from arch/x86/kernel/tracepoint.c, but left related comment. So that the comment become anachronistic. Just remove the comment. Link: https://lkml.kernel.org/r/20220526110831.175743-1-sunliming@kylinos.cn Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- arch/x86/kernel/tracepoint.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index f39aad69fb64..03ae1caaa878 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Code for supporting irq vector tracepoints. - * * Copyright (C) 2013 Seiji Aguchi - * */ #include #include -- cgit From 2a4a62a14be1947fa945c5c11ebf67326381a568 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 23 May 2022 16:04:03 +0200 Subject: um: Fix out-of-bounds read in LDT setup syscall_stub_data() expects the data_count parameter to be the number of longs, not bytes. 
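Rough arithmetic behind the report below (hedged; it assumes a 64-bit host with sizeof(long) == 8 and the 16-byte 'desc' visible in the KASAN stack frame):

	/* old count:  (sizeof(*desc) + sizeof(long) - 1) & ~(sizeof(long) - 1)
	 *             = (16 + 7) & ~7 = 16, but taken as a count of longs
	 * copied:     16 * sizeof(long) = 128 bytes -> the 128-byte OOB read
	 * new count:  sizeof(*desc) / sizeof(long) = 2 longs = 16 bytes,
	 *             with BUILD_BUG_ON() guarding the divisibility assumption */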
================================================================== BUG: KASAN: stack-out-of-bounds in syscall_stub_data+0x70/0xe0 Read of size 128 at addr 000000006411f6f0 by task swapper/1 CPU: 0 PID: 1 Comm: swapper Not tainted 5.18.0+ #18 Call Trace: show_stack.cold+0x166/0x2a7 __dump_stack+0x3a/0x43 dump_stack_lvl+0x1f/0x27 print_report.cold+0xdb/0xf81 kasan_report+0x119/0x1f0 kasan_check_range+0x3a3/0x440 memcpy+0x52/0x140 syscall_stub_data+0x70/0xe0 write_ldt_entry+0xac/0x190 init_new_ldt+0x515/0x960 init_new_context+0x2c4/0x4d0 mm_init.constprop.0+0x5ed/0x760 mm_alloc+0x118/0x170 0x60033f48 do_one_initcall+0x1d7/0x860 0x60003e7b kernel_init+0x6e/0x3d4 new_thread_handler+0x1e7/0x2c0 The buggy address belongs to stack of task swapper/1 and is located at offset 64 in frame: init_new_ldt+0x0/0x960 This frame has 2 objects: [32, 40) 'addr' [64, 80) 'desc' ================================================================== Fixes: 858259cf7d1c443c83 ("uml: maintain own LDT entries") Signed-off-by: Vincent Whitchurch Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger --- arch/x86/um/ldt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 3ee234b6234d..255a44dd415a 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -23,9 +23,11 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func, { long res; void *stub_addr; + + BUILD_BUG_ON(sizeof(*desc) % sizeof(long)); + res = syscall_stub_data(mm_idp, (unsigned long *)desc, - (sizeof(*desc) + sizeof(long) - 1) & - ~(sizeof(long) - 1), + sizeof(*desc) / sizeof(long), addr, &stub_addr); if (!res) { unsigned long args[] = { func, -- cgit From f7081834b2d5bbc77d67073d8ab490bfeaf3c13b Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 27 May 2022 02:14:00 -0400 Subject: x86: Fix all occurences of the "the the" typo Rather than waiting for the bots to fix these one-by-one, fix all occurences of "the the" throughout arch/x86. Signed-off-by: Bo Liu Signed-off-by: Ingo Molnar Cc: Paolo Bonzini Link: https://lore.kernel.org/r/20220527061400.5694-1-liubo03@inspur.com --- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/platform/efi/efi_thunk_64.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 610355b9ccce..2a67350d93d2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6221,7 +6221,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) int size = PAGE_SIZE << L1D_CACHE_ORDER; /* - * This code is only executed when the the flush mode is 'cond' or + * This code is only executed when the flush mode is 'cond' or * 'always' */ if (static_branch_likely(&vmx_l1d_flush_cond)) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4790f0d7d40b..5f11d7bc6ef4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11855,7 +11855,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, - * unless the the memory map has changed due to process exit + * unless the memory map has changed due to process exit * or fd copying. 
*/ mutex_lock(&kvm->slots_lock); diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 854dd81804b7..9ffe2bad27d5 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -8,7 +8,7 @@ * The below thunking functions are only used after ExitBootServices() * has been called. This simplifies things considerably as compared with * the early EFI thunking because we can leave all the kernel state - * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime + * intact (GDT, IDT, etc) and simply invoke the 32-bit EFI runtime * services from __KERNEL32_CS. This means we can continue to service * interrupts across an EFI mixed mode call. * -- cgit From e19d11267f0e6c8aff2d15d2dfed12365b4c9184 Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Thu, 26 May 2022 22:20:39 +0800 Subject: x86/mm: Use PAGE_ALIGNED(x) instead of IS_ALIGNED(x, PAGE_SIZE) The already provides the PAGE_ALIGNED() macro. Let's use this macro instead of IS_ALIGNED() and passing PAGE_SIZE directly. No change in functionality. [ mingo: Tweak changelog. ] Signed-off-by: Fanjun Kong Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220526142038.1582839-1-bh1scw@gmail.com --- arch/x86/mm/init_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 61d0ab154f96..8779d6be6a49 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1240,8 +1240,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct, void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { - VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); - VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); remove_pagetable(start, end, false, altmap); } @@ -1605,8 +1605,8 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, { int err; - VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); - VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); if (end - start < PAGES_PER_SECTION * sizeof(struct page)) err = vmemmap_populate_basepages(start, end, node, NULL); -- cgit From 5f3da8c08508df82823566c32f753071c8ad36af Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 19 Apr 2022 09:05:09 -0700 Subject: objtool: Add CONFIG_HAVE_UACCESS_VALIDATION Allow an arch specify that it has objtool uaccess validation with CONFIG_HAVE_UACCESS_VALIDATION. For now, doing so unconditionally selects CONFIG_OBJTOOL. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/d393d5e2fe73aec6e8e41d5c24f4b6fe8583f2d8.1650384225.git.jpoimboe@redhat.com --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cf531fbcd229..5f41f3c3df9a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -258,6 +258,7 @@ config X86 select HAVE_PREEMPT_DYNAMIC_CALL select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO -- cgit From a6a5eb269f6f3a2fe392f725a8d9052190c731e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 May 2022 12:15:23 +0200 Subject: x86/cpu: Elide KCSAN for cpu_has() and friends As x86 uses the headers, the regular forms of all bitops are instrumented with explicit calls to KASAN and KCSAN checks. As these are explicit calls, these are not suppressed by the noinstr function attribute. This can result in calls to those check functions in noinstr code, which objtool warns about: vmlinux.o: warning: objtool: enter_from_user_mode+0x24: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode+0x28: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode_prepare+0x24: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_enter_from_user_mode+0x24: call to __kcsan_check_access() leaves .noinstr.text section Prevent this by using the arch_*() bitops, which are the underlying bitops without explciit instrumentation. [null: Changelog] Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220502111216.290518605@infradead.org --- arch/x86/include/asm/cpufeature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 66d3e3b1d24d..ea34cc31b047 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -54,7 +54,7 @@ extern const char * const x86_power_flags[32]; extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ - test_bit(bit, (unsigned long *)((c)->x86_capability)) + arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) /* * There are 32 bits/features in each mask word. The high bits -- cgit From 1894a4030582472336c2873cb07911ce67e0d14e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 7 May 2022 13:37:45 +0200 Subject: x86: Always inline on_thread_stack() and current_top_of_stack() Becaues GCC clearly lost it's marbles again... 
vmlinux.o: warning: objtool: enter_from_user_mode+0x4e: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode+0x53: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode_prepare+0x4e: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_enter_from_user_mode+0x4e: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: enter_from_user_mode+0x4e: call to current_top_of_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode+0x53: call to current_top_of_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode_prepare+0x4e: call to current_top_of_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_enter_from_user_mode+0x4e: call to current_top_of_stack() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220526105958.071435483@infradead.org --- arch/x86/include/asm/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 91d0f93a00c7..356308c73951 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -559,7 +559,7 @@ static __always_inline void native_swapgs(void) #endif } -static inline unsigned long current_top_of_stack(void) +static __always_inline unsigned long current_top_of_stack(void) { /* * We can't read directly from tss.sp0: sp0 on x86_32 is special in @@ -569,7 +569,7 @@ static inline unsigned long current_top_of_stack(void) return this_cpu_read_stable(cpu_current_top_of_stack); } -static inline bool on_thread_stack(void) +static __always_inline bool on_thread_stack(void) { return (unsigned long)(current_top_of_stack() - current_stack_pointer) < THREAD_SIZE; -- cgit From 2028a255f4df3af9e759f01f958d3237f825f256 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 May 2022 21:27:29 +0200 Subject: x86/extable: Annotate ex_handler_msr_mce() as a dead end Fix vmlinux.o: warning: objtool: fixup_exception+0x2d6: unreachable instruction Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220520192729.23969-1-bp@alien8.de --- arch/x86/include/asm/extable.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index 155c991ba95e..eeed395c3177 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -42,9 +42,13 @@ extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); #ifdef CONFIG_X86_MCE -extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); +extern void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); #else -static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { } +static inline void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) +{ + for (;;) + cpu_relax(); +} #endif #if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64) -- cgit From 3e35142ef99fe6b4fe5d834ad43ee13cca10a2dc Mon Sep 17 00:00:00 2001 From: "Naveen N. 
Rao" Date: Thu, 19 May 2022 14:42:37 +0530 Subject: kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add] Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols") [1], binutils (v2.36+) started dropping section symbols that it thought were unused. This isn't an issue in general, but with kexec_file.c, gcc is placing kexec_arch_apply_relocations[_add] into a separate .text.unlikely section and the section symbol ".text.unlikely" is being dropped. Due to this, recordmcount is unable to find a non-weak symbol in .text.unlikely to generate a relocation record against. Address this by dropping the weak attribute from these functions. Instead, follow the existing pattern of having architectures #define the name of the function they want to override in their headers. [1] https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d1bcae833b32f1 [akpm@linux-foundation.org: arch/s390/include/asm/kexec.h needs linux/module.h] Link: https://lkml.kernel.org/r/20220519091237.676736-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Michael Ellerman Signed-off-by: Naveen N. Rao Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton --- arch/x86/include/asm/kexec.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 11b7c06e2828..6ad8d946cd3e 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -186,6 +186,14 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add +#endif #endif typedef void crash_vmclear_fn(void); -- cgit From b39181f7c6907dc66ff937b74758671fa6ba430c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 26 May 2022 14:19:12 -0400 Subject: ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function If an unused weak function was traced, it's call to fentry will still exist, which gets added into the __mcount_loc table. Ftrace will use kallsyms to retrieve the name for each location in __mcount_loc to display it in the available_filter_functions and used to enable functions via the name matching in set_ftrace_filter/notrace. Enabling these functions do nothing but enable an unused call to ftrace_caller. If a traced weak function is overridden, the symbol of the function would be used for it, which will either created duplicate names, or if the previous function was not traced, it would be incorrectly be listed in available_filter_functions as a function that can be traced. This became an issue with BPF[1] as there are tooling that enables the direct callers via ftrace but then checks to see if the functions were actually enabled. The case of one function that was marked notrace, but was followed by an unused weak function that was traced. The unused function's call to fentry was added to the __mcount_loc section, and kallsyms retrieved the untraced function's symbol as the weak function was overridden. Since the untraced function would not get traced, the BPF check would detect this and fail. The real fix would be to fix kallsyms to not show addresses of weak functions as the function before it. 
But that would require adding code in the build to add function size to kallsyms so that it can know when the function ends instead of just using the start of the next known symbol. In the meantime, this is a workaround. Add a FTRACE_MCOUNT_MAX_OFFSET macro: if it is defined, ftrace will ignore any function whose call to fentry/mcount is at an offset from the symbol greater than FTRACE_MCOUNT_MAX_OFFSET. If CONFIG_HAVE_FENTRY is defined for x86, define FTRACE_MCOUNT_MAX_OFFSET to zero (unless IBT is enabled), which will have ftrace ignore all locations that are not at the start of the function (or one after the ENDBR instruction). A worker thread is added at boot up to scan all the ftrace record entries, and will mark any that fail the FTRACE_MCOUNT_MAX_OFFSET test as disabled. They will still appear in the available_filter_functions file as: __ftrace_invalid_address___ (showing the offset that caused it to be invalid). This is required for tools that use libtracefs (like trace-cmd does) that scan the available_filter_functions and enable set_ftrace_filter and set_ftrace_notrace using indexes of the function listed in the file (this is a speedup, as enabling thousands of functions via names is an O(n^2) operation and can take minutes to complete, where the indexing takes less than a second). The invalid functions cannot be removed from available_filter_functions as the names there correspond to the ftrace records in the array that manages them (and the indexing depends on this). [1] https://lore.kernel.org/all/20220412094923.0abe90955e5db486b7bca279@kernel.org/ Link: https://lkml.kernel.org/r/20220526141912.794c2786@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- arch/x86/include/asm/ftrace.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 024d9797646e..b5ef474be858 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -9,6 +9,13 @@ # define MCOUNT_ADDR ((unsigned long)(__fentry__)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ +/* Ignore unused weak functions which will have non zero offsets */ +#ifdef CONFIG_HAVE_FENTRY +# include +/* Add offset for endbr64 if IBT enabled */ +# define FTRACE_MCOUNT_MAX_OFFSET ENDBR_INSN_SIZE +#endif + #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif -- cgit From 181b6f40e9ea80c76756d4d0cdeed396016c487e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 May 2022 18:12:29 +0200 Subject: x86/microcode: Rip out the OLD_INTERFACE Everything should be using the early initrd loading by now. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220525161232.14924-2-bp@alien8.de --- arch/x86/Kconfig | 12 ----- arch/x86/kernel/cpu/microcode/core.c | 100 ----------------------------------- 2 files changed, 112 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 762a0b6ab8b6..f423a2d87b15 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1350,18 +1350,6 @@ config MICROCODE_AMD If you select this option, microcode patch loading support for AMD processors will be enabled. -config MICROCODE_OLD_INTERFACE - bool "Ancient loading interface (DEPRECATED)" - default n - depends on MICROCODE - help - DO NOT USE THIS! This is the ancient /dev/cpu/microcode interface - which was used by userspace tools like iucode_tool and microcode.ctl.
- It is inadequate because it runs too late to be able to properly - load microcode on a machine and it needs special tools. Instead, you - should've switched to the early loading method with the initrd or - builtin microcode by now: Documentation/x86/microcode.rst - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" help diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 239ff5fcec6a..b72c4134f289 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -373,98 +373,6 @@ static int apply_microcode_on_target(int cpu) return ret; } -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static int do_microcode_update(const void __user *buf, size_t size) -{ - int error = 0; - int cpu; - - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; - - if (!uci->valid) - continue; - - ustate = microcode_ops->request_microcode_user(cpu, buf, size); - if (ustate == UCODE_ERROR) { - error = -1; - break; - } else if (ustate == UCODE_NEW) { - apply_microcode_on_target(cpu); - } - } - - return error; -} - -static int microcode_open(struct inode *inode, struct file *file) -{ - return capable(CAP_SYS_RAWIO) ? stream_open(inode, file) : -EPERM; -} - -static ssize_t microcode_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - ssize_t ret = -EINVAL; - unsigned long nr_pages = totalram_pages(); - - if ((len >> PAGE_SHIFT) > nr_pages) { - pr_err("too much data (max %ld pages)\n", nr_pages); - return ret; - } - - cpus_read_lock(); - mutex_lock(&microcode_mutex); - - if (do_microcode_update(buf, len) == 0) - ret = (ssize_t)len; - - if (ret > 0) - perf_check_microcode(); - - mutex_unlock(&microcode_mutex); - cpus_read_unlock(); - - return ret; - } - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, - .llseek = no_llseek, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .nodename = "cpu/microcode", - .fops = &microcode_fops, -}; - -static int __init microcode_dev_init(void) -{ - int error; - - error = misc_register(&microcode_dev); - if (error) { - pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void __exit microcode_dev_exit(void) -{ - misc_deregister(&microcode_dev); -} -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while (0) -#endif - /* fake device for request_firmware */ static struct platform_device *microcode_pdev; @@ -856,10 +764,6 @@ static int __init microcode_init(void) goto out_driver; } - error = microcode_dev_init(); - if (error) - goto out_ucode_group; - register_syscore_ops(&mc_syscore_ops); cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", mc_cpu_starting, NULL); @@ -870,10 +774,6 @@ static int __init microcode_init(void) return 0; - out_ucode_group: - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - out_driver: cpus_read_lock(); mutex_lock(&microcode_mutex); -- cgit From a77a94f86273ce42a39cb479217dd8d68acfe0ff Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 May 2022 18:12:30 +0200 Subject: x86/microcode: Default-disable late loading It is dangerous and it should not be used anyway - there's a nice early loading already.
Requested-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220525161232.14924-3-bp@alien8.de --- arch/x86/Kconfig | 11 +++++++++++ arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/cpu/microcode/core.c | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f423a2d87b15..976309d57a58 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1350,6 +1350,17 @@ config MICROCODE_AMD If you select this option, microcode patch loading support for AMD processors will be enabled. +config MICROCODE_LATE_LOADING + bool "Late microcode loading (DANGEROUS)" + default n + depends on MICROCODE + help + Loading microcode late, when the system is up and executing instructions + is a tricky business and should be avoided if possible. Just the sequence + of synchronizing all cores and SMT threads is one fragile dance which does + not guarantee that cores might not softlock after the loading. Therefore, + use this at your own risk. Late loading taints the kernel too. + config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" help diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2e9142797c99..c296cb1c0113 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2222,6 +2222,7 @@ void cpu_init_secondary(void) } #endif +#ifdef CONFIG_MICROCODE_LATE_LOADING /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU @@ -2251,6 +2252,7 @@ void microcode_check(void) pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); } +#endif /* * Invoked from core CPU hotplug code after hotplug operations diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b72c4134f289..c717db6b6856 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -376,6 +376,7 @@ static int apply_microcode_on_target(int cpu) /* fake device for request_firmware */ static struct platform_device *microcode_pdev; +#ifdef CONFIG_MICROCODE_LATE_LOADING /* * Late loading dance. Why the heavy-handed stomp_machine effort? * @@ -543,6 +544,9 @@ put: return ret; } +static DEVICE_ATTR_WO(reload); +#endif + static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -559,7 +563,6 @@ static ssize_t pf_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static DEVICE_ATTR_WO(reload); static DEVICE_ATTR(version, 0444, version_show, NULL); static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL); @@ -712,7 +715,9 @@ static int mc_cpu_down_prep(unsigned int cpu) } static struct attribute *cpu_root_microcode_attrs[] = { +#ifdef CONFIG_MICROCODE_LATE_LOADING &dev_attr_reload.attr, +#endif NULL }; -- cgit From d23d33ea0fcdc4bbb484990bf53867f99c63ccab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 May 2022 18:12:31 +0200 Subject: x86/microcode: Taint and warn on late loading Warn before it is attempted and taint the kernel. Late loading microcode can lead to malfunction of the kernel when the microcode update changes behaviour. There is no way for the kernel to determine whether its safe or not. 
Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220525161232.14924-4-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index c717db6b6856..801b44ac3851 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -493,6 +493,9 @@ static int microcode_reload_late(void) { int ret; + pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); + pr_err("You should switch to early loading, if possible.\n"); + atomic_set(&late_cpus_in, 0); atomic_set(&late_cpus_out, 0); @@ -541,6 +544,8 @@ put: if (ret == 0) ret = size; + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + return ret; } -- cgit From 0c0fe08c76485fe0178ebb0fa1a2052c727abe94 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 May 2022 18:12:32 +0200 Subject: x86/microcode: Remove unnecessary perf callback c93dc84cbe32 ("perf/x86: Add a microcode revision check for SNB-PEBS") checks whether the microcode revision has fixed PEBS issues. This can happen either: 1. At PEBS init time, where the early microcode has been loaded already 2. During late loading, in the microcode_check() callback. So remove the unnecessary call in the microcode loader init routine. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220525161232.14924-5-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 801b44ac3851..ad57e0e4d674 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -756,10 +756,7 @@ static int __init microcode_init(void) cpus_read_lock(); mutex_lock(&microcode_mutex); - error = subsys_interface_register(&mc_cpu_interface); - if (!error) - perf_check_microcode(); mutex_unlock(&microcode_mutex); cpus_read_unlock(); -- cgit From 41925b105e345ebc84cedb64f59d20cb14a62613 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 30 May 2022 10:26:34 +0200 Subject: xen: replace xen_remap() with memremap() xen_remap() is used to establish mappings for frames not under direct control of the kernel: for Xenstore and console ring pages, and for grant pages of non-PV guests. Today xen_remap() is defined to use ioremap() on x86 (doing uncached mappings), and ioremap_cache() on Arm (doing cached mappings). Uncached mappings for those use cases are bad for performance, so they should be avoided if possible. As all use cases of xen_remap() don't require uncached mappings (the mapped area is always physical RAM), a mapping using the standard WB cache mode is fine. As sparse is flagging some of the xen_remap() use cases to be not appropriate for iomem(), as the result is not annotated with the __iomem modifier, eliminate xen_remap() completely and replace all use cases with memremap() specifying the MEMREMAP_WB caching mode. xen_unmap() can be replaced with memunmap().
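To make the conversion concrete, here is a minimal sketch of an affected mapping helper after the change (the helper name and the one-page mapping size are illustrative assumptions, not taken from the patch; memremap(), memunmap() and MEMREMAP_WB are the interfaces the patch actually switches to):

#include <linux/io.h>	/* memremap(), memunmap(), MEMREMAP_WB */

/* Illustrative helper: map one page of a shared ring that lives in normal RAM. */
static void *map_shared_ring(phys_addr_t paddr)
{
	/* Previously ioremap(paddr, PAGE_SIZE) on x86, i.e. an uncached mapping. */
	return memremap(paddr, PAGE_SIZE, MEMREMAP_WB);	/* cached, write-back */
}

static void unmap_shared_ring(void *ring)
{
	memunmap(ring);		/* previously iounmap(ring) */
}

Because memremap() returns a plain kernel pointer rather than an __iomem cookie, the sparse complaints about missing __iomem annotations go away together with the uncached mappings.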
Reported-by: kernel test robot Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Acked-by: Stefano Stabellini Link: https://lore.kernel.org/r/20220530082634.6339-1-jgross@suse.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/page.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e989bc2269f5..777e6b21efe9 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -347,9 +347,6 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); -#define xen_remap(cookie, size) ioremap((cookie), (size)) -#define xen_unmap(cookie) iounmap((cookie)) - static inline bool xen_arch_need_swiotlb(struct device *dev, phys_addr_t phys, dma_addr_t dev_addr) -- cgit From ca209f8b5f61b74782dd4275ebc7173d92cb4905 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 1 Jun 2022 13:50:43 +0200 Subject: efi: x86: Fix config name for setting the NX-compatibility flag in the PE header Commit 21b68da7bf4a ("efi: x86: Set the NX-compatibility flag in the PE header") intends to set the compatibility flag, i.e., IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, but this ifdef is actually dead as the CONFIG_DXE_MEM_ATTRIBUTES Kconfig option does not exist. The config is actually called EFI_DXE_MEM_ATTRIBUTES. Adjust the ifdef to use the intended config name. The issue was identified with ./scripts/checkkconfigsymbols.py. Fixes: 21b68da7bf4a ("efi: x86: Set the NX-compatibility flag in the PE header") Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20220601115043.7678-1-lukas.bulwahn@gmail.com Signed-off-by: Ard Biesheuvel --- arch/x86/boot/header.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 0352e4589efa..f912d7770130 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -163,7 +163,7 @@ extra_header_fields: .long 0x200 # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) -#ifdef CONFIG_DXE_MEM_ATTRIBUTES +#ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics #else .word 0 # DllCharacteristics -- cgit From 31f1a0edff78c43e8a3bd3692af0db1b25c21b17 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 20 May 2022 12:07:58 +0200 Subject: efi/x86: libstub: Make DXE calls mixed mode safe The newly added DXE calls use 64-bit quantities, which means we need to marshall them explicitly when running in mixed mode. Currently, we get away without it because we just bail when GetMemorySpaceDescriptor() fails, which is guaranteed to happen due to the function argument mixup. Let's fix this properly, though, by defining the macros that describe how to marshall the arguments. While at it, drop an incorrect cast on a status variable. 
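As a rough illustration of what the new argument maps have to achieve (the helper below is hypothetical and only mirrors the low-first/high-second split that the __efi64_split() macro in the diff performs): a mixed-mode call runs with the 32-bit calling convention, so every 64-bit parameter must be passed as two 32-bit halves.

#include <linux/types.h>

/* Hypothetical helper showing the split done by __efi64_split(val):
 * the low 32 bits go into the first argument slot, the high 32 bits
 * into the second. */
static inline void efi64_split_example(u64 phys, u32 *lo, u32 *hi)
{
	*lo = (u32)(phys & 0xffffffffU);	/* first argument slot */
	*hi = (u32)(phys >> 32);		/* second argument slot */
}

Without such a mapping, GetMemorySpaceDescriptor() receives a scrambled physical address, which is why the call was guaranteed to fail in mixed mode before this change.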
Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index bed74a0f2932..71943dce691e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -270,6 +270,8 @@ static inline u32 efi64_convert_status(efi_status_t status) return (u32)(status | (u64)status >> 32); } +#define __efi64_split(val) (val) & U32_MAX, (u64)(val) >> 32 + #define __efi64_argmap_free_pages(addr, size) \ ((addr), 0, (size)) @@ -317,6 +319,13 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_hash_log_extend_event(prot, fl, addr, size, ev) \ ((prot), (fl), 0ULL, (u64)(addr), 0ULL, (u64)(size), 0ULL, ev) +/* DXE services */ +#define __efi64_argmap_get_memory_space_descriptor(phys, desc) \ + (__efi64_split(phys), (desc)) + +#define __efi64_argmap_set_memory_space_descriptor(phys, size, flags) \ + (__efi64_split(phys), __efi64_split(size), __efi64_split(flags)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro -- cgit From b3e34a47f98974d0844444c5121aaff123004e57 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 23 Feb 2022 19:32:24 +0800 Subject: x86/kexec: fix memory leak of elf header buffer This is reported by kmemleak detector: unreferenced object 0xffffc900002a9000 (size 4096): comm "kexec", pid 14950, jiffies 4295110793 (age 373.951s) hex dump (first 32 bytes): 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 .ELF............ 04 00 3e 00 01 00 00 00 00 00 00 00 00 00 00 00 ..>............. backtrace: [<0000000016a8ef9f>] __vmalloc_node_range+0x101/0x170 [<000000002b66b6c0>] __vmalloc_node+0xb4/0x160 [<00000000ad40107d>] crash_prepare_elf64_headers+0x8e/0xcd0 [<0000000019afff23>] crash_load_segments+0x260/0x470 [<0000000019ebe95c>] bzImage64_load+0x814/0xad0 [<0000000093e16b05>] arch_kexec_kernel_image_load+0x1be/0x2a0 [<000000009ef2fc88>] kimage_file_alloc_init+0x2ec/0x5a0 [<0000000038f5a97a>] __do_sys_kexec_file_load+0x28d/0x530 [<0000000087c19992>] do_syscall_64+0x3b/0x90 [<0000000066e063a4>] entry_SYSCALL_64_after_hwframe+0x44/0xae In crash_prepare_elf64_headers(), a buffer is allocated via vmalloc() to store elf headers. However, it is not freed back to the system correctly when the kdump kernel is reloaded or unloaded, so the memory is leaked. Fix it by introducing the x86-specific function arch_kimage_file_post_load_cleanup() and freeing the buffer there. Also remove the incorrect elf header buffer freeing code. Before the arch-specific kexec_file loading function is called, the image instance has already been initialized, so 'image->elf_headers' must be NULL. It doesn't make sense to free the elf header buffer there. Three different people have reported three bugs about the memory leak on x86_64 inside Red Hat.
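Distilled from the diff that follows, the shape of the fix is small (a sketch only, not the verbatim patch context): release the vmalloc()ed ELF header buffer in the x86 post-load cleanup hook and then chain to the generic cleanup, instead of trying to free it in the image-load path.

#include <linux/kexec.h>
#include <linux/vmalloc.h>

int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
	vfree(image->elf_headers);	/* buffer from crash_prepare_elf64_headers() */
	image->elf_headers = NULL;
	image->elf_headers_sz = 0;

	return kexec_image_post_load_cleanup_default(image);
}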
Link: https://lkml.kernel.org/r/20220223113225.63106-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Dave Young Cc: Signed-off-by: Andrew Morton --- arch/x86/kernel/machine_kexec_64.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 566bb8e17149..0611fd83858e 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -376,9 +376,6 @@ void machine_kexec(struct kimage *image) #ifdef CONFIG_KEXEC_FILE void *arch_kexec_kernel_image_load(struct kimage *image) { - vfree(image->elf_headers); - image->elf_headers = NULL; - if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -514,6 +511,15 @@ overflow: (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} #endif /* CONFIG_KEXEC_FILE */ static int -- cgit From 0c9782e204d3cc5625b9e8bf4e8625d38dfe0139 Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Fri, 20 May 2022 10:42:47 -0700 Subject: x86/sgx: Set active memcg prior to shmem allocation When the system runs out of enclave memory, SGX can reclaim EPC pages by swapping to normal RAM. These backing pages are allocated via a per-enclave shared memory area. Since SGX allows unlimited over commit on EPC memory, the reclaimer thread can allocate a large number of backing RAM pages in response to EPC memory pressure. When the shared memory backing RAM allocation occurs during the reclaimer thread context, the shared memory is charged to the root memory control group, and the shmem usage of the enclave is not properly accounted for, making cgroups ineffective at limiting the amount of RAM an enclave can consume. For example, when using a cgroup to launch a set of test enclaves, the kernel does not properly account for 50% - 75% of shmem page allocations on average. In the worst case, when nearly all allocations occur during the reclaimer thread, the kernel accounts less than a percent of the amount of shmem used by the enclave's cgroup to the correct cgroup. SGX stores a list of mm_structs that are associated with an enclave. Pick one of them during reclaim and charge that mm's memcg with the shmem allocation. The one that gets picked is arbitrary, but this list almost always only has one mm. The cases where there is more than one mm with different memcg's are not worth considering. Create a new function - sgx_encl_alloc_backing(). This function is used whenever a new backing storage page needs to be allocated. Previously the same function was used for page allocation as well as retrieving a previously allocated page. Prior to backing page allocation, if there is a mm_struct associated with the enclave that is requesting the allocation, it is set as the active memory control group. 
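Before the full diff below, the core of the accounting change in isolation (a simplified sketch: the mm_list walk, SRCU locking and error handling are omitted, and the function name is shortened for the example; sgx_encl_get_mem_cgroup(), sgx_encl_get_backing(), set_active_memcg() and mem_cgroup_put() are the calls used by the patch):

#include <linux/memcontrol.h>
#include <linux/sched/mm.h>

/* Sketch: charge a backing (shmem) allocation made from reclaimer context
 * to the enclave's memcg instead of the root memcg. */
static int alloc_backing_sketch(struct sgx_encl *encl, unsigned long page_index,
				struct sgx_backing *backing)
{
	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *saved = set_active_memcg(encl_memcg);
	int ret;

	/* shmem pages allocated here are charged to encl_memcg */
	ret = sgx_encl_get_backing(encl, page_index, backing);

	set_active_memcg(saved);	/* restore the previous active memcg */
	mem_cgroup_put(encl_memcg);	/* drop the reference taken by the lookup */

	return ret;
}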
[ dhansen: - fix merge conflict with ELDU fixes - check against actual ksgxd_tsk, not ->mm ] Cc: stable@vger.kernel.org Signed-off-by: Kristen Carlson Accardi Signed-off-by: Dave Hansen Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Link: https://lkml.kernel.org/r/20220520174248.4918-1-kristen@linux.intel.com --- arch/x86/kernel/cpu/sgx/encl.c | 105 ++++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/sgx/encl.h | 7 ++- arch/x86/kernel/cpu/sgx/main.c | 9 +++- 3 files changed, 115 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 3c24e6124d95..19876ebfb504 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -152,7 +152,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); - ret = sgx_encl_get_backing(encl, page_index, &b); + ret = sgx_encl_lookup_backing(encl, page_index, &b); if (ret) return ret; @@ -718,7 +718,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, * 0 on success, * -errno otherwise. */ -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, +static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); @@ -743,6 +743,107 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, return 0; } +/* + * When called from ksgxd, returns the mem_cgroup of a struct mm stored + * in the enclave's mm_list. When not called from ksgxd, just returns + * the mem_cgroup of the current task. + */ +static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) +{ + struct mem_cgroup *memcg = NULL; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * If called from normal task context, return the mem_cgroup + * of the current task's mm. The remainder of the handling is for + * ksgxd. + */ + if (!current_is_ksgxd()) + return get_mem_cgroup_from_mm(current->mm); + + /* + * Search the enclave's mm_list to find an mm associated with + * this enclave to charge the allocation to. + */ + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + memcg = get_mem_cgroup_from_mm(encl_mm->mm); + + mmput_async(encl_mm->mm); + + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + /* + * In the rare case that there isn't an mm associated with + * the enclave, set memcg to the current active mem_cgroup. + * This will be the root mem_cgroup if there is no active + * mem_cgroup. + */ + if (!memcg) + return get_mem_cgroup_from_mm(NULL); + + return memcg; +} + +/** + * sgx_encl_alloc_backing() - allocate a new backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * When called from ksgxd, sets the active memcg from one of the + * mms in the enclave's mm_list prior to any backing page allocation, + * in order to ensure that shmem page allocations are charged to the + * enclave. + * + * Return: + * 0 on success, + * -errno otherwise. 
+ */ +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl); + struct mem_cgroup *memcg = set_active_memcg(encl_memcg); + int ret; + + ret = sgx_encl_get_backing(encl, page_index, backing); + + set_active_memcg(memcg); + mem_cgroup_put(encl_memcg); + + return ret; +} + +/** + * sgx_encl_lookup_backing() - retrieve an existing backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Retrieve a backing page for loading data back into an EPC page with ELDU. + * It is the caller's responsibility to ensure that it is appropriate to use + * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is + * not used correctly, this will cause an allocation which is not accounted for. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + return sgx_encl_get_backing(encl, page_index, backing); +} + /** * sgx_encl_put_backing() - Unpin the backing storage * @backing: data for accessing backing storage for the page diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d44e7372151f..332ef3568267 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -103,10 +103,13 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_flags); +bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); +int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index ab4ec54bbdd9..a78652d43e61 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -313,7 +313,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { - ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), + ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size), &secs_backing); if (ret) goto out; @@ -384,7 +384,7 @@ static void sgx_reclaim_pages(void) page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); mutex_lock(&encl_page->encl->lock); - ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); + ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]); if (ret) { mutex_unlock(&encl_page->encl->lock); goto skip; @@ -475,6 +475,11 @@ static bool __init sgx_page_reclaimer_init(void) return true; } +bool current_is_ksgxd(void) +{ + return current == ksgxd_tsk; +} + static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { struct sgx_numa_node *node = &sgx_numa_nodes[nid]; -- cgit From 4aec74bccf5d047347bdfef3638e3031dd75c0a0 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 23 
Jan 2022 10:38:46 -0800 Subject: arch/x86: replace cpumask_weight with cpumask_empty where appropriate In some cases, arch/x86 code calls cpumask_weight() to check if any bit of a given cpumask is set. We can do it more efficiently with cpumask_empty() because cpumask_empty() stops traversing the cpumask as soon as it finds first set bit, while cpumask_weight() counts all bits unconditionally. Signed-off-by: Yury Norov Reviewed-by: Steve Wahl --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 14 +++++++------- arch/x86/mm/mmio-mod.c | 2 +- arch/x86/platform/uv/uv_nmi.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 83f901e2c2df..f276aff521e8 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Give any dropped cpus to parent rdtgroup */ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); update_closid_rmid(tmpmask, prgrp); @@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu rmid */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { if (crgrp == rdtgrp) @@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Can't drop from default group */ if (rdtgrp == &rdtgroup_default) { rdt_last_cmd_puts("Can't drop CPUs from default group\n"); @@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu closid/rmid. 
*/ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { if (r == rdtgrp) continue; cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (cpumask_weight(tmpmask1)) + if (!cpumask_empty(tmpmask1)) cpumask_rdtgrp_clear(r, tmpmask1); } update_closid_rmid(tmpmask, rdtgrp); @@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { ret = -EINVAL; rdt_last_cmd_puts("Can only assign online CPUs\n"); goto unlock; diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 933a2ebad471..c3317f0650d8 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -400,7 +400,7 @@ static void leave_uniprocessor(void) int cpu; int err; - if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) + if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus)) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 1e9ff28bc2e0..ea277fc08357 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -985,7 +985,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) /* Clear global flags */ if (master) { - if (cpumask_weight(uv_nmi_cpu_mask)) + if (!cpumask_empty(uv_nmi_cpu_mask)) uv_nmi_cleanup_mask(); atomic_set(&uv_nmi_cpus_in_nmi, -1); atomic_set(&uv_nmi_cpu, -1); -- cgit From dcf23cca930d1a60f7cd6b3a245a5081d77b8081 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 23 Jan 2022 10:38:57 -0800 Subject: arch/x86: replace nodes_weight with nodes_empty where appropriate mm code calls nodes_weight() to check if any bit of a given nodemask is set. We can do it more efficiently with nodes_empty() because nodes_empty() stops traversing the nodemask as soon as it finds first set bit, while nodes_weight() counts all bits unconditionally. Signed-off-by: Yury Norov --- arch/x86/mm/amdtopology.c | 2 +- arch/x86/mm/numa_emulation.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 058b2f36b3a6..b3ca7d23e4b0 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -154,7 +154,7 @@ int __init amd_numa_init(void) node_set(nodeid, numa_nodes_parsed); } - if (!nodes_weight(numa_nodes_parsed)) + if (nodes_empty(numa_nodes_parsed)) return -ENOENT; /* diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 1a02b791d273..9a9305367fdd 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. */ - while (nodes_weight(physnode_mask)) { + while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; @@ -270,7 +270,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. 
*/ - while (nodes_weight(physnode_mask)) { + while (!nodes_empty(physnode_mask)) { for_each_node_mask(i, physnode_mask) { u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; -- cgit From a7ef9b455c7ca8f07a5b4bd967a3c39c7434d43f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 19 May 2022 10:15:04 -0700 Subject: KVM: x86: hyper-v: fix type of valid_bank_mask In kvm_hv_flush_tlb(), valid_bank_mask is declared as unsigned long, but is used as u64, which is wrong for i386, and has been spotted by LKP after applying "KVM: x86: hyper-v: replace bitmap_weight() with hweight64()" https://lore.kernel.org/lkml/20220510154750.212913-12-yury.norov@gmail.com/ But it's wrong even without that patch because now bitmap_weight() dereferences a word after valid_bank_mask on i386. >> include/asm-generic/bitops/const_hweight.h:21:76: warning: right shift count >= width of type +[-Wshift-count-overflow] 21 | #define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32)) | ^~ include/asm-generic/bitops/const_hweight.h:10:16: note: in definition of macro '__const_hweight8' 10 | ((!!((w) & (1ULL << 0))) + \ | ^ include/asm-generic/bitops/const_hweight.h:20:31: note: in expansion of macro '__const_hweight16' 20 | #define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16)) | ^~~~~~~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:21:54: note: in expansion of macro '__const_hweight32' 21 | #define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32)) | ^~~~~~~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:29:49: note: in expansion of macro '__const_hweight64' 29 | #define hweight64(w) (__builtin_constant_p(w) ? __const_hweight64(w) : __arch_hweight64(w)) | ^~~~~~~~~~~~~~~~~ arch/x86/kvm/hyperv.c:1983:36: note: in expansion of macro 'hweight64' 1983 | if (hc->var_cnt != hweight64(valid_bank_mask)) | ^~~~~~~~~ CC: Borislav Petkov CC: Dave Hansen CC: H. 
Peter Anvin CC: Ingo Molnar CC: Jim Mattson CC: Joerg Roedel CC: Paolo Bonzini CC: Sean Christopherson CC: Thomas Gleixner CC: Vitaly Kuznetsov CC: Wanpeng Li CC: kvm@vger.kernel.org CC: linux-kernel@vger.kernel.org CC: x86@kernel.org Reported-by: kernel test robot Signed-off-by: Yury Norov Message-Id: <20220519171504.1238724-1-yury.norov@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c964923a7684..18325e0ae7bc 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1938,7 +1938,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); - unsigned long valid_bank_mask; + u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; @@ -1980,7 +1980,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; - if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64)) + if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) -- cgit From d603fd8dd35f6028bb09cd2e9ec6557c4bc0dd95 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 9 May 2022 18:54:23 -0700 Subject: KVM: x86: hyper-v: replace bitmap_weight() with hweight64() kvm_hv_flush_tlb() applies bitmap API to a u64 variable valid_bank_mask. Since valid_bank_mask has a fixed size, we can use hweight64() and avoid excessive bloating. CC: Borislav Petkov CC: Dave Hansen CC: H. Peter Anvin CC: Ingo Molnar CC: Jim Mattson CC: Joerg Roedel CC: Paolo Bonzini CC: Sean Christopherson CC: Thomas Gleixner CC: Vitaly Kuznetsov CC: Wanpeng Li CC: kvm@vger.kernel.org CC: linux-kernel@vger.kernel.org CC: x86@kernel.org Acked-by: Vitaly Kuznetsov Signed-off-by: Yury Norov --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 18325e0ae7bc..c8ca95d4e4e9 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1879,7 +1879,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) + if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) @@ -1980,7 +1980,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; - if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) + if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) -- cgit
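Taken together, the two fixes above reduce to a simple pattern for fixed-width masks (an illustrative snippet, not taken from the patches):

#include <linux/bitops.h>	/* hweight64() */
#include <linux/types.h>

static unsigned int count_valid_banks(u64 valid_bank_mask)
{
	/*
	 * The mask must be a u64: on i386 an unsigned long is only 32 bits,
	 * so declaring it unsigned long and doing a 64-bit bitmap_weight()
	 * on its address reads one word past the variable. For a fixed-width
	 * word, hweight64() is also cheaper than the bitmap API and needs
	 * no (unsigned long *) cast.
	 */
	return hweight64(valid_bank_mask);
}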