From 9a62d20027da3164a22244d9f022c0c987261687 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 26 Nov 2019 11:09:42 +0100 Subject: x86/mm/32: Sync only to VMALLOC_END in vmalloc_sync_all() The job of vmalloc_sync_all() is to help the lazy freeing of vmalloc() ranges: before such vmap ranges are reused we make sure that they are unmapped from every task's page tables. This is really easy on pagetable setups where the kernel page tables are shared between all tasks - this is the case on 32-bit kernels with SHARED_KERNEL_PMD = 1. But on !SHARED_KERNEL_PMD 32-bit kernels this involves iterating over the pgd_list and clearing all pmd entries in the pgds that are cleared in the init_mm.pgd, which is the reference pagetable that the vmalloc() code uses. In that context the current practice of vmalloc_sync_all() iterating until FIXADDR_TOP is buggy: for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE_MAX && address < FIXADDR_TOP; address += PMD_SIZE) { struct page *page; Because iterating up to FIXADDR_TOP will involve a lot of non-vmalloc address ranges: VMALLOC -> PKMAP -> LDT -> CPU_ENTRY_AREA -> FIX_ADDR This is mostly harmless for the FIX_ADDR and CPU_ENTRY_AREA ranges that don't clear their pmds, but it's lethal for the LDT range, which relies on having different mappings in different processes, and 'synchronizing' them in the vmalloc sense corrupts those pagetable entries (clearing them). This got particularly prominent with PTI, which turns SHARED_KERNEL_PMD off and makes this the dominant mapping mode on 32-bit. To make the LDT work again, vmalloc_sync_all() must only iterate over the volatile parts of the kernel address range that are identical between all processes. So the correct check in vmalloc_sync_all() is "address < VMALLOC_END" to make sure the VMALLOC areas are synchronized and the LDT mapping is not falsely overwritten. The CPU_ENTRY_AREA and the FIXMAP area are no longer synced either, but this is not really a problem since their PMDs get established during bootup and never change. This change fixes the ldt_gdt selftest in my setup. [ mingo: Fixed up the changelog to explain the logic and modified the copying to only happen up until VMALLOC_END. ] Reported-by: Borislav Petkov Tested-by: Borislav Petkov Signed-off-by: Joerg Roedel Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hpa@zytor.com Fixes: 7757d607c6b3: ("x86/pti: Allow CONFIG_PAGE_TABLE_ISOLATION for x86_32") Link: https://lkml.kernel.org/r/20191126111119.GA110513@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9ceacd1156db..304d31d8cbbc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -197,7 +197,7 @@ void vmalloc_sync_all(void) return; for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < FIXADDR_TOP; + address >= TASK_SIZE_MAX && address < VMALLOC_END; address += PMD_SIZE) { struct page *page; -- cgit From 93efbde2c331004d8053f04b4bf0ca3e630b474a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 20 Nov 2019 22:12:38 -0800 Subject: x86/traps: Disentangle the 32-bit and 64-bit doublefault code The 64-bit doublefault handler is much nicer than the 32-bit one. As a first step toward unifying them, make the 64-bit handler self-contained. 
This should have no functional effect except in the odd case of x86_64 with CONFIG_DOUBLEFAULT=n, in which case it will change the logging a bit. This also gets rid of CONFIG_DOUBLEFAULT configurability on 64-bit kernels. It didn't do anything useful -- CONFIG_DOUBLEFAULT=n didn't actually disable doublefault handling on x86_64. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/doublefault.c | 11 ----------- arch/x86/kernel/traps.c | 12 +++--------- 4 files changed, 4 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 409c00f74e60..c4eab8ed33a3 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -117,7 +117,7 @@ config DEBUG_WX config DOUBLEFAULT default y - bool "Enable doublefault exception handler" if EXPERT + bool "Enable doublefault exception handler" if EXPERT && X86_32 ---help--- This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot. Disabling this diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e51afbb0cbfb..f6c630097d9f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -997,7 +997,6 @@ bool xen_set_default_idle(void); #endif void stop_this_cpu(void *dummy); -void df_debug(struct pt_regs *regs, long error_code); void microcode_check(void); enum l1tf_mitigations { diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0d6c657593f8..0b3c616b61a3 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -72,15 +72,4 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .__cr3 = __pa_nodebug(swapper_pg_dir), }; -/* dummy for do_double_fault() call */ -void df_debug(struct pt_regs *regs, long error_code) {} - -#else /* !CONFIG_X86_32 */ - -void df_debug(struct pt_regs *regs, long error_code) -{ - pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); - show_regs(regs); - panic("Machine halted."); -} #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c90312146da0..76381b04dc93 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -411,15 +411,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); #endif -#ifdef CONFIG_DOUBLEFAULT - df_debug(regs, error_code); -#endif - /* - * This is always a kernel trap and never fixable (and thus must - * never return). - */ - for (;;) - die(str, regs, error_code); + pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); + show_regs(regs); + panic("Machine halted."); } #endif -- cgit From e99b6f46ee5c127d39d2f3a2682fdeef10386316 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 21 Nov 2019 09:42:30 -0800 Subject: x86/doublefault/32: Rename doublefault.c to doublefault_32.c doublefault.c now only contains 32-bit code. Rename it to doublefault_32.c. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 4 ++- arch/x86/kernel/doublefault.c | 75 ---------------------------------------- arch/x86/kernel/doublefault_32.c | 71 +++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 76 deletions(-) delete mode 100644 arch/x86/kernel/doublefault.c create mode 100644 arch/x86/kernel/doublefault_32.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 32acb970f416..6175e370ee4a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,7 +100,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_DOUBLEFAULT) += doublefault.o +ifeq ($(CONFIG_X86_32),y) +obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o +endif obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c deleted file mode 100644 index 0b3c616b61a3..000000000000 --- a/arch/x86/kernel/doublefault.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef CONFIG_X86_32 - -#define DOUBLEFAULT_STACKSIZE (1024) -static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; -#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) - -#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) - -static void doublefault_fn(void) -{ - struct desc_ptr gdt_desc = {0, 0}; - unsigned long gdt, tss; - - native_store_gdt(&gdt_desc); - gdt = gdt_desc.address; - - printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); - - if (ptr_ok(gdt)) { - gdt += GDT_ENTRY_TSS << 3; - tss = get_desc_base((struct desc_struct *)gdt); - printk(KERN_EMERG "double fault, tss at %08lx\n", tss); - - if (ptr_ok(tss)) { - struct x86_hw_tss *t = (struct x86_hw_tss *)tss; - - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", - t->ip, t->sp); - - printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->ax, t->bx, t->cx, t->dx); - printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->si, t->di); - } - } - - for (;;) - cpu_relax(); -} - -struct x86_hw_tss doublefault_tss __cacheline_aligned = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, -#ifndef CONFIG_X86_32_LAZY_GS - .gs = __KERNEL_STACK_CANARY, -#endif - - .__cr3 = __pa_nodebug(swapper_pg_dir), -}; - -#endif diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c new file mode 100644 index 000000000000..61c707ca8a09 --- /dev/null +++ b/arch/x86/kernel/doublefault_32.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DOUBLEFAULT_STACKSIZE (1024) +static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; +#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) + +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) + 
+static void doublefault_fn(void) +{ + struct desc_ptr gdt_desc = {0, 0}; + unsigned long gdt, tss; + + native_store_gdt(&gdt_desc); + gdt = gdt_desc.address; + + printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); + + if (ptr_ok(gdt)) { + gdt += GDT_ENTRY_TSS << 3; + tss = get_desc_base((struct desc_struct *)gdt); + printk(KERN_EMERG "double fault, tss at %08lx\n", tss); + + if (ptr_ok(tss)) { + struct x86_hw_tss *t = (struct x86_hw_tss *)tss; + + printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", + t->ip, t->sp); + + printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", + t->ax, t->bx, t->cx, t->dx); + printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", + t->si, t->di); + } + } + + for (;;) + cpu_relax(); +} + +struct x86_hw_tss doublefault_tss __cacheline_aligned = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, + + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif + + .__cr3 = __pa_nodebug(swapper_pg_dir), +}; -- cgit From dc4e0021b00b5a4ecba56fae509217776592b0aa Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Nov 2019 18:27:16 +0100 Subject: x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area There are three problems with the current layout of the doublefault stack and TSS. First, the TSS is only cacheline-aligned, which is not enough -- if the hardware portion of the TSS (struct x86_hw_tss) crosses a page boundary, horrible things happen [0]. Second, the stack and TSS are global, so simultaneous double faults on different CPUs will cause massive corruption. Third, the whole mechanism won't work if user CR3 is loaded, resulting in a triple fault [1]. Let the doublefault stack and TSS share a page (which prevents the TSS from spanning a page boundary), make it percpu, and move it into cpu_entry_area. Teach the stack dump code about the doublefault stack. [0] Real hardware will read past the end of the page onto the next *physical* page if a task switch happens. Virtual machines may have any number of bugs, and I would consider it reasonable for a VM to summarily kill the guest if it tries to task-switch to a page-spanning TSS. [1] Real hardware triple faults. At least some VMs seem to hang. I'm not sure what's going on. 
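As an illustration of the fix, the layout added by the patch below boils down to one page-sized, page-aligned structure per CPU. This is a simplified sketch of the cpu_entry_area.h hunk further down, not additional code:

	struct doublefault_stack {
		/* The emergency stack fills whatever the TSS does not use. */
		unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
		/* Hardware TSS used by the #DF task gate, at the end of the page. */
		struct x86_hw_tss tss;
	} __aligned(PAGE_SIZE);

Because the structure is exactly one page and starts on a page boundary, the hardware TSS at its end can never straddle two pages, and giving each CPU its own instance in cpu_entry_area keeps simultaneous double faults on different CPUs from clobbering the same stack. The patch also adds a BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE) so the exactly-one-page assumption is enforced at build time.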
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu_entry_area.h | 12 +++++++ arch/x86/include/asm/doublefault.h | 13 ++++++++ arch/x86/include/asm/pgtable_32_types.h | 7 ++-- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/cpu/common.c | 12 ++----- arch/x86/kernel/doublefault_32.c | 58 ++++++++++++++++++++++----------- arch/x86/kernel/dumpstack_32.c | 30 +++++++++++++++++ arch/x86/mm/cpu_entry_area.c | 14 +++++++- 8 files changed, 113 insertions(+), 34 deletions(-) create mode 100644 arch/x86/include/asm/doublefault.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index ea866c7bf31d..804734058c77 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -65,6 +65,13 @@ enum exception_stack_ordering { #endif +#ifdef CONFIG_X86_32 +struct doublefault_stack { + unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)]; + struct x86_hw_tss tss; +} __aligned(PAGE_SIZE); +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -86,6 +93,11 @@ struct cpu_entry_area { #endif struct entry_stack_page entry_stack_page; +#ifdef CONFIG_X86_32 + char guard_doublefault_stack[PAGE_SIZE]; + struct doublefault_stack doublefault_stack; +#endif + /* * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because * we need task switches to work, and task switches write to the TSS. diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h new file mode 100644 index 000000000000..af9a14ac8962 --- /dev/null +++ b/arch/x86/include/asm/doublefault.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_DOUBLEFAULT_H +#define _ASM_X86_DOUBLEFAULT_H + +#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +extern void doublefault_init_cpu_tss(void); +#else +static inline void doublefault_init_cpu_tss(void) +{ +} +#endif + +#endif /* _ASM_X86_DOUBLEFAULT_H */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 19f5807260c3..0416d42e5bdd 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #endif /* - * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c - * to avoid include recursion hell + * This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE. + * Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c + * to avoid include recursion hell. 
*/ -#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 41) +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 43) /* The +1 is for the readonly IDT page: */ #define CPU_ENTRY_AREA_BASE \ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f6c630097d9f..0340aad3f2fc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -166,7 +166,6 @@ enum cpuid_regs_idx { extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; -extern struct x86_hw_tss doublefault_tss; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index baa2fed8deb6..2e4d90294fe6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss) tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } -static inline void gdt_setup_doublefault_tss(int cpu) { } - #else /* CONFIG_X86_64 */ static inline void setup_getcpu(int cpu) { } @@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu) static inline void tss_setup_ist(struct tss_struct *tss) { } -static inline void gdt_setup_doublefault_tss(int cpu) -{ -#ifdef CONFIG_DOUBLEFAULT - /* Set up the doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -#endif -} #endif /* !CONFIG_X86_64 */ static inline void tss_setup_io_bitmap(struct tss_struct *tss) @@ -1923,7 +1915,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - gdt_setup_doublefault_tss(cpu); + doublefault_init_cpu_tss(); fpu__init_cpu(); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 61c707ca8a09..4eecfe4825ed 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -10,10 +10,6 @@ #include #include -#define DOUBLEFAULT_STACKSIZE (1024) -static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; -#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) - #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) static void doublefault_fn(void) @@ -21,6 +17,8 @@ static void doublefault_fn(void) struct desc_ptr gdt_desc = {0, 0}; unsigned long gdt, tss; + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); + native_store_gdt(&gdt_desc); gdt = gdt_desc.address; @@ -48,24 +46,46 @@ static void doublefault_fn(void) cpu_relax(); } -struct x86_hw_tss doublefault_tss __cacheline_aligned = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, +DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { + .tss = { + /* + * No sp0 or ss0 -- we never run CPL != 0 with this TSS + * active. sp is filled in later. 
+ */ + .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, #ifndef CONFIG_X86_32_LAZY_GS - .gs = __KERNEL_STACK_CANARY, + .gs = __KERNEL_STACK_CANARY, #endif - .__cr3 = __pa_nodebug(swapper_pg_dir), + .__cr3 = __pa_nodebug(swapper_pg_dir), + }, }; + +void doublefault_init_cpu_tss(void) +{ + unsigned int cpu = smp_processor_id(); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + /* + * The linker isn't smart enough to initialize percpu variables that + * point to other places in percpu space. + */ + this_cpu_write(doublefault_stack.tss.sp, + (unsigned long)&cea->doublefault_stack.stack + + sizeof(doublefault_stack.stack)); + + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 64a59d726639..8e3a8fedfa4d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_ENTRY) return "ENTRY_TRAMPOLINE"; + if (type == STACK_TYPE_EXCEPTION) + return "#DF"; + return NULL; } @@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) return true; } +static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) +{ +#ifdef CONFIG_DOUBLEFAULT + struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); + struct doublefault_stack *ss = &cea->doublefault_stack; + + void *begin = ss->stack; + void *end = begin + sizeof(ss->stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_EXCEPTION; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); + + return true; +#else + return false; +#endif +} + + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { @@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_softirq_stack(stack, info)) goto recursion_check; + if (in_doublefault_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 82ead8e27888..56f9189bbadb 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif +#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); +#endif + struct cpu_entry_area *get_cpu_entry_area(int cpu) { unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; @@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) cea_map_stack(MCE); } #else -static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +static inline void 
percpu_setup_exception_stacks(unsigned int cpu) +{ +#ifdef CONFIG_DOUBLEFAULT + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + cea_map_percpu_pages(&cea->doublefault_stack, + &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); +#endif +} #endif /* Setup the fixmap mappings only once per-processor */ -- cgit From 7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 20 Nov 2019 23:06:41 -0800 Subject: x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit The old x86_32 doublefault_fn() was old and crufty, and it did not even try to recover. do_double_fault() is much nicer. Rewrite the 32-bit double fault code to sanitize CPU state and call do_double_fault(). This is mostly an exercise in i386 archaeology. With this patch applied, 32-bit double faults get a real stack trace, just like 64-bit double faults. [ mingo: merged the patch to a later kernel base. ] Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 42 +++++++++++++++ arch/x86/include/asm/traps.h | 3 ++ arch/x86/kernel/doublefault_32.c | 107 +++++++++++++++++++++++++++------------ arch/x86/kernel/traps.c | 19 ++++++- 4 files changed, 138 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 5832b11f01bb..632432bb723d 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1537,6 +1537,48 @@ SYM_CODE_START(debug) jmp common_exception SYM_CODE_END(debug) +#ifdef CONFIG_DOUBLEFAULT +SYM_CODE_START(double_fault) +1: + /* + * This is a task gate handler, not an interrupt gate handler. + * The error code is on the stack, but the stack is otherwise + * empty. Interrupts are off. Our state is sane with the following + * exceptions: + * + * - CR0.TS is set. "TS" literally means "task switched". + * - EFLAGS.NT is set because we're a "nested task". + * - The doublefault TSS has back_link set and has been marked busy. + * - TR points to the doublefault TSS and the normal TSS is busy. + * - CR3 is the normal kernel PGD. This would be delightful, except + * that the CPU didn't bother to save the old CR3 anywhere. This + * would make it very awkward to return back to the context we came + * from. + * + * The rest of EFLAGS is sanitized for us, so we don't need to + * worry about AC or DF. + * + * Don't even bother popping the error code. It's always zero, + * and ignoring it makes us a bit more robust against buggy + * hypervisor task gate implementations. + * + * We will manually undo the task switch instead of doing a + * task-switching IRET. + */ + + clts /* clear CR0.TS */ + pushl $X86_EFLAGS_FIXED + popfl /* clear EFLAGS.NT */ + + call doublefault_shim + + /* We don't support returning, so we have no IRET here. */ +1: + hlt + jmp 1b SYM_CODE_END(double_fault) +#endif + /* * NMI is doubly nasty. 
It can happen on the first instruction of * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index b25e633033c3..ffa0dc8a535e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); +#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); +#endif dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 4eecfe4825ed..3793646f0fb5 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -9,42 +9,83 @@ #include #include #include +#include +extern void double_fault(void); #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) -static void doublefault_fn(void) -{ - struct desc_ptr gdt_desc = {0, 0}; - unsigned long gdt, tss; +#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) - BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); +static void set_df_gdt_entry(unsigned int cpu); - native_store_gdt(&gdt_desc); - gdt = gdt_desc.address; +/* + * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks + * we're running the doublefault task. Cannot return. + */ +asmlinkage notrace void __noreturn doublefault_shim(void) +{ + unsigned long cr2; + struct pt_regs regs; - printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); - if (ptr_ok(gdt)) { - gdt += GDT_ENTRY_TSS << 3; - tss = get_desc_base((struct desc_struct *)gdt); - printk(KERN_EMERG "double fault, tss at %08lx\n", tss); + cr2 = native_read_cr2(); - if (ptr_ok(tss)) { - struct x86_hw_tss *t = (struct x86_hw_tss *)tss; + /* Reset back to the normal kernel task. */ + force_reload_TR(); + set_df_gdt_entry(smp_processor_id()); - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", - t->ip, t->sp); + trace_hardirqs_off(); - printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->ax, t->bx, t->cx, t->dx); - printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->si, t->di); - } - } + /* + * Fill in pt_regs. A downside of doing this in C is that the unwinder + * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump + * won't successfully unwind to the source of the double fault. + * The main dump from do_double_fault() is fine, though, since it + * uses these regs directly. + * + * If anyone ever cares, this could be moved to asm. + */ + regs.ss = TSS(ss); + regs.__ssh = 0; + regs.sp = TSS(sp); + regs.flags = TSS(flags); + regs.cs = TSS(cs); + /* We won't go through the entry asm, so we can leave __csh as 0. 
*/ + regs.__csh = 0; + regs.ip = TSS(ip); + regs.orig_ax = 0; + regs.gs = TSS(gs); + regs.__gsh = 0; + regs.fs = TSS(fs); + regs.__fsh = 0; + regs.es = TSS(es); + regs.__esh = 0; + regs.ds = TSS(ds); + regs.__dsh = 0; + regs.ax = TSS(ax); + regs.bp = TSS(bp); + regs.di = TSS(di); + regs.si = TSS(si); + regs.dx = TSS(dx); + regs.cx = TSS(cx); + regs.bx = TSS(bx); + + do_double_fault(®s, 0, cr2); - for (;;) - cpu_relax(); + /* + * x86_32 does not save the original CR3 anywhere on a task switch. + * This means that, even if we wanted to return, we would need to find + * some way to reconstruct CR3. We could make a credible guess based + * on cpu_tlbstate, but that would be racy and would not account for + * PTI. + * + * Instead, don't bother. We can return through + * rewind_stack_do_exit() instead. + */ + panic("cannot return from double fault\n"); } +NOKPROBE_SYMBOL(doublefault_shim); DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .tss = { @@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, + .ip = (unsigned long) double_fault, + .flags = X86_EFLAGS_FIXED, .es = __USER_DS, .cs = __KERNEL_CS, .ss = __KERNEL_DS, @@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { }, }; +static void set_df_gdt_entry(unsigned int cpu) +{ + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} + void doublefault_init_cpu_tss(void) { unsigned int cpu = smp_processor_id(); @@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void) (unsigned long)&cea->doublefault_stack.stack + sizeof(doublefault_stack.stack)); - /* Set up doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, - &get_cpu_entry_area(cpu)->doublefault_stack.tss); - + set_df_gdt_entry(cpu); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 76381b04dc93..a9b16c3a933d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ +#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) +/* + * Runs on an IST stack for x86_64 and on a special task stack for x86_32. + * + * On x86_64, this is more or less a normal kernel entry. Notwithstanding the + * SDM's warnings about double faults being unrecoverable, returning works as + * expected. Presumably what the SDM actually means is that the CPU may get + * the register state wrong on entry, so returning could be a bad idea. + * + * Various CPU engineers have promised that double faults due to an IRET fault + * while the stack is read-only are, in fact, recoverable. + * + * On x86_32, this is entered through a task gate, and regs are synthesized + * from the TSS. Returning is, in principle, okay, but changes to regs will + * be lost. If, for some reason, we need to return to a context with modified + * regs, the shim code could be adjusted to synchronize the registers. 
+ */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign { static const char str[] = "double fault"; -- cgit From 0337b7ebfcb8efb4ea0a9f2b2f284217a1c0e62d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 25 Nov 2019 22:37:44 -0800 Subject: x86/traps: die() instead of panicking on a double fault A double fault has a decent chance of being recoverable by killing the offending thread. Use die() so that we at least try to recover. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a9b16c3a933d..05da6b5b167b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -427,7 +427,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); - show_regs(regs); + die("double fault", regs, error_code); panic("Machine halted."); } #endif -- cgit From 8e05f1b4f27d07a0f93e7c6fd28525a5d082b85c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 15 Jul 2019 10:08:48 -0700 Subject: x86/ptrace: Remove set_segment_reg() implementations for current set_segment_reg() should be unreachable with task == current. Rather than confusingly trying to make it work, just explicitly disable this case. (regset->get is used for current in the coredump code, but the ->set interface is only used for ptrace, and you can't ptrace yourself.) Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 066e5b01a7e0..3b3b16932589 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -182,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { + if (WARN_ON_ONCE(task == current)) + return -EIO; + /* * The value argument was already truncated to 16 bits. */ @@ -209,10 +212,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - if (task == current) - set_user_gs(task_pt_regs(task), value); - else - task_user_gs(task) = value; + task_user_gs(task) = value; } return 0; @@ -272,6 +272,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { + if (WARN_ON_ONCE(task == current)) + return -EIO; + /* * The value argument was already truncated to 16 bits. 
*/ @@ -281,23 +284,15 @@ static int set_segment_reg(struct task_struct *task, switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; - if (task == current) - loadsegment(fs, task->thread.fsindex); break; case offsetof(struct user_regs_struct,gs): task->thread.gsindex = value; - if (task == current) - load_gs_index(task->thread.gsindex); break; case offsetof(struct user_regs_struct,ds): task->thread.ds = value; - if (task == current) - loadsegment(ds, task->thread.ds); break; case offsetof(struct user_regs_struct,es): task->thread.es = value; - if (task == current) - loadsegment(es, task->thread.es); break; /* -- cgit From 56f2ab41b652251f336a0f471b1033afeaedd161 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 17 Jul 2019 06:44:16 -0700 Subject: x86/ptrace: Document FSBASE and GSBASE ABI oddities Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3b3b16932589..f0e1ddbc2fd7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -281,6 +281,20 @@ static int set_segment_reg(struct task_struct *task, if (invalid_selector(value)) return -EIO; + /* + * This function has some ABI oddities. + * + * A 32-bit ptracer probably expects that writing FS or GS will change + * FSBASE or GSBASE respectively. In the absence of FSGSBASE support, + * this code indeed has that effect. When FSGSBASE is added, this + * will require a special case. + * + * For existing 64-bit ptracers, writing FS or GS *also* currently + * changes the base if the selector is nonzero the next time the task + * is run. This behavior may not be needed, and trying to preserve it + * when FSGSBASE is added would be complicated at best. + */ + switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; @@ -370,6 +384,9 @@ static int putreg(struct task_struct *child, * When changing the FS base, use do_arch_prctl_64() * to set the index to zero and to set the base * as requested. + * + * NB: This behavior is nonsensical and likely needs to + * change when FSGSBASE support is added. 
*/ if (child->thread.fsbase != value) return do_arch_prctl_64(child, ARCH_SET_FS, value); -- cgit From 3e1b43586eae232157ca70e905cece0297f17bfd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 24 Nov 2019 17:12:25 +0100 Subject: x86/entry/32: Remove unused 'restore_all_notrace' local label Signed-off-by: Borislav Petkov Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 632432bb723d..7e0560442538 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1090,7 +1090,6 @@ SYM_FUNC_START(entry_INT80_32) restore_all: TRACE_IRQS_IRET SWITCH_TO_ENTRY_STACK -.Lrestore_all_notrace: CHECK_AND_APPLY_ESPFIX .Lrestore_nocheck: /* Switch back to user CR3 */ -- cgit From 59c4bd853abcea95eccc167a7d7fd5f1a5f47b98 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 28 Nov 2019 09:53:06 +0100 Subject: x86/fpu: Don't cache access to fpu_fpregs_owner_ctx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The state/owner of the FPU is saved to fpu_fpregs_owner_ctx by pointing to the context that is currently loaded. It never changed during the lifetime of a task - it remained stable/constant. After deferred FPU registers loading until return to userland was implemented, the content of fpu_fpregs_owner_ctx may change during preemption and must not be cached. This went unnoticed for some time and was now noticed, in particular since gcc 9 is caching that load in copy_fpstate_to_sigframe() and reusing it in the retry loop: copy_fpstate_to_sigframe() load fpu_fpregs_owner_ctx and save on stack fpregs_lock() copy_fpregs_to_sigframe() /* failed */ fpregs_unlock() *** PREEMPTION, another uses FPU, changes fpu_fpregs_owner_ctx *** fault_in_pages_writeable() /* succeed, retry */ fpregs_lock() __fpregs_load_activate() fpregs_state_valid() /* uses fpu_fpregs_owner_ctx from stack */ copy_fpregs_to_sigframe() /* succeeds, random FPU content */ This is a comparison of the assembly produced by gcc 9, without vs with this patch: | # arch/x86/kernel/fpu/signal.c:173: if (!access_ok(buf, size)) | cmpq %rdx, %rax # tmp183, _4 | jb .L190 #, |-# arch/x86/include/asm/fpu/internal.h:512: return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; |-#APP |-# 512 "arch/x86/include/asm/fpu/internal.h" 1 |- movq %gs:fpu_fpregs_owner_ctx,%rax #, pfo_ret__ |-# 0 "" 2 |-#NO_APP |- movq %rax, -88(%rbp) # pfo_ret__, %sfp … |-# arch/x86/include/asm/fpu/internal.h:512: return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; |- movq -88(%rbp), %rcx # %sfp, pfo_ret__ |- cmpq %rcx, -64(%rbp) # pfo_ret__, %sfp |+# arch/x86/include/asm/fpu/internal.h:512: return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; |+#APP |+# 512 "arch/x86/include/asm/fpu/internal.h" 1 |+ movq %gs:fpu_fpregs_owner_ctx(%rip),%rax # fpu_fpregs_owner_ctx, pfo_ret__ |+# 0 "" 2 |+# arch/x86/include/asm/fpu/internal.h:512: return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; |+#NO_APP |+ cmpq %rax, -64(%rbp) # pfo_ret__, %sfp Use this_cpu_read() instead this_cpu_read_stable() to avoid caching of fpu_fpregs_owner_ctx during preemption points. The Fixes: tag points to the commit where deferred FPU loading was added. 
Since this commit, the compiler is no longer allowed to move the load of fpu_fpregs_owner_ctx somewhere else / outside of the locked section. A task preemption will change its value and stale content will be observed. [ bp: Massage. ] Debugged-by: Austin Clements Debugged-by: David Chase Debugged-by: Ian Lance Taylor Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Rik van Riel Tested-by: Borislav Petkov Cc: Aubrey Li Cc: Austin Clements Cc: Barret Rhoden Cc: Dave Hansen Cc: David Chase Cc: "H. Peter Anvin" Cc: ian@airs.com Cc: Ingo Molnar Cc: Josh Bleecher Snyder Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20191128085306.hxfa2o3knqtu4wfn@linutronix.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=205663 --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4c95c365058a..44c48e34d799 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -509,7 +509,7 @@ static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } /* -- cgit From 7b0b8cfd261c569177d64d6e9b1800fbe412fd65 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 30 Nov 2019 16:00:53 +0100 Subject: x86/ioperm: Save an indentation level in tss_update_io_bitmap() ... for better readability. No functional changes. [ Minor edit. ] Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 52 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bd2a11ca5dd6..61e93a318983 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -377,37 +377,37 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) void tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct thread_struct *t = ¤t->thread; u16 *base = &tss->x86_tss.io_bitmap_base; - if (test_thread_flag(TIF_IO_BITMAP)) { - struct thread_struct *t = ¤t->thread; - - if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { - *base = IO_BITMAP_OFFSET_VALID_ALL; - } else { - struct io_bitmap *iobm = t->io_bitmap; - /* - * Only copy bitmap data when the sequence number - * differs. The update time is accounted to the - * incoming task. - */ - if (tss->io_bitmap.prev_sequence != iobm->sequence) - tss_copy_io_bitmap(tss, iobm); - - /* Enable the bitmap */ - *base = IO_BITMAP_OFFSET_VALID_MAP; - } + if (!test_thread_flag(TIF_IO_BITMAP)) { + tss_invalidate_io_bitmap(tss); + return; + } + + if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; + /* - * Make sure that the TSS limit is covering the io bitmap. - * It might have been cut down by a VMEXIT to 0x67 which - * would cause a subsequent I/O access from user space to - * trigger a #GP because tbe bitmap is outside the TSS - * limit. 
+ * Only copy bitmap data when the sequence number differs. The + * update time is accounted to the incoming task. */ - refresh_tss_limit(); - } else { - tss_invalidate_io_bitmap(tss); + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; } + + /* + * Make sure that the TSS limit is covering the IO bitmap. It might have + * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O + * access from user space to trigger a #GP because tbe bitmap is outside + * the TSS limit. + */ + refresh_tss_limit(); } #else /* CONFIG_X86_IOPL_IOPERM */ static inline void switch_to_bitmap(unsigned long tifp) { } -- cgit From 91298f1a302dad0f0f630413c812818636faa8a0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 1 Dec 2019 15:49:47 +0100 Subject: x86/mm/pat: Fix off-by-one bugs in interval tree search There's a bug in the new PAT code, the conversion of memtype_check_conflict() is buggy: 8d04a5f97a5f: ("x86/mm/pat: Convert the PAT tree to a generic interval tree") dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); found_type = match->type; - node = rb_next(&match->rb); - while (node) { - match = rb_entry(node, struct memtype, rb); - - if (match->start >= end) /* Checked all possible matches */ - goto success; - - if (is_node_overlap(match, start, end) && - match->type != found_type) { + match = memtype_interval_iter_next(match, start, end); + while (match) { + if (match->type != found_type) goto failure; - } - node = rb_next(&match->rb); + match = memtype_interval_iter_next(match, start, end); } Note how the '>= end' condition to end the interval check, got converted into: + match = memtype_interval_iter_next(match, start, end); This is subtly off by one, because the interval trees interfaces require closed interval parameters: include/linux/interval_tree_generic.h /* \ * Iterate over intervals intersecting [start;last] \ * \ * Note that a node's interval intersects [start;last] iff: \ * Cond1: ITSTART(node) <= last \ * and \ * Cond2: start <= ITLAST(node) \ */ \ ... if (ITSTART(node) <= last) { /* Cond1 */ \ if (start <= ITLAST(node)) /* Cond2 */ \ return node; /* node is leftmost match */ \ [start;last] is a closed interval (note that '<= last' check) - while the PAT 'end' parameter is 1 byte beyond the end of the range, because ioremap() and the other mapping APIs usually use the [start,end) half-open interval, derived from 'size'. This is what ioremap() does for example: /* * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; phys_addr &= PHYSICAL_PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, pcm, &new_pcm); phys_addr+size will be on a page boundary, after the last byte of the mapped interval. So the correct parameter to use in the interval tree searches is not 'end' but 'end-1'. This could have relevance if conflicting PAT ranges are exactly adjacent, for example a future WC region is followed immediately by an already mapped UC- region - in this case memtype_check_conflict() would incorrectly deny the WC memtype region and downgrade the memtype to UC-. 
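To make the off-by-one concrete, here is that adjacent-ranges case spelled out with made-up addresses (purely illustrative; it assumes the tree's node endpoints are taken straight from memtype->start and memtype->end):

	existing UC- entry:  start = 0x2000, end = 0x3000   (covers bytes 0x2000..0x2fff)
	new WC request:      start = 0x1000, end = 0x2000   (covers bytes 0x1000..0x1fff)

	buggy check:  memtype_interval_iter_first(&memtype_rbroot, 0x1000, 0x2000)
	              Cond1: 0x2000 <= 0x2000 and Cond2: 0x1000 <= 0x3000 both hold,
	              so the UC- entry is reported as overlapping and the WC request
	              gets downgraded.

	fixed check:  memtype_interval_iter_first(&memtype_rbroot, 0x1000, 0x2000-1)
	              Cond1: 0x2000 <= 0x1fff fails, so no false overlap is reported.

The exact addresses don't matter; the point is only that for ranges expressed as [start, end), the closed-interval 'last' argument has to be end-1.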
BTW., rather annoyingly this downgrading is done silently in memtype_check_insert(): int memtype_check_insert(struct memtype *new, enum page_cache_mode *ret_type) { int err = 0; err = memtype_check_conflict(new->start, new->end, new->type, ret_type); if (err) return err; if (ret_type) new->type = *ret_type; memtype_interval_insert(new, &memtype_rbroot); return 0; } So on such a conflict we'd just silently get UC- in *ret_type, and write it into the new region, never the wiser ... So assuming that the patch below fixes the primary bug the diagnostics side of ioremap() cache attribute downgrades would be another thing to fix. Anyway, I checked all the interval-tree iterations, and most of them are off by one - but I think the one related to memtype_check_conflict() is the one causing this particular performance regression. The only correct interval-tree searches were these two: arch/x86/mm/pat_interval.c: match = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX); arch/x86/mm/pat_interval.c: match = memtype_interval_iter_next(match, 0, ULONG_MAX); The ULONG_MAX was hiding the off-by-one in plain sight. :-) Note that the bug was probably benign in the sense of implementing a too strict cache attribute conflict policy and downgrading cache attributes, so AFAICS the worst outcome of this bug would be a performance regression, not any instabilities. Reported-by: kernel test robot Reported-by: Kenneth R. Crudup Reported-by: Mariusz Ceier Tested-by: Mariusz Ceier Tested-by: Kenneth R. Crudup Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191201144947.GA4167@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat_interval.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat_interval.c b/arch/x86/mm/pat_interval.c index 47a1bf30748f..6855362eaf21 100644 --- a/arch/x86/mm/pat_interval.c +++ b/arch/x86/mm/pat_interval.c @@ -56,7 +56,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type) { struct memtype *match; - match = memtype_interval_iter_first(&memtype_rbroot, start, end); + match = memtype_interval_iter_first(&memtype_rbroot, start, end-1); while (match != NULL && match->start < end) { if ((match_type == MEMTYPE_EXACT_MATCH) && (match->start == start) && (match->end == end)) @@ -66,7 +66,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type) (match->start < start) && (match->end == end)) return match; - match = memtype_interval_iter_next(match, start, end); + match = memtype_interval_iter_next(match, start, end-1); } return NULL; /* Returns NULL if there is no match */ @@ -79,7 +79,7 @@ static int memtype_check_conflict(u64 start, u64 end, struct memtype *match; enum page_cache_mode found_type = reqtype; - match = memtype_interval_iter_first(&memtype_rbroot, start, end); + match = memtype_interval_iter_first(&memtype_rbroot, start, end-1); if (match == NULL) goto success; @@ -89,12 +89,12 @@ static int memtype_check_conflict(u64 start, u64 end, dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); found_type = match->type; - match = memtype_interval_iter_next(match, start, end); + match = memtype_interval_iter_next(match, start, end-1); while (match) { if (match->type != found_type) goto failure; - match = memtype_interval_iter_next(match, start, end); + match = memtype_interval_iter_next(match, start, end-1); } 
success: if (newtype) @@ -160,7 +160,7 @@ struct memtype *memtype_erase(u64 start, u64 end) struct memtype *memtype_lookup(u64 addr) { return memtype_interval_iter_first(&memtype_rbroot, addr, - addr + PAGE_SIZE); + addr + PAGE_SIZE-1); } #if defined(CONFIG_DEBUG_FS) -- cgit