From aef0148f3606117352053c015cb33734e9ee7397 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Wed, 2 Sep 2020 22:30:56 -0400 Subject: x86/cmdline: Disable jump tables for cmdline.c When CONFIG_RETPOLINE is disabled, Clang uses a jump table for the switch statement in cmdline_find_option (jump tables are disabled when CONFIG_RETPOLINE is enabled). This function is called very early in boot from sme_enable() if CONFIG_AMD_MEM_ENCRYPT is enabled. At this time, the kernel is still executing out of the identity mapping, but the jump table will contain virtual addresses. Fix this by disabling jump tables for cmdline.c when AMD_MEM_ENCRYPT is enabled. Signed-off-by: Arvind Sankar Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200903023056.3914690-1-nivedita@alum.mit.edu --- arch/x86/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d46fff11f06f..aa067859a70b 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -24,7 +24,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_cmdline.o = -pg endif -CFLAGS_cmdline.o := -fno-stack-protector +CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables endif inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk -- cgit From 4819e15f740ec884a50bdc431d7f1e7638b6f7d9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 2 Sep 2020 17:59:04 +0200 Subject: x86/mm/32: Bring back vmalloc faulting on x86_32 One can not simply remove vmalloc faulting on x86-32. Upstream commit: 7f0a002b5a21 ("x86/mm: remove vmalloc faulting") removed it on x86 alltogether because previously the arch_sync_kernel_mappings() interface was introduced. This interface added synchronization of vmalloc/ioremap page-table updates to all page-tables in the system at creation time and was thought to make vmalloc faulting obsolete. But that assumption was incredibly naive. It turned out that there is a race window between the time the vmalloc or ioremap code establishes a mapping and the time it synchronizes this change to other page-tables in the system. During this race window another CPU or thread can establish a vmalloc mapping which uses the same intermediate page-table entries (e.g. PMD or PUD) and does no synchronization in the end, because it found all necessary mappings already present in the kernel reference page-table. But when these intermediate page-table entries are not yet synchronized, the other CPU or thread will continue with a vmalloc address that is not yet mapped in the page-table it currently uses, causing an unhandled page fault and oops like below: BUG: unable to handle page fault for address: fe80c000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page *pde = 33183067 *pte = a8648163 Oops: 0002 [#1] SMP CPU: 1 PID: 13514 Comm: cve-2017-17053 Tainted: G ... Call Trace: ldt_dup_context+0x66/0x80 dup_mm+0x2b3/0x480 copy_process+0x133b/0x15c0 _do_fork+0x94/0x3e0 __ia32_sys_clone+0x67/0x80 __do_fast_syscall_32+0x3f/0x70 do_fast_syscall_32+0x29/0x60 do_SYSENTER_32+0x15/0x20 entry_SYSENTER_32+0x9f/0xf2 EIP: 0xb7eef549 So the arch_sync_kernel_mappings() interface is racy, but removing it would mean to re-introduce the vmalloc_sync_all() interface, which is even more awful. Keep arch_sync_kernel_mappings() in place and catch the race condition in the page-fault handler instead. Do a partial revert of above commit to get vmalloc faulting on x86-32 back in place. Fixes: 7f0a002b5a21 ("x86/mm: remove vmalloc faulting") Reported-by: Naresh Kamboju Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200902155904.17544-1-joro@8bytes.org --- arch/x86/mm/fault.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 35f1498e9832..6e3e8a124903 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -190,6 +190,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } +/* + * Handle a fault on the vmalloc or module mapping area + * + * This is needed because there is a race condition between the time + * when the vmalloc mapping code updates the PMD to the point in time + * where it synchronizes this update with the other page-tables in the + * system. + * + * In this race window another thread/CPU can map an area on the same + * PMD, finds it already present and does not synchronize it with the + * rest of the system yet. As a result v[mz]alloc might return areas + * which are not mapped in every page-table in the system, causing an + * unhandled page-fault when they are accessed. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3_pa(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + if (pmd_large(*pmd_k)) + return 0; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} +NOKPROBE_SYMBOL(vmalloc_fault); + void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -1110,6 +1157,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); +#ifdef CONFIG_X86_32 + /* + * We can fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * Before doing this on-demand faulting, ensure that the + * fault is not any of the following: + * 1. A fault on a PTE with a reserved bit set. + * 2. A fault caused by a user-mode access. (Do not demand- + * fault kernel memory due to user-mode accesses). + * 3. A fault caused by a page-level protection violation. + * (A demand fault would be on a non-present page which + * would have X86_PF_PROT==0). + * + * This is only needed to close a race condition on x86-32 in + * the vmalloc mapping/unmapping code. See the comment above + * vmalloc_fault() for details. On x86-64 the race does not + * exist as the vmalloc mappings don't need to be synchronized + * there. + */ + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + } +#endif + /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; -- cgit From ccae0f36d500aef727f98acd8d0601e6b262a513 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 4 Sep 2020 14:10:47 +0800 Subject: x86, fakenuma: Fix invalid starting node ID Commit: cc9aec03e58f ("x86/numa_emulation: Introduce uniform split capability") uses "-1" as the starting node ID, which causes the strange kernel log as follows, when "numa=fake=32G" is added to the kernel command line: Faking node -1 at [mem 0x0000000000000000-0x0000000893ffffff] (35136MB) Faking node 0 at [mem 0x0000001840000000-0x000000203fffffff] (32768MB) Faking node 1 at [mem 0x0000000894000000-0x000000183fffffff] (64192MB) Faking node 2 at [mem 0x0000002040000000-0x000000283fffffff] (32768MB) Faking node 3 at [mem 0x0000002840000000-0x000000303fffffff] (32768MB) And finally the kernel crashes: BUG: Bad page state in process swapper pfn:00011 page:(____ptrval____) refcount:0 mapcount:1 mapping:(____ptrval____) index:0x55cd7e44b270 pfn:0x11 failed to read mapping contents, not a valid kernel address? flags: 0x5(locked|uptodate) raw: 0000000000000005 000055cd7e44af30 000055cd7e44af50 0000000100000006 raw: 000055cd7e44b270 000055cd7e44b290 0000000000000000 000055cd7e44b510 page dumped because: page still charged to cgroup page->mem_cgroup:000055cd7e44b510 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.9.0-rc2 #1 Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 Call Trace: dump_stack+0x57/0x80 bad_page.cold+0x63/0x94 __free_pages_ok+0x33f/0x360 memblock_free_all+0x127/0x195 mem_init+0x23/0x1f5 start_kernel+0x219/0x4f5 secondary_startup_64+0xb6/0xc0 Fix this bug via using 0 as the starting node ID. This restores the original behavior before cc9aec03e58f. [ mingo: Massaged the changelog. ] Fixes: cc9aec03e58f ("x86/numa_emulation: Introduce uniform split capability") Signed-off-by: "Huang, Ying" Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200904061047.612950-1-ying.huang@intel.com --- arch/x86/mm/numa_emulation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index c5174b4e318b..683cd12f4793 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -321,7 +321,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, u64 addr, u64 max_addr, u64 size) { return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, - 0, NULL, NUMA_NO_NODE); + 0, NULL, 0); } static int __init setup_emu2phys_nid(int *dfl_phys_nid) -- cgit From 2356bb4b8221d7dc8c7beb810418122ed90254c9 Mon Sep 17 00:00:00 2001 From: Vamshi K Sthambamkadi Date: Fri, 28 Aug 2020 17:02:46 +0530 Subject: tracing/kprobes, x86/ptrace: Fix regs argument order for i386 On i386, the order of parameters passed on regs is eax,edx,and ecx (as per regparm(3) calling conventions). Change the mapping in regs_get_kernel_argument(), so that arg1=ax arg2=dx, and arg3=cx. Running the selftests testcase kprobes_args_use.tc shows the result as passed. Fixes: 3c88ee194c28 ("x86: ptrace: Add function argument access API") Signed-off-by: Vamshi K Sthambamkadi Signed-off-by: Borislav Petkov Acked-by: Masami Hiramatsu Acked-by: Peter Zijlstra (Intel) Cc: Link: https://lkml.kernel.org/r/20200828113242.GA1424@cosmos --- arch/x86/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 40aa69d04862..d8324a236696 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -327,8 +327,8 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, static const unsigned int argument_offs[] = { #ifdef __i386__ offsetof(struct pt_regs, ax), - offsetof(struct pt_regs, cx), offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, cx), #define NR_REG_ARGUMENTS 3 #else offsetof(struct pt_regs, di), -- cgit From 662a0221893a3d58aa72719671844264306f6e4b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2020 15:25:50 +0200 Subject: x86/entry: Fix AC assertion The WARN added in commit 3c73b81a9164 ("x86/entry, selftests: Further improve user entry sanity checks") unconditionally triggers on a IVB machine because it does not support SMAP. For !SMAP hardware the CLAC/STAC instructions are patched out and thus if userspace sets AC, it is still have set after entry. Fixes: 3c73b81a9164 ("x86/entry, selftests: Further improve user entry sanity checks") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Daniel Thompson Acked-by: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200902133200.666781610@infradead.org --- arch/x86/include/asm/entry-common.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index a8f9315b9eae..6fe54b2813c1 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -18,8 +18,16 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) * state, not the interrupt state as imagined by Xen. */ unsigned long flags = native_save_fl(); - WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | - X86_EFLAGS_NT)); + unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT; + + /* + * For !SMAP hardware we patch out CLAC on entry. + */ + if (boot_cpu_has(X86_FEATURE_SMAP) || + (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + mask |= X86_EFLAGS_AC; + + WARN_ON_ONCE(flags & mask); /* We think we came from user mode. Make sure pt_regs agrees. */ WARN_ON_ONCE(!user_mode(regs)); -- cgit From d5c678aed5eddb944b8e7ce451b107b39245962d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 2 Sep 2020 15:25:51 +0200 Subject: x86/debug: Allow a single level of #DB recursion Trying to clear DR7 around a #DB from usermode malfunctions if the tasks schedules when delivering SIGTRAP. Rather than trying to define a special no-recursion region, just allow a single level of recursion. The same mechanism is used for NMI, and it hasn't caused any problems yet. Fixes: 9f58fdde95c9 ("x86/db: Split out dr6/7 handling") Reported-by: Kyle Huey Debugged-by: Josh Poimboeuf Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Daniel Thompson Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/8b9bd05f187231df008d48cf818a6a311cbd5c98.1597882384.git.luto@kernel.org Link: https://lore.kernel.org/r/20200902133200.726584153@infradead.org --- arch/x86/kernel/traps.c | 65 +++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1f66d2d1e998..81a2fb711091 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -729,20 +729,9 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) +static __always_inline unsigned long debug_read_clear_dr6(void) { - /* - * Disable breakpoints during exception handling; recursive exceptions - * are exceedingly 'fun'. - * - * Since this function is NOKPROBE, and that also applies to - * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a - * HW_BREAKPOINT_W on our stack) - * - * Entry text is excluded for HW_BP_X and cpu_entry_area, which - * includes the entry stack is excluded for everything. - */ - *dr7 = local_db_save(); + unsigned long dr6; /* * The Intel SDM says: @@ -755,15 +744,12 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) * * Keep it simple: clear DR6 immediately. */ - get_debugreg(*dr6, 6); + get_debugreg(dr6, 6); set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ - *dr6 &= ~DR6_RESERVED; -} + dr6 &= ~DR6_RESERVED; -static __always_inline void debug_exit(unsigned long dr7) -{ - local_db_restore(dr7); + return dr6; } /* @@ -863,6 +849,18 @@ out: static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { + /* + * Disable breakpoints during exception handling; recursive exceptions + * are exceedingly 'fun'. + * + * Since this function is NOKPROBE, and that also applies to + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a + * HW_BREAKPOINT_W on our stack) + * + * Entry text is excluded for HW_BP_X and cpu_entry_area, which + * includes the entry stack is excluded for everything. + */ + unsigned long dr7 = local_db_save(); bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); @@ -883,6 +881,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, instrumentation_end(); idtentry_exit_nmi(regs, irq_state); + + local_db_restore(dr7); } static __always_inline void exc_debug_user(struct pt_regs *regs, @@ -894,6 +894,15 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, */ WARN_ON_ONCE(!user_mode(regs)); + /* + * NB: We can't easily clear DR7 here because + * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * user memory, etc. This means that a recursive #DB is possible. If + * this happens, that #DB will hit exc_debug_kernel() and clear DR7. + * Since we're not on the IST stack right now, everything will be + * fine. + */ + irqentry_enter_from_user_mode(regs); instrumentation_begin(); @@ -907,36 +916,24 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - unsigned long dr6, dr7; - - debug_enter(&dr6, &dr7); - exc_debug_kernel(regs, dr6); - debug_exit(dr7); + exc_debug_kernel(regs, debug_read_clear_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - unsigned long dr6, dr7; - - debug_enter(&dr6, &dr7); - exc_debug_user(regs, dr6); - debug_exit(dr7); + exc_debug_user(regs, debug_read_clear_dr6()); } #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6, dr7; - - debug_enter(&dr6, &dr7); + unsigned long dr6 = debug_read_clear_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); - - debug_exit(dr7); } #endif -- cgit From 4facb95b7adaf77e2da73aafb9ba60996fe42a12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Sep 2020 01:50:54 +0200 Subject: x86/entry: Unbreak 32bit fast syscall Andy reported that the syscall treacing for 32bit fast syscall fails: # ./tools/testing/selftests/x86/ptrace_syscall_32 ... [RUN] SYSEMU [FAIL] Initial args are wrong (nr=224, args=10 11 12 13 14 4289172732) ... [RUN] SYSCALL [FAIL] Initial args are wrong (nr=29, args=0 0 0 0 0 4289172732) The eason is that the conversion to generic entry code moved the retrieval of the sixth argument (EBP) after the point where the syscall entry work runs, i.e. ptrace, seccomp, audit... Unbreak it by providing a split up version of syscall_enter_from_user_mode(). - syscall_enter_from_user_mode_prepare() establishes state and enables interrupts - syscall_enter_from_user_mode_work() runs the entry work Replace the call to syscall_enter_from_user_mode() in the 32bit fast syscall C-entry with the split functions and stick the EBP retrieval between them. Fixes: 27d6b4d14f5c ("x86/entry: Use generic syscall entry function") Reported-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87k0xdjbtt.fsf@nanos.tec.linutronix.de --- arch/x86/entry/common.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 48512c7944e7..2f84c7ca74ea 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -60,16 +60,10 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) { - unsigned int nr = (unsigned int)regs->orig_ax; - if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; - /* - * Subtlety here: if ptrace pokes something larger than 2^32-1 into - * orig_ax, the unsigned int return value truncates it. This may - * or may not be necessary, but it matches the old asm behavior. - */ - return (unsigned int)syscall_enter_from_user_mode(regs, nr); + + return (unsigned int)regs->orig_ax; } /* @@ -91,15 +85,29 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { unsigned int nr = syscall_32_enter(regs); + /* + * Subtlety here: if ptrace pokes something larger than 2^32-1 into + * orig_ax, the unsigned int return value truncates it. This may + * or may not be necessary, but it matches the old asm behavior. + */ + nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + do_syscall_32_irqs_on(regs, nr); syscall_exit_to_user_mode(regs); } static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + unsigned int nr = syscall_32_enter(regs); int res; + /* + * This cannot use syscall_enter_from_user_mode() as it has to + * fetch EBP before invoking any of the syscall entry work + * functions. + */ + syscall_enter_from_user_mode_prepare(regs); + instrumentation_begin(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { @@ -122,6 +130,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return false; } + /* The case truncates any ptrace induced syscall nr > 2^32 -1 */ + nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); + /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); syscall_exit_to_user_mode(regs); -- cgit