diff options
Diffstat (limited to 'arch/arm/mm/fault.c')
| -rw-r--r-- | arch/arm/mm/fault.c | 455 |
1 files changed, 260 insertions, 195 deletions
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c97f7940cb95..2bc828a1940c 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -1,14 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/arch/arm/mm/fault.c * * Copyright (C) 1995 Linus Torvalds * Modifications for ARM processor (c) 1995-2004 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ -#include <linux/module.h> +#include <linux/extable.h> #include <linux/signal.h> #include <linux/mm.h> #include <linux/hardirq.h> @@ -16,12 +13,12 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/page-flags.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> #include <linux/highmem.h> #include <linux/perf_event.h> +#include <linux/kfence.h> -#include <asm/exception.h> -#include <asm/pgtable.h> #include <asm/system_misc.h> #include <asm/system_info.h> #include <asm/tlbflush.h> @@ -30,78 +27,63 @@ #ifdef CONFIG_MMU -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr) +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { - int ret = 0; - - if (!user_mode(regs)) { - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, fsr)) - ret = 1; - preempt_enable(); - } + unsigned long addr = (unsigned long)unsafe_src; - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr) -{ - return 0; + return addr >= TASK_SIZE && ULONG_MAX - addr >= size; } -#endif /* * This is useful to dump out the page tables associated with * 'addr' in mm 'mm'. */ -void show_pte(struct mm_struct *mm, unsigned long addr) +void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; if (!mm) mm = &init_mm; - printk(KERN_ALERT "pgd = %p\n", mm->pgd); pgd = pgd_offset(mm, addr); - printk(KERN_ALERT "[%08lx] *pgd=%08llx", - addr, (long long)pgd_val(*pgd)); + printk("%s[%08lx] *pgd=%08llx", lvl, addr, (long long)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) break; - if (pgd_bad(*pgd)) { - printk("(bad)"); + if (p4d_bad(*p4d)) { + pr_cont("(bad)"); break; } - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) - printk(", *pud=%08llx", (long long)pud_val(*pud)); + pr_cont(", *pud=%08llx", (long long)pud_val(*pud)); if (pud_none(*pud)) break; if (pud_bad(*pud)) { - printk("(bad)"); + pr_cont("(bad)"); break; } pmd = pmd_offset(pud, addr); if (PTRS_PER_PMD != 1) - printk(", *pmd=%08llx", (long long)pmd_val(*pmd)); + pr_cont(", *pmd=%08llx", (long long)pmd_val(*pmd)); if (pmd_none(*pmd)) break; if (pmd_bad(*pmd)) { - printk("(bad)"); + pr_cont("(bad)"); break; } @@ -110,21 +92,57 @@ void show_pte(struct mm_struct *mm, unsigned long addr) break; pte = pte_offset_map(pmd, addr); - printk(", *pte=%08llx", (long long)pte_val(*pte)); + if (!pte) + break; + + pr_cont(", *pte=%08llx", (long long)pte_val(*pte)); #ifndef CONFIG_ARM_LPAE - printk(", *ppte=%08llx", + pr_cont(", *ppte=%08llx", (long long)pte_val(pte[PTE_HWTABLE_PTRS])); #endif pte_unmap(pte); } while(0); - printk("\n"); + pr_cont("\n"); } #else /* CONFIG_MMU */ -void show_pte(struct mm_struct *mm, unsigned long addr) +void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) { } #endif /* CONFIG_MMU */ +static inline bool is_write_fault(unsigned int fsr) +{ + return (fsr & FSR_WRITE) && !(fsr & FSR_CM); +} + +static inline bool is_translation_fault(unsigned int fsr) +{ + int fs = fsr_fs(fsr); +#ifdef CONFIG_ARM_LPAE + if ((fs & FS_MMU_NOLL_MASK) == FS_TRANS_NOLL) + return true; +#else + if (fs == FS_L1_TRANS || fs == FS_L2_TRANS) + return true; +#endif + return false; +} + +static void die_kernel_fault(const char *msg, struct mm_struct *mm, + unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +{ + bust_spinlocks(1); + pr_alert("8<--- cut here ---\n"); + pr_alert("Unable to handle kernel %s at virtual address %08lx when %s\n", + msg, addr, fsr & FSR_LNX_PF ? "execute" : str_write_read(fsr & FSR_WRITE)); + + show_pte(KERN_ALERT, mm, addr); + die("Oops", regs, fsr); + bust_spinlocks(0); + make_task_dead(SIGKILL); +} + /* * Oops. The kernel tried to access some page that wasn't present. */ @@ -132,6 +150,7 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + const char *msg; /* * Are we prepared to handle this kernel fault? */ @@ -141,16 +160,17 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, /* * No handler, we'll have to terminate things with extreme prejudice. */ - bust_spinlocks(1); - printk(KERN_ALERT - "Unable to handle kernel %s at virtual address %08lx\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); + if (addr < PAGE_SIZE) { + msg = "NULL pointer dereference"; + } else { + if (is_translation_fault(fsr) && + kfence_handle_page_fault(addr, is_write_fault(fsr), regs)) + return; - show_pte(mm, addr); - die("Oops", regs, fsr); - bust_spinlocks(0); - do_exit(SIGKILL); + msg = "paging request"; + } + + die_kernel_fault(msg, mm, addr, fsr, regs); } /* @@ -158,30 +178,35 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, * User mode accesses just cause a SIGSEGV */ static void -__do_user_fault(struct task_struct *tsk, unsigned long addr, - unsigned int fsr, unsigned int sig, int code, - struct pt_regs *regs) +__do_user_fault(unsigned long addr, unsigned int fsr, unsigned int sig, + int code, struct pt_regs *regs) { - struct siginfo si; + struct task_struct *tsk = current; + + if (addr > TASK_SIZE) + harden_branch_predictor(); #ifdef CONFIG_DEBUG_USER if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) || ((user_debug & UDBG_BUS) && (sig == SIGBUS))) { - printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n", + pr_err("8<--- cut here ---\n"); + pr_err("%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n", tsk->comm, sig, addr, fsr); - show_pte(tsk->mm, addr); + show_pte(KERN_ERR, tsk->mm, addr); show_regs(regs); } #endif +#ifndef CONFIG_KUSER_HELPERS + if ((sig == SIGSEGV) && ((addr & PAGE_MASK) == 0xffff0000)) + printk_ratelimited(KERN_DEBUG + "%s: CONFIG_KUSER_HELPERS disabled at 0x%08lx\n", + tsk->comm, addr); +#endif tsk->thread.address = addr; tsk->thread.error_code = fsr; tsk->thread.trap_no = 14; - si.si_signo = sig; - si.si_errno = 0; - si.si_code = code; - si.si_addr = (void __user *)addr; - force_sig_info(sig, &si, tsk); + force_sig_fault(sig, code, (void __user *)addr); } void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) @@ -194,82 +219,59 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * have no context to handle this fault with. */ if (user_mode(regs)) - __do_user_fault(tsk, addr, fsr, SIGSEGV, SEGV_MAPERR, regs); + __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); else __do_kernel_fault(mm, addr, fsr, regs); } #ifdef CONFIG_MMU -#define VM_FAULT_BADMAP 0x010000 -#define VM_FAULT_BADACCESS 0x020000 - -/* - * Check that the permissions on the VMA allow for the fault which occurred. - * If we encountered a write fault, we must have write permission, otherwise - * we allow any permission. - */ -static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) +static inline bool is_permission_fault(unsigned int fsr) { - unsigned int mask = VM_READ | VM_WRITE | VM_EXEC; - - if (fsr & FSR_WRITE) - mask = VM_WRITE; - if (fsr & FSR_LNX_PF) - mask = VM_EXEC; - - return vma->vm_flags & mask ? false : true; + int fs = fsr_fs(fsr); +#ifdef CONFIG_ARM_LPAE + if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) + return true; +#else + if (fs == FS_L1_PERM || fs == FS_L2_PERM) + return true; +#endif + return false; } -static int __kprobes -__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, - unsigned int flags, struct task_struct *tsk) +#ifdef CONFIG_CPU_TTBR0_PAN +static inline bool ttbr0_usermode_access_allowed(struct pt_regs *regs) { - struct vm_area_struct *vma; - int fault; - - vma = find_vma(mm, addr); - fault = VM_FAULT_BADMAP; - if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > addr)) - goto check_stack; + struct svc_pt_regs *svcregs; - /* - * Ok, we have a good vm_area for this - * memory access, so we can handle it. - */ -good_area: - if (access_error(fsr, vma)) { - fault = VM_FAULT_BADACCESS; - goto out; - } + /* If we are in user mode: permission granted */ + if (user_mode(regs)) + return true; - return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); + /* uaccess state saved above pt_regs on SVC exception entry */ + svcregs = to_svc_pt_regs(regs); -check_stack: - /* Don't allow expansion below FIRST_USER_ADDRESS */ - if (vma->vm_flags & VM_GROWSDOWN && - addr >= FIRST_USER_ADDRESS && !expand_stack(vma, addr)) - goto good_area; -out: - return fault; + return !(svcregs->ttbcr & TTBCR_EPD0); } +#else +static inline bool ttbr0_usermode_access_allowed(struct pt_regs *regs) +{ + return true; +} +#endif static int __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { - struct task_struct *tsk; - struct mm_struct *mm; - int fault, sig, code; - int write = fsr & FSR_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); - - if (notify_page_fault(regs, fsr)) + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int sig, code; + vm_fault_t fault; + unsigned int flags = FAULT_FLAG_DEFAULT; + vm_flags_t vm_flags = VM_ACCESS_FLAGS; + + if (kprobe_page_fault(regs, fsr)) return 0; - tsk = current; - mm = tsk->mm; /* Enable interrupts if they were enabled in the parent context. */ if (interrupts_enabled(regs)) @@ -279,75 +281,126 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (faulthandler_disabled() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + + if (is_write_fault(fsr)) { + flags |= FAULT_FLAG_WRITE; + vm_flags = VM_WRITE; + } + + if (fsr & FSR_LNX_PF) { + vm_flags = VM_EXEC; + + if (is_permission_fault(fsr) && !user_mode(regs)) + die_kernel_fault("execution of memory", + mm, addr, fsr, regs); + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. + * Privileged access aborts with CONFIG_CPU_TTBR0_PAN enabled are + * routed via the translation fault mechanism. Check whether uaccess + * is disabled while in kernel mode. */ - if (!down_read_trylock(&mm->mmap_sem)) { - if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) + if (!ttbr0_usermode_access_allowed(regs)) + goto no_context; + + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, addr); + if (!vma) + goto lock_mmap; + + if (!(vma->vm_flags & vm_flags)) { + vma_end_read(vma); + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + fault = 0; + code = SEGV_ACCERR; + goto bad_area; + } + fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) goto no_context; + return 0; + } +lock_mmap: + retry: - down_read(&mm->mmap_sem); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case, we'll have missed the might_sleep() from - * down_read() - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && - !search_exception_tables(regs->ARM_pc)) - goto no_context; -#endif + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + fault = 0; + code = SEGV_MAPERR; + goto bad_area; } - fault = __do_page_fault(mm, addr, fsr, flags, tsk); + /* + * ok, we have a good vm_area for this memory access, check the + * permissions on the VMA allow for the fault which occurred. + */ + if (!(vma->vm_flags & vm_flags)) { + mmap_read_unlock(mm); + fault = 0; + code = SEGV_ACCERR; + goto bad_area; + } + + fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); /* If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_sem because + * signal first. We do not need to release the mmap_lock because * it would already be released in __lock_page_or_retry in * mm/filemap.c. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + goto no_context; return 0; + } - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, addr); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, addr); - } + if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); +done: + /* Handle the "normal" case first */ + if (likely(!(fault & VM_FAULT_ERROR))) + return 0; + + code = SEGV_MAPERR; +bad_area: /* - * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR + * If we are in kernel mode at this point, we + * have no context to handle this fault with. */ - if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) - return 0; + if (!user_mode(regs)) + goto no_context; if (fault & VM_FAULT_OOM) { /* @@ -359,13 +412,6 @@ retry: return 0; } - /* - * If we are in kernel mode at this point, we - * have no context to handle this fault with. - */ - if (!user_mode(regs)) - goto no_context; - if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to @@ -379,11 +425,9 @@ retry: * isn't in our memory map.. */ sig = SIGSEGV; - code = fault == VM_FAULT_BADACCESS ? - SEGV_ACCERR : SEGV_MAPERR; } - __do_user_fault(tsk, addr, fsr, sig, code, regs); + __do_user_fault(addr, fsr, sig, code, regs); return 0; no_context: @@ -422,6 +466,7 @@ do_translation_fault(unsigned long addr, unsigned int fsr, { unsigned int index; pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -436,13 +481,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, pgd = cpu_get_pgd() + index; pgd_k = init_mm.pgd + index; - if (pgd_none(*pgd_k)) + p4d = p4d_offset(pgd, addr); + p4d_k = p4d_offset(pgd_k, addr); + + if (p4d_none(*p4d_k)) goto bad_area; - if (!pgd_present(*pgd)) - set_pgd(pgd, *pgd_k); + if (!p4d_present(*p4d)) + set_p4d(p4d, *p4d_k); - pud = pud_offset(pgd, addr); - pud_k = pud_offset(pgd_k, addr); + pud = pud_offset(p4d, addr); + pud_k = pud_offset(p4d_k, addr); if (pud_none(*pud_k)) goto bad_area; @@ -539,23 +587,21 @@ hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *) /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void __exception +asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + fsr_fs(fsr); - struct siginfo info; if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs)) return; - printk(KERN_ALERT "Unhandled fault: %s (0x%03x) at 0x%08lx\n", + pr_alert("8<--- cut here ---\n"); + pr_alert("Unhandled fault: %s (0x%03x) at 0x%08lx\n", inf->name, fsr, addr); + show_pte(KERN_ALERT, current->mm, addr); - info.si_signo = inf->sig; - info.si_errno = 0; - info.si_code = inf->code; - info.si_addr = (void __user *)addr; - arm_notify_die("", regs, &info, fsr, 0); + arm_notify_die("", regs, inf->sig, inf->code, (void __user *)addr, + fsr, 0); } void __init @@ -571,23 +617,42 @@ hook_ifault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs * ifsr_info[nr].name = name; } -asmlinkage void __exception +asmlinkage void do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs) { const struct fsr_info *inf = ifsr_info + fsr_fs(ifsr); - struct siginfo info; if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs)) return; - printk(KERN_ALERT "Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n", + pr_alert("8<--- cut here ---\n"); + pr_alert("Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n", inf->name, ifsr, addr); - info.si_signo = inf->sig; - info.si_errno = 0; - info.si_code = inf->code; - info.si_addr = (void __user *)addr; - arm_notify_die("", regs, &info, ifsr, 0); + arm_notify_die("", regs, inf->sig, inf->code, (void __user *)addr, + ifsr, 0); +} + +/* + * Abort handler to be used only during first unmasking of asynchronous aborts + * on the boot CPU. This makes sure that the machine will not die if the + * firmware/bootloader left an imprecise abort pending for us to trip over. + */ +static int __init early_abort_handler(unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +{ + pr_warn("Hit pending asynchronous external abort (FSR=0x%08x) during " + "first unmask, this is most likely caused by a " + "firmware/bootloader bug.\n", fsr); + + return 0; +} + +void __init early_abt_enable(void) +{ + fsr_info[FSR_FS_AEA].fn = early_abort_handler; + local_abt_enable(); + fsr_info[FSR_FS_AEA].fn = do_bad; } #ifndef CONFIG_ARM_LPAE |
