diff options
Diffstat (limited to 'arch/arm/mm/fault.c')
-rw-r--r-- | arch/arm/mm/fault.c | 271 |
1 files changed, 152 insertions, 119 deletions
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index bd0f4821f7e1..439dc6a26bb9 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -17,14 +17,21 @@ #include <linux/sched/debug.h> #include <linux/highmem.h> #include <linux/perf_event.h> +#include <linux/kfence.h> -#include <asm/pgtable.h> #include <asm/system_misc.h> #include <asm/system_info.h> #include <asm/tlbflush.h> #include "fault.h" +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) +{ + unsigned long addr = (unsigned long)unsafe_src; + + return addr >= TASK_SIZE && ULONG_MAX - addr >= size; +} + #ifdef CONFIG_MMU /* @@ -38,24 +45,25 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) if (!mm) mm = &init_mm; - printk("%spgd = %p\n", lvl, mm->pgd); pgd = pgd_offset(mm, addr); printk("%s[%08lx] *pgd=%08llx", lvl, addr, (long long)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) break; - if (pgd_bad(*pgd)) { + if (p4d_bad(*p4d)) { pr_cont("(bad)"); break; } - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) pr_cont(", *pud=%08llx", (long long)pud_val(*pud)); @@ -84,6 +92,9 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) break; pte = pte_offset_map(pmd, addr); + if (!pte) + break; + pr_cont(", *pte=%08llx", (long long)pte_val(*pte)); #ifndef CONFIG_ARM_LPAE pr_cont(", *ppte=%08llx", @@ -99,6 +110,40 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) { } #endif /* CONFIG_MMU */ +static inline bool is_write_fault(unsigned int fsr) +{ + return (fsr & FSR_WRITE) && !(fsr & FSR_CM); +} + +static inline bool is_translation_fault(unsigned int fsr) +{ + int fs = fsr_fs(fsr); +#ifdef CONFIG_ARM_LPAE + if ((fs & FS_MMU_NOLL_MASK) == FS_TRANS_NOLL) + return true; +#else + if (fs == FS_L1_TRANS || fs == FS_L2_TRANS) + return true; +#endif + return false; +} + +static void die_kernel_fault(const char *msg, struct mm_struct *mm, + unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +{ + bust_spinlocks(1); + pr_alert("8<--- cut here ---\n"); + pr_alert("Unable to handle kernel %s at virtual address %08lx when %s\n", + msg, addr, fsr & FSR_LNX_PF ? "execute" : + fsr & FSR_WRITE ? "write" : "read"); + + show_pte(KERN_ALERT, mm, addr); + die("Oops", regs, fsr); + bust_spinlocks(0); + make_task_dead(SIGKILL); +} + /* * Oops. The kernel tried to access some page that wasn't present. */ @@ -106,6 +151,7 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + const char *msg; /* * Are we prepared to handle this kernel fault? */ @@ -115,16 +161,17 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, /* * No handler, we'll have to terminate things with extreme prejudice. */ - bust_spinlocks(1); - pr_alert("8<--- cut here ---\n"); - pr_alert("Unable to handle kernel %s at virtual address %08lx\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); + if (addr < PAGE_SIZE) { + msg = "NULL pointer dereference"; + } else { + if (is_translation_fault(fsr) && + kfence_handle_page_fault(addr, is_write_fault(fsr), regs)) + return; - show_pte(KERN_ALERT, mm, addr); - die("Oops", regs, fsr); - bust_spinlocks(0); - do_exit(SIGKILL); + msg = "paging request"; + } + + die_kernel_fault(msg, mm, addr, fsr, regs); } /* @@ -179,75 +226,35 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) } #ifdef CONFIG_MMU -#define VM_FAULT_BADMAP 0x010000 -#define VM_FAULT_BADACCESS 0x020000 - -/* - * Check that the permissions on the VMA allow for the fault which occurred. - * If we encountered a write fault, we must have write permission, otherwise - * we allow any permission. - */ -static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) -{ - unsigned int mask = VM_READ | VM_WRITE | VM_EXEC; - - if ((fsr & FSR_WRITE) && !(fsr & FSR_CM)) - mask = VM_WRITE; - if (fsr & FSR_LNX_PF) - mask = VM_EXEC; +#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) - return vma->vm_flags & mask ? false : true; -} - -static vm_fault_t __kprobes -__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, - unsigned int flags, struct task_struct *tsk) +static inline bool is_permission_fault(unsigned int fsr) { - struct vm_area_struct *vma; - vm_fault_t fault; - - vma = find_vma(mm, addr); - fault = VM_FAULT_BADMAP; - if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > addr)) - goto check_stack; - - /* - * Ok, we have a good vm_area for this - * memory access, so we can handle it. - */ -good_area: - if (access_error(fsr, vma)) { - fault = VM_FAULT_BADACCESS; - goto out; - } - - return handle_mm_fault(vma, addr & PAGE_MASK, flags); - -check_stack: - /* Don't allow expansion below FIRST_USER_ADDRESS */ - if (vma->vm_flags & VM_GROWSDOWN && - addr >= FIRST_USER_ADDRESS && !expand_stack(vma, addr)) - goto good_area; -out: - return fault; + int fs = fsr_fs(fsr); +#ifdef CONFIG_ARM_LPAE + if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL) + return true; +#else + if (fs == FS_L1_PERM || fs == FS_L2_PERM) + return true; +#endif + return false; } static int __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { - struct task_struct *tsk; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; int sig, code; vm_fault_t fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + unsigned int flags = FAULT_FLAG_DEFAULT; + unsigned long vm_flags = VM_ACCESS_FLAGS; if (kprobe_page_fault(regs, fsr)) return 0; - tsk = current; - mm = tsk->mm; /* Enable interrupts if they were enabled in the parent context. */ if (interrupts_enabled(regs)) @@ -262,72 +269,92 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if ((fsr & FSR_WRITE) && !(fsr & FSR_CM)) + + if (is_write_fault(fsr)) { flags |= FAULT_FLAG_WRITE; + vm_flags = VM_WRITE; + } - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) + if (fsr & FSR_LNX_PF) { + vm_flags = VM_EXEC; + + if (is_permission_fault(fsr) && !user_mode(regs)) + die_kernel_fault("execution of memory", + mm, addr, fsr, regs); + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, addr); + if (!vma) + goto lock_mmap; + + if (!(vma->vm_flags & vm_flags)) { + vma_end_read(vma); + goto lock_mmap; + } + fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) goto no_context; + return 0; + } +lock_mmap: + retry: - down_read(&mm->mmap_sem); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case, we'll have missed the might_sleep() from - * down_read() - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && - !search_exception_tables(regs->ARM_pc)) - goto no_context; -#endif + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + fault = VM_FAULT_BADMAP; + goto bad_area; } - fault = __do_page_fault(mm, addr, fsr, flags, tsk); + /* + * ok, we have a good vm_area for this memory access, check the + * permissions on the VMA allow for the fault which occurred. + */ + if (!(vma->vm_flags & vm_flags)) + fault = VM_FAULT_BADACCESS; + else + fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); /* If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_sem because + * signal first. We do not need to release the mmap_lock because * it would already be released in __lock_page_or_retry in * mm/filemap.c. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) goto no_context; return 0; } - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, addr); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, addr); - } + if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; goto retry; } } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); +done: /* * Handle the "normal" case first - VM_FAULT_MAJOR @@ -335,6 +362,7 @@ retry: if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; +bad_area: /* * If we are in kernel mode at this point, we * have no context to handle this fault with. @@ -408,6 +436,7 @@ do_translation_fault(unsigned long addr, unsigned int fsr, { unsigned int index; pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -422,13 +451,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, pgd = cpu_get_pgd() + index; pgd_k = init_mm.pgd + index; - if (pgd_none(*pgd_k)) + p4d = p4d_offset(pgd, addr); + p4d_k = p4d_offset(pgd_k, addr); + + if (p4d_none(*p4d_k)) goto bad_area; - if (!pgd_present(*pgd)) - set_pgd(pgd, *pgd_k); + if (!p4d_present(*p4d)) + set_p4d(p4d, *p4d_k); - pud = pud_offset(pgd, addr); - pud_k = pud_offset(pgd_k, addr); + pud = pud_offset(p4d, addr); + pud_k = pud_offset(p4d_k, addr); if (pud_none(*pud_k)) goto bad_area; @@ -563,6 +595,7 @@ do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs) if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs)) return; + pr_alert("8<--- cut here ---\n"); pr_alert("Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n", inf->name, ifsr, addr); |