summaryrefslogtreecommitdiff
path: root/arch/arm/mm/fault.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm/mm/fault.c')
-rw-r--r--arch/arm/mm/fault.c250
1 files changed, 162 insertions, 88 deletions
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index efa402025031..2bc828a1940c 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -17,6 +17,7 @@
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
+#include <linux/kfence.h>
#include <asm/system_misc.h>
#include <asm/system_info.h>
@@ -26,6 +27,13 @@
#ifdef CONFIG_MMU
+bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
+{
+ unsigned long addr = (unsigned long)unsafe_src;
+
+ return addr >= TASK_SIZE && ULONG_MAX - addr >= size;
+}
+
/*
* This is useful to dump out the page tables associated with
* 'addr' in mm 'mm'.
@@ -37,7 +45,6 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr)
if (!mm)
mm = &init_mm;
- printk("%spgd = %p\n", lvl, mm->pgd);
pgd = pgd_offset(mm, addr);
printk("%s[%08lx] *pgd=%08llx", lvl, addr, (long long)pgd_val(*pgd));
@@ -85,6 +92,9 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr)
break;
pte = pte_offset_map(pmd, addr);
+ if (!pte)
+ break;
+
pr_cont(", *pte=%08llx", (long long)pte_val(*pte));
#ifndef CONFIG_ARM_LPAE
pr_cont(", *ppte=%08llx",
@@ -100,6 +110,39 @@ void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr)
{ }
#endif /* CONFIG_MMU */
+static inline bool is_write_fault(unsigned int fsr)
+{
+ return (fsr & FSR_WRITE) && !(fsr & FSR_CM);
+}
+
+static inline bool is_translation_fault(unsigned int fsr)
+{
+ int fs = fsr_fs(fsr);
+#ifdef CONFIG_ARM_LPAE
+ if ((fs & FS_MMU_NOLL_MASK) == FS_TRANS_NOLL)
+ return true;
+#else
+ if (fs == FS_L1_TRANS || fs == FS_L2_TRANS)
+ return true;
+#endif
+ return false;
+}
+
+static void die_kernel_fault(const char *msg, struct mm_struct *mm,
+ unsigned long addr, unsigned int fsr,
+ struct pt_regs *regs)
+{
+ bust_spinlocks(1);
+ pr_alert("8<--- cut here ---\n");
+ pr_alert("Unable to handle kernel %s at virtual address %08lx when %s\n",
+ msg, addr, fsr & FSR_LNX_PF ? "execute" : str_write_read(fsr & FSR_WRITE));
+
+ show_pte(KERN_ALERT, mm, addr);
+ die("Oops", regs, fsr);
+ bust_spinlocks(0);
+ make_task_dead(SIGKILL);
+}
+
/*
* Oops. The kernel tried to access some page that wasn't present.
*/
@@ -107,6 +150,7 @@ static void
__do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
struct pt_regs *regs)
{
+ const char *msg;
/*
* Are we prepared to handle this kernel fault?
*/
@@ -116,16 +160,17 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
/*
* No handler, we'll have to terminate things with extreme prejudice.
*/
- bust_spinlocks(1);
- pr_alert("8<--- cut here ---\n");
- pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
- (addr < PAGE_SIZE) ? "NULL pointer dereference" :
- "paging request", addr);
+ if (addr < PAGE_SIZE) {
+ msg = "NULL pointer dereference";
+ } else {
+ if (is_translation_fault(fsr) &&
+ kfence_handle_page_fault(addr, is_write_fault(fsr), regs))
+ return;
- show_pte(KERN_ALERT, mm, addr);
- die("Oops", regs, fsr);
- bust_spinlocks(0);
- do_exit(SIGKILL);
+ msg = "paging request";
+ }
+
+ die_kernel_fault(msg, mm, addr, fsr, regs);
}
/*
@@ -180,76 +225,53 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
}
#ifdef CONFIG_MMU
-#define VM_FAULT_BADMAP 0x010000
-#define VM_FAULT_BADACCESS 0x020000
-
-/*
- * Check that the permissions on the VMA allow for the fault which occurred.
- * If we encountered a write fault, we must have write permission, otherwise
- * we allow any permission.
- */
-static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
+static inline bool is_permission_fault(unsigned int fsr)
{
- unsigned int mask = VM_ACCESS_FLAGS;
-
- if ((fsr & FSR_WRITE) && !(fsr & FSR_CM))
- mask = VM_WRITE;
- if (fsr & FSR_LNX_PF)
- mask = VM_EXEC;
-
- return vma->vm_flags & mask ? false : true;
+ int fs = fsr_fs(fsr);
+#ifdef CONFIG_ARM_LPAE
+ if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL)
+ return true;
+#else
+ if (fs == FS_L1_PERM || fs == FS_L2_PERM)
+ return true;
+#endif
+ return false;
}
-static vm_fault_t __kprobes
-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
- unsigned int flags, struct task_struct *tsk,
- struct pt_regs *regs)
+#ifdef CONFIG_CPU_TTBR0_PAN
+static inline bool ttbr0_usermode_access_allowed(struct pt_regs *regs)
{
- struct vm_area_struct *vma;
- vm_fault_t fault;
+ struct svc_pt_regs *svcregs;
- vma = find_vma(mm, addr);
- fault = VM_FAULT_BADMAP;
- if (unlikely(!vma))
- goto out;
- if (unlikely(vma->vm_start > addr))
- goto check_stack;
-
- /*
- * Ok, we have a good vm_area for this
- * memory access, so we can handle it.
- */
-good_area:
- if (access_error(fsr, vma)) {
- fault = VM_FAULT_BADACCESS;
- goto out;
- }
+ /* If we are in user mode: permission granted */
+ if (user_mode(regs))
+ return true;
- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+ /* uaccess state saved above pt_regs on SVC exception entry */
+ svcregs = to_svc_pt_regs(regs);
-check_stack:
- /* Don't allow expansion below FIRST_USER_ADDRESS */
- if (vma->vm_flags & VM_GROWSDOWN &&
- addr >= FIRST_USER_ADDRESS && !expand_stack(vma, addr))
- goto good_area;
-out:
- return fault;
+ return !(svcregs->ttbcr & TTBCR_EPD0);
+}
+#else
+static inline bool ttbr0_usermode_access_allowed(struct pt_regs *regs)
+{
+ return true;
}
+#endif
static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
- struct task_struct *tsk;
- struct mm_struct *mm;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
int sig, code;
vm_fault_t fault;
unsigned int flags = FAULT_FLAG_DEFAULT;
+ vm_flags_t vm_flags = VM_ACCESS_FLAGS;
if (kprobe_page_fault(regs, fsr))
return 0;
- tsk = current;
- mm = tsk->mm;
/* Enable interrupts if they were enabled in the parent context. */
if (interrupts_enabled(regs))
@@ -264,36 +286,84 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if ((fsr & FSR_WRITE) && !(fsr & FSR_CM))
+
+ if (is_write_fault(fsr)) {
flags |= FAULT_FLAG_WRITE;
+ vm_flags = VM_WRITE;
+ }
+
+ if (fsr & FSR_LNX_PF) {
+ vm_flags = VM_EXEC;
+
+ if (is_permission_fault(fsr) && !user_mode(regs))
+ die_kernel_fault("execution of memory",
+ mm, addr, fsr, regs);
+ }
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
/*
- * As per x86, we may deadlock here. However, since the kernel only
- * validly references user space from well defined areas of the code,
- * we can bug out early if this is from code which shouldn't.
+ * Privileged access aborts with CONFIG_CPU_TTBR0_PAN enabled are
+ * routed via the translation fault mechanism. Check whether uaccess
+ * is disabled while in kernel mode.
*/
- if (!mmap_read_trylock(mm)) {
- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
+ if (!ttbr0_usermode_access_allowed(regs))
+ goto no_context;
+
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, addr);
+ if (!vma)
+ goto lock_mmap;
+
+ if (!(vma->vm_flags & vm_flags)) {
+ vma_end_read(vma);
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ fault = 0;
+ code = SEGV_ACCERR;
+ goto bad_area;
+ }
+ fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
goto no_context;
+ return 0;
+ }
+lock_mmap:
+
retry:
- mmap_read_lock(mm);
- } else {
- /*
- * The above down_read_trylock() might have succeeded in
- * which case, we'll have missed the might_sleep() from
- * down_read()
- */
- might_sleep();
-#ifdef CONFIG_DEBUG_VM
- if (!user_mode(regs) &&
- !search_exception_tables(regs->ARM_pc))
- goto no_context;
-#endif
+ vma = lock_mm_and_find_vma(mm, addr, regs);
+ if (unlikely(!vma)) {
+ fault = 0;
+ code = SEGV_MAPERR;
+ goto bad_area;
}
- fault = __do_page_fault(mm, addr, fsr, flags, tsk, regs);
+ /*
+ * ok, we have a good vm_area for this memory access, check the
+ * permissions on the VMA allow for the fault which occurred.
+ */
+ if (!(vma->vm_flags & vm_flags)) {
+ mmap_read_unlock(mm);
+ fault = 0;
+ code = SEGV_ACCERR;
+ goto bad_area;
+ }
+
+ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
/* If we need to retry but a fatal signal is pending, handle the
* signal first. We do not need to release the mmap_lock because
@@ -305,7 +375,11 @@ retry:
return 0;
}
- if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
+ if (!(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
@@ -313,13 +387,14 @@ retry:
}
mmap_read_unlock(mm);
+done:
- /*
- * Handle the "normal" case first - VM_FAULT_MAJOR
- */
- if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+ /* Handle the "normal" case first */
+ if (likely(!(fault & VM_FAULT_ERROR)))
return 0;
+ code = SEGV_MAPERR;
+bad_area:
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
@@ -350,8 +425,6 @@ retry:
* isn't in our memory map..
*/
sig = SIGSEGV;
- code = fault == VM_FAULT_BADACCESS ?
- SEGV_ACCERR : SEGV_MAPERR;
}
__do_user_fault(addr, fsr, sig, code, regs);
@@ -552,6 +625,7 @@ do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs)
if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs))
return;
+ pr_alert("8<--- cut here ---\n");
pr_alert("Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
inf->name, ifsr, addr);