diff options
Diffstat (limited to 'arch/arm64/mm/fault.c')
-rw-r--r-- | arch/arm64/mm/fault.c | 304 |
1 files changed, 212 insertions, 92 deletions
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 4bf899fb451b..c7861c9864e6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -31,6 +31,7 @@ #include <linux/highmem.h> #include <linux/perf_event.h> #include <linux/preempt.h> +#include <linux/hugetlb.h> #include <asm/bug.h> #include <asm/cpufeature.h> @@ -42,7 +43,22 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> -static const char *fault_name(unsigned int esr); +#include <acpi/ghes.h> + +struct fault_info { + int (*fn)(unsigned long addr, unsigned int esr, + struct pt_regs *regs); + int sig; + int code; + const char *name; +}; + +static const struct fault_info fault_info[]; + +static inline const struct fault_info *esr_to_fault_info(unsigned int esr) +{ + return fault_info + (esr & 63); +} #ifdef CONFIG_KPROBES static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) @@ -67,18 +83,35 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) #endif /* - * Dump out the page tables associated with 'addr' in mm 'mm'. + * Dump out the page tables associated with 'addr' in the currently active mm. */ -void show_pte(struct mm_struct *mm, unsigned long addr) +void show_pte(unsigned long addr) { + struct mm_struct *mm; pgd_t *pgd; - if (!mm) + if (addr < TASK_SIZE) { + /* TTBR0 */ + mm = current->active_mm; + if (mm == &init_mm) { + pr_alert("[%016lx] user address but active_mm is swapper\n", + addr); + return; + } + } else if (addr >= VA_START) { + /* TTBR1 */ mm = &init_mm; + } else { + pr_alert("[%016lx] address between user and kernel address ranges\n", + addr); + return; + } - pr_alert("pgd = %p\n", mm->pgd); + pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgd = %p\n", + mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K, + VA_BITS, mm->pgd); pgd = pgd_offset(mm, addr); - pr_alert("[%08lx] *pgd=%016llx", addr, pgd_val(*pgd)); + pr_alert("[%016lx] *pgd=%016llx", addr, pgd_val(*pgd)); do { pud_t *pud; @@ -161,12 +194,33 @@ static bool is_el1_instruction_abort(unsigned int esr) return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; } +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs, + unsigned long addr) +{ + unsigned int ec = ESR_ELx_EC(esr); + unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (fsc_type == ESR_ELx_FSC_PERM) + return true; + + if (addr < USER_DS && system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + + return false; +} + /* * The kernel tried to access some page that wasn't present. */ -static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, - unsigned int esr, struct pt_regs *regs) +static void __do_kernel_fault(unsigned long addr, unsigned int esr, + struct pt_regs *regs) { + const char *msg; + /* * Are we prepared to handle this kernel fault? * We are almost certainly not prepared to handle instruction faults. @@ -178,11 +232,22 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, * No handler, we'll have to terminate things with extreme prejudice. */ bust_spinlocks(1); - pr_alert("Unable to handle kernel %s at virtual address %08lx\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); - show_pte(mm, addr); + if (is_permission_fault(esr, regs, addr)) { + if (esr & ESR_ELx_WNR) + msg = "write to read-only memory"; + else + msg = "read from unreadable memory"; + } else if (addr < PAGE_SIZE) { + msg = "NULL pointer dereference"; + } else { + msg = "paging request"; + } + + pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg, + addr); + + show_pte(addr); die("Oops", regs, esr); bust_spinlocks(0); do_exit(SIGKILL); @@ -194,16 +259,20 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, */ static void __do_user_fault(struct task_struct *tsk, unsigned long addr, unsigned int esr, unsigned int sig, int code, - struct pt_regs *regs) + struct pt_regs *regs, int fault) { struct siginfo si; + const struct fault_info *inf; + unsigned int lsb = 0; if (unhandled_signal(tsk, sig) && show_unhandled_signals_ratelimited()) { - pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x\n", - tsk->comm, task_pid_nr(tsk), fault_name(esr), sig, + inf = esr_to_fault_info(esr); + pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x", + tsk->comm, task_pid_nr(tsk), inf->name, sig, addr, esr); - show_pte(tsk->mm, addr); - show_regs(regs); + print_vma_addr(KERN_CONT ", in ", regs->pc); + pr_cont("\n"); + __show_regs(regs); } tsk->thread.fault_address = addr; @@ -212,22 +281,34 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr, si.si_errno = 0; si.si_code = code; si.si_addr = (void __user *)addr; + /* + * Either small page or large page may be poisoned. + * In other words, VM_FAULT_HWPOISON_LARGE and + * VM_FAULT_HWPOISON are mutually exclusive. + */ + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + else if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + si.si_addr_lsb = lsb; + force_sig_info(sig, &si, tsk); } static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->active_mm; + const struct fault_info *inf; /* * If we are in kernel mode at this point, we have no context to * handle this fault with. */ - if (user_mode(regs)) - __do_user_fault(tsk, addr, esr, SIGSEGV, SEGV_MAPERR, regs); - else - __do_kernel_fault(mm, addr, esr, regs); + if (user_mode(regs)) { + inf = esr_to_fault_info(esr); + __do_user_fault(tsk, addr, esr, inf->sig, inf->code, regs, 0); + } else + __do_kernel_fault(addr, esr, regs); } #define VM_FAULT_BADMAP 0x010000 @@ -270,21 +351,6 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) -{ - unsigned int ec = ESR_ELx_EC(esr); - unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - - if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) - return false; - - if (system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && - (regs->pstate & PSR_PAN_BIT); - else - return fsc_type == ESR_ELx_FSC_PERM; -} - static bool is_el0_instruction_abort(unsigned int esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -295,7 +361,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, { struct task_struct *tsk; struct mm_struct *mm; - int fault, sig, code; + int fault, sig, code, major = 0; unsigned long vm_flags = VM_READ | VM_WRITE; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; @@ -322,7 +388,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (addr < USER_DS && is_permission_fault(esr, regs)) { + if (addr < USER_DS && is_permission_fault(esr, regs, addr)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -334,6 +400,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, die("Accessing user space memory outside uaccess.h routines", regs, esr); } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -357,24 +425,42 @@ retry: } fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk); + major |= fault & VM_FAULT_MAJOR; - /* - * If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_sem because it - * would already be released in __lock_page_or_retry in mm/filemap.c. - */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) - return 0; + if (fault & VM_FAULT_RETRY) { + /* + * If we need to retry but a fatal signal is pending, + * handle the signal first. We do not need to release + * the mmap_sem because it would already be released + * in __lock_page_or_retry in mm/filemap.c. + */ + if (fatal_signal_pending(current)) + return 0; + + /* + * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of + * starvation. + */ + if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { + mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; + mm_flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + up_read(&mm->mmap_sem); /* - * Major/minor page fault accounting is only done on the initial - * attempt. If we go through a retry, it is extremely likely that the - * page will be found in page cache at that point. + * Handle the "normal" (no error) case first. */ - - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { + if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | + VM_FAULT_BADACCESS)))) { + /* + * Major/minor page fault accounting is only done + * once. If we go through a retry, it is extremely + * likely that the page will be found in page cache at + * that point. + */ + if (major) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr); @@ -383,25 +469,9 @@ retry: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr); } - if (fault & VM_FAULT_RETRY) { - /* - * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of - * starvation. - */ - mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; - mm_flags |= FAULT_FLAG_TRIED; - goto retry; - } - } - up_read(&mm->mmap_sem); - - /* - * Handle the "normal" case first - VM_FAULT_MAJOR - */ - if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | - VM_FAULT_BADACCESS)))) return 0; + } /* * If we are in kernel mode at this point, we have no context to @@ -427,6 +497,9 @@ retry: */ sig = SIGBUS; code = BUS_ADRERR; + } else if (fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { + sig = SIGBUS; + code = BUS_MCEERR_AR; } else { /* * Something tried to access memory that isn't in our memory @@ -437,11 +510,11 @@ retry: SEGV_ACCERR : SEGV_MAPERR; } - __do_user_fault(tsk, addr, esr, sig, code, regs); + __do_user_fault(tsk, addr, esr, sig, code, regs, fault); return 0; no_context: - __do_kernel_fault(mm, addr, esr, regs); + __do_kernel_fault(addr, esr, regs); return 0; } @@ -488,12 +561,48 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; } -static const struct fault_info { - int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); - int sig; - int code; - const char *name; -} fault_info[] = { +/* + * This abort handler deals with Synchronous External Abort. + * It calls notifiers, and then returns "fault". + */ +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) +{ + struct siginfo info; + const struct fault_info *inf; + int ret = 0; + + inf = esr_to_fault_info(esr); + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", + inf->name, esr, addr); + + /* + * Synchronous aborts may interrupt code which had interrupts masked. + * Before calling out into the wider kernel tell the interested + * subsystems. + */ + if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) { + if (interrupts_enabled(regs)) + nmi_enter(); + + ret = ghes_notify_sea(); + + if (interrupts_enabled(regs)) + nmi_exit(); + } + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = 0; + if (esr & ESR_ELx_FnV) + info.si_addr = NULL; + else + info.si_addr = (void __user *)addr; + arm64_notify_die("", regs, &info, esr); + + return ret; +} + +static const struct fault_info fault_info[] = { { do_bad, SIGBUS, 0, "ttbr address size fault" }, { do_bad, SIGBUS, 0, "level 1 address size fault" }, { do_bad, SIGBUS, 0, "level 2 address size fault" }, @@ -510,22 +619,22 @@ static const struct fault_info { { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, - { do_bad, SIGBUS, 0, "synchronous external abort" }, + { do_sea, SIGBUS, 0, "synchronous external abort" }, { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error" }, + { do_sea, SIGBUS, 0, "level 0 (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 1 (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 2 (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 3 (translation table walk)" }, + { do_sea, SIGBUS, 0, "synchronous parity or ECC error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, { do_bad, SIGBUS, 0, "unknown 27" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 0 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 1 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 2 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 3 synchronous parity error (translation table walk)" }, { do_bad, SIGBUS, 0, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, { do_bad, SIGBUS, 0, "unknown 34" }, @@ -560,10 +669,21 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 63" }, }; -static const char *fault_name(unsigned int esr) +/* + * Handle Synchronous External Aborts that occur in a guest kernel. + * + * The return value will be zero if the SEA was successfully handled + * and non-zero if there was an error processing the error or there was + * no error to process. + */ +int handle_guest_sea(phys_addr_t addr, unsigned int esr) { - const struct fault_info *inf = fault_info + (esr & 63); - return inf->name; + int ret = -ENOENT; + + if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) + ret = ghes_notify_sea(); + + return ret; } /* @@ -572,7 +692,7 @@ static const char *fault_name(unsigned int esr) asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - const struct fault_info *inf = fault_info + (esr & 63); + const struct fault_info *inf = esr_to_fault_info(esr); struct siginfo info; if (!inf->fn(addr, esr, regs)) |