Diffstat (limited to 'arch/s390/mm/fault.c')
-rw-r--r--  arch/s390/mm/fault.c  893
1 file changed, 339 insertions(+), 554 deletions(-)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 047c3e4c59a2..e2e13778c36a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -1,17 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 /* * S390 version * Copyright IBM Corp. 1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> +#include <linux/cpufeature.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -19,241 +23,233 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> -#include <linux/compat.h> #include <linux/smp.h> #include <linux/kdebug.h> #include <linux/init.h> #include <linux/console.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/kfence.h> +#include <linux/pagewalk.h> +#include <asm/asm-extable.h> #include <asm/asm-offsets.h> -#include <asm/pgtable.h> +#include <asm/ptrace.h> +#include <asm/fault.h> +#include <asm/diag.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" -#ifndef CONFIG_64BIT -#define __FAIL_ADDR_MASK 0x7ffff000 -#define __SUBCODE_MASK 0x0200 -#define __PF_RES_FIELD 0ULL -#else /* CONFIG_64BIT */ -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL -#endif /* CONFIG_64BIT */ +/* + * Find out which address space caused the exception. 
+ */ +static bool is_kernel_fault(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 + if (user_mode(regs)) + return false; + if (teid.as == PSW_BITS_AS_SECONDARY) + return false; + return true; +} + +static unsigned long get_fault_address(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; -static unsigned long store_indication __read_mostly; + return teid.addr * PAGE_SIZE; +} -#ifdef CONFIG_64BIT -static int __init fault_init(void) +static __always_inline bool fault_is_write(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; + if (test_facility(75)) - store_indication = 0xc00; - return 0; + return teid.fsi == TEID_FSI_STORE; + return false; } -early_initcall(fault_init); -#endif -static inline int notify_page_fault(struct pt_regs *regs) +static void dump_pagetable(unsigned long asce, unsigned long address) { - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); + + pr_alert("AS:%016lx ", asce); + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION2: + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION3: + table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + goto out; + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_SEGMENT: + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + goto out; + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } - return ret; + table += (address & _PAGE_INDEX) >> PAGE_SHIFT; + if (get_kernel_nofault(entry, table)) + goto bad; + pr_cont("P:%016lx ", entry); +out: + pr_cont("\n"); + return; +bad: + pr_cont("BAD\n"); } - -/* - * Unlock any spinlocks which will prevent us from getting the - * message out. 
- */ -void bust_spinlocks(int yes) +static void dump_fault_info(struct pt_regs *regs) { - if (yes) { - oops_in_progress = 1; + union teid teid = { .val = regs->int_parm_long }; + unsigned long asce; + + pr_alert("Failing address: %016lx TEID: %016lx", + get_fault_address(regs), teid.val); + if (test_facility(131)) + pr_cont(" ESOP-2"); + else if (machine_has_esop()) + pr_cont(" ESOP-1"); + else + pr_cont(" SOP"); + if (test_facility(75)) + pr_cont(" FSI"); + pr_cont("\n"); + pr_alert("Fault in "); + switch (teid.as) { + case PSW_BITS_AS_HOME: + pr_cont("home space "); + break; + case PSW_BITS_AS_SECONDARY: + pr_cont("secondary space "); + break; + case PSW_BITS_AS_ACCREG: + pr_cont("access register "); + break; + case PSW_BITS_AS_PRIMARY: + pr_cont("primary space "); + break; + } + pr_cont("mode while using "); + if (is_kernel_fault(regs)) { + asce = get_lowcore()->kernel_asce.val; + pr_cont("kernel "); } else { - int loglevel_save = console_loglevel; - console_unblank(); - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; - printk(" "); - console_loglevel = loglevel_save; + asce = get_lowcore()->user_asce.val; + pr_cont("user "); } + pr_cont("ASCE.\n"); + dump_pagetable(asce, get_fault_address(regs)); } -/* - * Returns the address space associated with the fault. - * Returns 0 for kernel space and 1 for user space. - */ -static inline int user_space_fault(unsigned long trans_exc_code) +int show_unhandled_signals = 1; + +static const struct ctl_table s390_fault_sysctl_table[] = { + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_s390_fault_sysctls(void) { - /* - * The lowest two bits of the translation exception - * identification indicate which paging table was used. - */ - trans_exc_code &= 3; - if (trans_exc_code == 2) - /* Access via secondary space, set_fs setting decides */ - return current->thread.mm_segment.ar4; - if (s390_user_mode == HOME_SPACE_MODE) - /* User space if the access has been done via home space. */ - return trans_exc_code == 3; - /* - * If the user space is not the home space the kernel runs in home - * space. Access via secondary space has already been covered, - * access via primary space or access register is from user space - * and access via home space is from the kernel. 
- */ - return trans_exc_code != 3; + register_sysctl_init("kernel", s390_fault_sysctl_table); + return 0; } +arch_initcall(init_s390_fault_sysctls); -static inline void report_user_fault(struct pt_regs *regs, long signr) +void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code 0x%X ", - regs->int_code); - print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN); - printk(KERN_CONT "\n"); - printk(KERN_ALERT "failing address: %lX\n", - regs->int_parm_long & __FAIL_ADDR_MASK); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); + print_vma_addr(KERN_CONT "in ", regs->psw.addr); + pr_cont("\n"); + if (is_mm_fault) + dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { - struct siginfo si; - - report_user_fault(regs, SIGSEGV); - si.si_signo = SIGSEGV; - si.si_code = si_code; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGSEGV, &si, current); + report_user_fault(regs, SIGSEGV, 1); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -static noinline void do_no_context(struct pt_regs *regs) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { - const struct exception_table_entry *fixup; unsigned long address; + bool is_write; - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); - if (fixup) { - regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE; + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } + if (fixup_exception(regs)) return; + if (is_kernel_fault(regs)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); + } else { + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); } - - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - address = regs->int_parm_long & __FAIL_ADDR_MASK; - if (!user_space_fault(regs->int_parm_long)) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " at virtual kernel address %p\n", (void *)address); - else - printk(KERN_ALERT "Unable to handle kernel paging request" - " at virtual user address %p\n", (void *)address); - + dump_fault_info(regs); die(regs, "Oops"); - do_exit(SIGKILL); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. 
*/ - die (regs, "Low-address protection"); - do_exit(SIGKILL); - } + struct mm_struct *mm = current->mm; - do_no_context(regs); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline void do_sigbus(struct pt_regs *regs) +static void do_sigbus(struct pt_regs *regs) { - struct task_struct *tsk = current; - struct siginfo si; - - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_ADRERR; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGBUS, &si, tsk); -} - -static noinline void do_fault_error(struct pt_regs *regs, int fault) -{ - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - return; - } - case VM_FAULT_BADCONTEXT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -262,427 +258,216 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) */ -static inline int do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; unsigned int flags; - int fault; + vm_fault_t fault; + bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ - clear_tsk_thread_flag(tsk, TIF_PER_TRAP); - - if (notify_page_fault(regs)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. 
- */ - fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) - goto out; - - address = trans_exc_code & __FAIL_ADDR_MASK; + clear_thread_flag(TIF_PER_TRAP); + if (kprobe_page_fault(regs, 14)) + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (is_kernel_fault(regs) || faulthandler_disabled() || !mm) + return handle_fault_error_nolock(regs, 0); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + flags = FAULT_FLAG_DEFAULT; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - down_read(&mm->mmap_sem); - -#ifdef CONFIG_PGSTE - if ((current->flags & PF_VCPU) && S390_lowcore.gmap) { - address = __gmap_fault(address, - (struct gmap *) S390_lowcore.gmap); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } - if (address == -ENOMEM) { - fault = VM_FAULT_OOM; - goto out_up; - } + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } -#endif - + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: retry: - fault = VM_FAULT_BADMAP; - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto out_up; - - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; - } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ - fault = VM_FAULT_BADACCESS; + return handle_fault_error_nolock(regs, SEGV_MAPERR); if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, flags); - /* No reason to continue if interrupted by SIGKILL. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { - fault = VM_FAULT_SIGNAL; - goto out; + return handle_fault_error(regs, SEGV_ACCERR); + fault = handle_mm_fault(vma, address, flags, regs); + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. 
- */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - down_read(&mm->mmap_sem); - goto retry; - } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + goto retry; + } + mmap_read_unlock(mm); +done: + if (!(fault & VM_FAULT_ERROR)) + return; + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigbus(regs); + } else { + pr_emerg("Unexpected fault flags: %08x\n", fault); + BUG(); } - fault = 0; -out_up: - up_read(&mm->mmap_sem); -out: - return fault; } -void __kprobes do_protection_exception(struct pt_regs *regs) +void do_protection_exception(struct pt_regs *regs) { - unsigned long trans_exc_code; - int fault; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long; /* * Protection exceptions are suppressing, decrement psw address. * The exception to this rule are aborted transactions, for these * the PSW already points to the correct location. */ - if (!(regs->int_code & 0x200)) + if (!(regs->int_code & 0x200)) { regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); - /* - * Check for low-address protection. This needs to be treated - * as a special case because the translation exception code - * field is not guaranteed to contain valid data in this case. - */ - if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs); - return; + set_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED); } - fault = do_exception(regs, VM_WRITE); - if (unlikely(fault)) - do_fault_error(regs, fault); -} - -void __kprobes do_dat_exception(struct pt_regs *regs) -{ - int access, fault; - - access = VM_READ | VM_EXEC | VM_WRITE; - fault = do_exception(regs, access); - if (unlikely(fault)) - do_fault_error(regs, fault); -} - -#ifdef CONFIG_64BIT -void __kprobes do_asce_exception(struct pt_regs *regs) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long trans_exc_code; - /* - * The instruction that caused the program check has - * been nullified. Don't signal single step via SIGTRAP. + * If bit 61 of the TEID is not set, the remainder of the + * TEID is unpredictable. Special handling is required. */ - clear_tsk_thread_flag(current, TIF_PER_TRAP); - - trans_exc_code = regs->int_parm_long; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) - goto no_context; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK); - up_read(&mm->mmap_sem); - - if (vma) { - update_mm(mm, current); - return; + if (unlikely(!teid.b61)) { + if (user_mode(regs)) { + dump_fault_info(regs); + die(regs, "Unexpected TEID"); + } + /* Assume low-address protection in kernel mode. 
*/ + return handle_fault_error_nolock(regs, 0); } - - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs)) { - do_sigsegv(regs, SEGV_MAPERR); - return; + if (unlikely(cpu_has_nx() && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - -no_context: - do_no_context(regs); -} -#endif - -int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) -{ - struct pt_regs regs; - int access, fault; - - /* Emulate a uaccess fault from kernel mode. */ - regs.psw.mask = psw_kernel_bits | PSW_MASK_DAT | PSW_MASK_MCHECK; - if (!irqs_disabled()) - regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; - regs.psw.addr = (unsigned long) __builtin_return_address(0); - regs.psw.addr |= PSW_ADDR_AMODE; - regs.int_code = pgm_int_code; - regs.int_parm_long = (uaddr & PAGE_MASK) | 2; - access = write ? VM_WRITE : VM_READ; - fault = do_exception(®s, access); - /* - * Since the fault happened in kernel mode while performing a uaccess - * all we need to do now is emulating a fixup in case "fault" is not - * zero. - * For the calling uaccess functions this results always in -EFAULT. - */ - return fault ? -EFAULT : 0; + do_exception(regs, VM_WRITE); } +NOKPROBE_SYMBOL(do_protection_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) +void do_dat_exception(struct pt_regs *regs) { - pfault_disable = 1; - return 1; + do_exception(regs, VM_ACCESS_FLAGS); } +NOKPROBE_SYMBOL(do_dat_exception); -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -int pfault_init(void) -{ - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_CURRENT_PID, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD }; - int rc; - - if (pfault_disable) - return -1; - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); - return rc; -} +#if IS_ENABLED(CONFIG_PGSTE) -void pfault_fini(void) +void do_secure_storage_access(struct pt_regs *regs) { - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, - }; - - if (pfault_disable) - return; - asm volatile( - " diag %0,0,0x258\n" - "0:\n" - EX_TABLE(0b,0b) - : : "a" (&refbk), "m" (refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); + struct vm_area_struct *vma; + struct folio_walk fw; + struct mm_struct *mm; + struct folio *folio; + int rc; /* - * Get the external interruption subcode & pfault - * initial/completion signal bit. VM stores this - * in the 'cpu address' field associated with the - * external interrupt. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. 
*/ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = sizeof(void *) == 4 ? param32 : param64; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & 0x0080) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (tsk->state == TASK_RUNNING) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; } + /* + * The kernel should never run into this case and + * there is no way out of this situation. 
+ */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int __cpuinit pfault_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); + if (is_kernel_fault(regs)) { + folio = phys_to_folio(addr); + if (unlikely(!folio_try_get(folio))) + return; + rc = arch_make_folio_accessible(folio); + folio_put(folio); + if (rc) + BUG(); + } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); + mm = current->mm; + mmap_read_lock(mm); + vma = find_vma(mm, addr); + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { + mmap_read_unlock(mm); + return; } - spin_unlock_irq(&pfault_lock); - break; - default: - break; + /* arch_make_folio_accessible() needs a raised refcount. */ + folio_get(folio); + rc = arch_make_folio_accessible(folio); + folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) + send_sig(SIGSEGV, current, 0); + mmap_read_unlock(mm); } - return NOTIFY_OK; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_interrupt(0x2603, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - service_subclass_irq_register(); - hotcpu_notifier(pfault_cpu_notify, 0); - return 0; - -out_pfault: - unregister_external_interrupt(0x2603, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; } -early_initcall(pfault_irq_init); +NOKPROBE_SYMBOL(do_secure_storage_access); -#endif /* CONFIG_PFAULT */ +#endif /* CONFIG_PGSTE */ |
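
The central abstraction in this rewrite is the union teid overlay on regs->int_parm_long, which replaces the old __FAIL_ADDR_MASK/store_indication arithmetic: get_fault_address() returns teid.addr * PAGE_SIZE, fault_is_write() compares teid.fsi against TEID_FSI_STORE (meaningful only when facility 75 is installed), and do_protection_exception() keys off teid.b61. Below is a minimal stand-alone sketch of that decoding using explicit shifts instead of the kernel's bitfield union. The bit positions are inferred from the masks the old code in this diff used (-4096L address mask, 0xc00/0x400 store indication, 4 for bit 61, 3 for the address-space bits); the helper names and the sample value are illustrative stand-ins, not the kernel API.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT     12
#define TEID_FSI_STORE 1	/* matches the old (teid & 0xc00) == 0x400 store test */

/* Bits 0-51 (IBM numbering, bit 0 = MSB) hold the failing page address. */
static uint64_t teid_fault_address(uint64_t teid)
{
	return teid & ~((UINT64_C(1) << PAGE_SHIFT) - 1);
}

/* Bits 52-53 are the fetch/store indication, i.e. bits 11-10 from the LSB. */
static int teid_is_write(uint64_t teid)
{
	return ((teid >> 10) & 3) == TEID_FSI_STORE;
}

/* Bit 61 (bit 2 from the LSB) says whether the rest of the TEID is valid. */
static int teid_b61(uint64_t teid)
{
	return (teid >> 2) & 1;
}

/* Bits 62-63 identify the address space (0 = primary ... 3 = home). */
static int teid_as(uint64_t teid)
{
	return teid & 3;
}

int main(void)
{
	uint64_t teid = UINT64_C(0x000003ffe52a0404);	/* made-up sample value */

	printf("addr=%#llx write=%d b61=%d as=%d\n",
	       (unsigned long long)teid_fault_address(teid),
	       teid_is_write(teid), teid_b61(teid), teid_as(teid));
	return 0;
}

Compiled stand-alone this prints addr=0x3ffe52a0000 write=1 b61=1 as=0, i.e. a store to a primary-space address with a valid TEID: the same classification that do_exception(), is_kernel_fault() and dump_fault_info() derive from regs->int_parm_long.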

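dump_pagetable() walks from the ASCE down to the page-table entry, indexing each level with the address bits above that level's shift and stopping early at an invalid or large entry (hence the _REGION3_ENTRY_LARGE and _SEGMENT_ENTRY_LARGE checks before descending). A hedged sketch of that decomposition follows; the shift values (region1 53, region2 42, region3 31, segment 20, page 12) and table sizes (2048 entries per region/segment table, 256 per page table) are assumptions based on the usual s390 layout, since the diff only references the _REGION*_INDEX/_SEGMENT_INDEX masks by name.

#include <stdio.h>
#include <stdint.h>

/* Assumed s390 shift values, one per table level. */
#define REGION1_SHIFT 53
#define REGION2_SHIFT 42
#define REGION3_SHIFT 31
#define SEGMENT_SHIFT 20
#define PAGE_SHIFT    12

#define CRSTE_ENTRIES 2048	/* region and segment tables */
#define PTE_ENTRIES   256	/* page tables */

/* Index into one table level: address bits above 'shift', modulo table size. */
static unsigned int level_index(uint64_t addr, unsigned int shift, unsigned int entries)
{
	return (unsigned int)((addr >> shift) % entries);
}

int main(void)
{
	uint64_t addr = UINT64_C(0x3ffe52a1000);	/* made-up fault address */

	printf("R1:%u R2:%u R3:%u S:%u P:%u\n",
	       level_index(addr, REGION1_SHIFT, CRSTE_ENTRIES),
	       level_index(addr, REGION2_SHIFT, CRSTE_ENTRIES),
	       level_index(addr, REGION3_SHIFT, CRSTE_ENTRIES),
	       level_index(addr, SEGMENT_SHIFT, CRSTE_ENTRIES),
	       level_index(addr, PAGE_SHIFT, PTE_ENTRIES));
	return 0;
}

These five indexes correspond, in order, to the R1/R2/R3/S/P entries that dump_pagetable() prints while descending from the top-level table selected by the ASCE type.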