// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2021 Benjamin Berg */ #include #include #include #include /* * Known security issues * * Userspace can jump to this address to execute *any* syscall that is * permitted by the stub. As we will return afterwards, it can do * whatever it likes, including: * - Tricking the kernel into handing out the memory FD * - Using this memory FD to read/write all physical memory * - Running in parallel to the kernel processing a syscall * (possibly creating data races?) * - Blocking e.g. SIGALRM to avoid time based scheduling * * To avoid this, the permitted location for each syscall needs to be * checked for in the SECCOMP filter (which is reasonably simple). Also, * more care will need to go into considerations how the code might be * tricked by using a prepared stack (or even modifying the stack from * another thread in case SMP support is added). * * As for the SIGALRM, the best counter measure will be to check in the * kernel that the process is reporting back the SIGALRM in a timely * fashion. */ static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS]) { struct stub_data *d = get_stub_data(); int i; unsigned long res; int fd; for (i = 0; i < d->syscall_data_len; i++) { struct stub_syscall *sc = &d->syscall_data[i]; switch (sc->syscall) { case STUB_SYSCALL_MMAP: if (fd_map) fd = fd_map[sc->mem.fd]; else fd = sc->mem.fd; res = stub_syscall6(STUB_MMAP_NR, sc->mem.addr, sc->mem.length, sc->mem.prot, MAP_SHARED | MAP_FIXED, fd, sc->mem.offset); if (res != sc->mem.addr) { d->err = res; d->syscall_data_len = i; return -1; } break; case STUB_SYSCALL_MUNMAP: res = stub_syscall2(__NR_munmap, sc->mem.addr, sc->mem.length); if (res) { d->err = res; d->syscall_data_len = i; return -1; } break; default: d->err = -95; /* EOPNOTSUPP */ d->syscall_data_len = i; return -1; } } d->err = 0; d->syscall_data_len = 0; return 0; } void __section(".__syscall_stub") stub_syscall_handler(void) { syscall_handler(NULL); trap_myself(); } void __section(".__syscall_stub") stub_signal_interrupt(int sig, siginfo_t *info, void *p) { struct stub_data *d = get_stub_data(); char rcv_data; union { char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)]; struct cmsghdr align; } ctrl = {}; struct iovec iov = { .iov_base = &rcv_data, .iov_len = 1, }; struct msghdr msghdr = { .msg_iov = &iov, .msg_iovlen = 1, .msg_control = &ctrl, .msg_controllen = sizeof(ctrl), }; ucontext_t *uc = p; struct cmsghdr *fd_msg; int *fd_map; int num_fds; long res; d->signal = sig; d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0]; d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0]; restart_wait: d->futex = FUTEX_IN_KERN; do { res = stub_syscall3(__NR_futex, (unsigned long)&d->futex, FUTEX_WAKE, 1); } while (res == -EINTR); do { res = stub_syscall4(__NR_futex, (unsigned long)&d->futex, FUTEX_WAIT, FUTEX_IN_KERN, 0); } while (res == -EINTR || d->futex == FUTEX_IN_KERN); if (res < 0 && res != -EAGAIN) stub_syscall1(__NR_exit_group, 1); if (d->syscall_data_len) { /* Read passed FDs (if any) */ do { res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0); } while (res == -EINTR); /* We should never have a receive error (other than -EAGAIN) */ if (res < 0 && res != -EAGAIN) stub_syscall1(__NR_exit_group, 1); /* Receive the FDs */ num_fds = 0; fd_msg = msghdr.msg_control; fd_map = (void *)&CMSG_DATA(fd_msg); if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr)) num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int); /* Try running queued syscalls. */ res = syscall_handler(fd_map); while (num_fds) stub_syscall2(__NR_close, fd_map[--num_fds], 0); } else { res = 0; } if (res < 0 || d->restart_wait) { /* Report SIGSYS if we restart. */ d->signal = SIGSYS; d->restart_wait = 0; goto restart_wait; } /* Restore arch dependent state that is not part of the mcontext */ stub_seccomp_restore_state(&d->arch_data); /* Return so that the host modified mcontext is restored. */ } void __section(".__syscall_stub") stub_signal_restorer(void) { /* We must not have anything on the stack when doing rt_sigreturn */ stub_syscall0(__NR_rt_sigreturn); }