diff options
Diffstat (limited to 'arch/um/os-Linux')
31 files changed, 2485 insertions, 3223 deletions
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 08ff5094fcdd..f8d672d570d9 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -1,20 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) -# Licensed under the GPL # -obj-y = aio.o execvp.o file.o helper.o irq.o main.o mem.o process.o \ +# Don't instrument UML-specific code +KCOV_INSTRUMENT := n + +obj-y = elf_aux.o execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ - umid.o user_syms.o util.o drivers/ skas/ + umid.o user_syms.o util.o skas/ -obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o +CFLAGS_signal.o += -Wframe-larger-than=4096 -USER_OBJS := $(user-objs-y) aio.o elf_aux.o execvp.o file.o helper.o irq.o \ - main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ - tty.o umid.o util.o +CFLAGS_main.o += -Wno-frame-larger-than -HAVE_AIO_ABI := $(shell [ -r /usr/include/linux/aio_abi.h ] && \ - echo -DHAVE_AIO_ABI ) -CFLAGS_aio.o += $(HAVE_AIO_ABI) +obj-$(CONFIG_SMP) += smp.o + +USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ + main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ + tty.o umid.o util.o smp.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c deleted file mode 100644 index 3a6bc2af0961..000000000000 --- a/arch/um/os-Linux/aio.c +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <unistd.h> -#include <sched.h> -#include <signal.h> -#include <errno.h> -#include <sys/time.h> -#include <asm/unistd.h> -#include <aio.h> -#include <init.h> -#include <kern_util.h> -#include <os.h> - -struct aio_thread_req { - enum aio_type type; - int io_fd; - unsigned long long offset; - char *buf; - int len; - struct aio_context *aio; -}; - -#if defined(HAVE_AIO_ABI) -#include <linux/aio_abi.h> - -/* - * If we have the headers, we are going to build with AIO enabled. - * If we don't have aio in libc, we define the necessary stubs here. - */ - -#if !defined(HAVE_AIO_LIBC) - -static long io_setup(int n, aio_context_t *ctxp) -{ - return syscall(__NR_io_setup, n, ctxp); -} - -static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp) -{ - return syscall(__NR_io_submit, ctx, nr, iocbpp); -} - -static long io_getevents(aio_context_t ctx_id, long min_nr, long nr, - struct io_event *events, struct timespec *timeout) -{ - return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout); -} - -#endif - -/* - * The AIO_MMAP cases force the mmapped page into memory here - * rather than in whatever place first touches the data. I used - * to do this by touching the page, but that's delicate because - * gcc is prone to optimizing that away. So, what's done here - * is we read from the descriptor from which the page was - * mapped. The caller is required to pass an offset which is - * inside the page that was mapped. Thus, when the read - * returns, we know that the page is in the page cache, and - * that it now backs the mmapped area. - */ - -static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf, - int len, unsigned long long offset, struct aio_context *aio) -{ - struct iocb *iocbp = & ((struct iocb) { - .aio_data = (unsigned long) aio, - .aio_fildes = fd, - .aio_buf = (unsigned long) buf, - .aio_nbytes = len, - .aio_offset = offset - }); - char c; - - switch (type) { - case AIO_READ: - iocbp->aio_lio_opcode = IOCB_CMD_PREAD; - break; - case AIO_WRITE: - iocbp->aio_lio_opcode = IOCB_CMD_PWRITE; - break; - case AIO_MMAP: - iocbp->aio_lio_opcode = IOCB_CMD_PREAD; - iocbp->aio_buf = (unsigned long) &c; - iocbp->aio_nbytes = sizeof(c); - break; - default: - printk(UM_KERN_ERR "Bogus op in do_aio - %d\n", type); - return -EINVAL; - } - - return (io_submit(ctx, 1, &iocbp) > 0) ? 0 : -errno; -} - -/* Initialized in an initcall and unchanged thereafter */ -static aio_context_t ctx = 0; - -static int aio_thread(void *arg) -{ - struct aio_thread_reply reply; - struct io_event event; - int err, n, reply_fd; - - signal(SIGWINCH, SIG_IGN); - - while (1) { - n = io_getevents(ctx, 1, 1, &event, NULL); - if (n < 0) { - if (errno == EINTR) - continue; - printk(UM_KERN_ERR "aio_thread - io_getevents failed, " - "errno = %d\n", errno); - } - else { - reply = ((struct aio_thread_reply) - { .data = (void *) (long) event.data, - .err = event.res }); - reply_fd = ((struct aio_context *) reply.data)->reply_fd; - err = write(reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) - printk(UM_KERN_ERR "aio_thread - write failed, " - "fd = %d, err = %d\n", reply_fd, errno); - } - } - return 0; -} - -#endif - -static int do_not_aio(struct aio_thread_req *req) -{ - char c; - unsigned long long actual; - int n; - - actual = lseek64(req->io_fd, req->offset, SEEK_SET); - if (actual != req->offset) - return -errno; - - switch (req->type) { - case AIO_READ: - n = read(req->io_fd, req->buf, req->len); - break; - case AIO_WRITE: - n = write(req->io_fd, req->buf, req->len); - break; - case AIO_MMAP: - n = read(req->io_fd, &c, sizeof(c)); - break; - default: - printk(UM_KERN_ERR "do_not_aio - bad request type : %d\n", - req->type); - return -EINVAL; - } - - if (n < 0) - return -errno; - return 0; -} - -/* These are initialized in initcalls and not changed */ -static int aio_req_fd_r = -1; -static int aio_req_fd_w = -1; -static int aio_pid = -1; -static unsigned long aio_stack; - -static int not_aio_thread(void *arg) -{ - struct aio_thread_req req; - struct aio_thread_reply reply; - int err; - - signal(SIGWINCH, SIG_IGN); - while (1) { - err = read(aio_req_fd_r, &req, sizeof(req)); - if (err != sizeof(req)) { - if (err < 0) - printk(UM_KERN_ERR "not_aio_thread - " - "read failed, fd = %d, err = %d\n", - aio_req_fd_r, - errno); - else { - printk(UM_KERN_ERR "not_aio_thread - short " - "read, fd = %d, length = %d\n", - aio_req_fd_r, err); - } - continue; - } - err = do_not_aio(&req); - reply = ((struct aio_thread_reply) { .data = req.aio, - .err = err }); - err = write(req.aio->reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) - printk(UM_KERN_ERR "not_aio_thread - write failed, " - "fd = %d, err = %d\n", req.aio->reply_fd, errno); - } - - return 0; -} - -static int init_aio_24(void) -{ - int fds[2], err; - - err = os_pipe(fds, 1, 1); - if (err) - goto out; - - aio_req_fd_w = fds[0]; - aio_req_fd_r = fds[1]; - - err = os_set_fd_block(aio_req_fd_w, 0); - if (err) - goto out_close_pipe; - - err = run_helper_thread(not_aio_thread, NULL, - CLONE_FILES | CLONE_VM, &aio_stack); - if (err < 0) - goto out_close_pipe; - - aio_pid = err; - goto out; - -out_close_pipe: - close(fds[0]); - close(fds[1]); - aio_req_fd_w = -1; - aio_req_fd_r = -1; -out: -#ifndef HAVE_AIO_ABI - printk(UM_KERN_INFO "/usr/include/linux/aio_abi.h not present during " - "build\n"); -#endif - printk(UM_KERN_INFO "2.6 host AIO support not used - falling back to " - "I/O thread\n"); - return 0; -} - -#ifdef HAVE_AIO_ABI -#define DEFAULT_24_AIO 0 -static int init_aio_26(void) -{ - int err; - - if (io_setup(256, &ctx)) { - err = -errno; - printk(UM_KERN_ERR "aio_thread failed to initialize context, " - "err = %d\n", errno); - return err; - } - - err = run_helper_thread(aio_thread, NULL, - CLONE_FILES | CLONE_VM, &aio_stack); - if (err < 0) - return err; - - aio_pid = err; - - printk(UM_KERN_INFO "Using 2.6 host AIO\n"); - return 0; -} - -static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - struct aio_thread_reply reply; - int err; - - err = do_aio(ctx, type, io_fd, buf, len, offset, aio); - if (err) { - reply = ((struct aio_thread_reply) { .data = aio, - .err = err }); - err = write(aio->reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) { - err = -errno; - printk(UM_KERN_ERR "submit_aio_26 - write failed, " - "fd = %d, err = %d\n", aio->reply_fd, -err); - } - else err = 0; - } - - return err; -} - -#else -#define DEFAULT_24_AIO 1 -static int init_aio_26(void) -{ - return -ENOSYS; -} - -static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - return -ENOSYS; -} -#endif - -/* Initialized in an initcall and unchanged thereafter */ -static int aio_24 = DEFAULT_24_AIO; - -static int __init set_aio_24(char *name, int *add) -{ - aio_24 = 1; - return 0; -} - -__uml_setup("aio=2.4", set_aio_24, -"aio=2.4\n" -" This is used to force UML to use 2.4-style AIO even when 2.6 AIO is\n" -" available. 2.4 AIO is a single thread that handles one request at a\n" -" time, synchronously. 2.6 AIO is a thread which uses the 2.6 AIO \n" -" interface to handle an arbitrary number of pending requests. 2.6 AIO \n" -" is not available in tt mode, on 2.4 hosts, or when UML is built with\n" -" /usr/include/linux/aio_abi.h not available. Many distributions don't\n" -" include aio_abi.h, so you will need to copy it from a kernel tree to\n" -" your /usr/include/linux in order to build an AIO-capable UML\n\n" -); - -static int init_aio(void) -{ - int err; - - if (!aio_24) { - err = init_aio_26(); - if (err && (errno == ENOSYS)) { - printk(UM_KERN_INFO "2.6 AIO not supported on the " - "host - reverting to 2.4 AIO\n"); - aio_24 = 1; - } - else return err; - } - - if (aio_24) - return init_aio_24(); - - return 0; -} - -/* - * The reason for the __initcall/__uml_exitcall asymmetry is that init_aio - * needs to be called when the kernel is running because it calls run_helper, - * which needs get_free_page. exit_aio is a __uml_exitcall because the generic - * kernel does not run __exitcalls on shutdown, and can't because many of them - * break when called outside of module unloading. - */ -__initcall(init_aio); - -static void exit_aio(void) -{ - if (aio_pid != -1) { - os_kill_process(aio_pid, 1); - free_stack(aio_stack, 0); - } -} - -__uml_exitcall(exit_aio); - -static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - struct aio_thread_req req = { .type = type, - .io_fd = io_fd, - .offset = offset, - .buf = buf, - .len = len, - .aio = aio, - }; - int err; - - err = write(aio_req_fd_w, &req, sizeof(req)); - if (err == sizeof(req)) - err = 0; - else err = -errno; - - return err; -} - -int submit_aio(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, int reply_fd, - struct aio_context *aio) -{ - aio->reply_fd = reply_fd; - if (aio_24) - return submit_aio_24(type, io_fd, buf, len, offset, aio); - else - return submit_aio_26(type, io_fd, buf, len, offset, aio); -} diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile deleted file mode 100644 index 6c546dc9222b..000000000000 --- a/arch/um/os-Linux/drivers/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# -# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) -# Licensed under the GPL -# - -ethertap-objs := ethertap_kern.o ethertap_user.o -tuntap-objs := tuntap_kern.o tuntap_user.o - -obj-y = -obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o -obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o - -include arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/drivers/etap.h b/arch/um/os-Linux/drivers/etap.h deleted file mode 100644 index 54183a679fdd..000000000000 --- a/arch/um/os-Linux/drivers/etap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __DRIVERS_ETAP_H -#define __DRIVERS_ETAP_H - -#include <net_user.h> - -struct ethertap_data { - char *dev_name; - char *gate_addr; - int data_fd; - int control_fd; - void *dev; -}; - -extern const struct net_user_info ethertap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c deleted file mode 100644 index f424600a583f..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_kern.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include "etap.h" -#include <net_kern.h> - -struct ethertap_init { - char *dev_name; - char *gate_addr; -}; - -static void etap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct ethertap_data *epri; - struct ethertap_init *init = data; - - pri = netdev_priv(dev); - epri = (struct ethertap_data *) pri->user; - epri->dev_name = init->dev_name; - epri->gate_addr = init->gate_addr; - epri->data_fd = -1; - epri->control_fd = -1; - epri->dev = dev; - - printk(KERN_INFO "ethertap backend - %s", epri->dev_name); - if (epri->gate_addr != NULL) - printk(KERN_CONT ", IP = %s", epri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int etap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - int len; - - len = net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + 2 + ETH_HEADER_ETHERTAP); - if (len <= 0) - return(len); - - skb_pull(skb, 2); - len -= 2; - return len; -} - -static int etap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - skb_push(skb, 2); - return net_send(fd, skb->data, skb->len); -} - -const struct net_kern_info ethertap_kern_info = { - .init = etap_init, - .protocol = eth_protocol, - .read = etap_read, - .write = etap_write, -}; - -int ethertap_setup(char *str, char **mac_out, void *data) -{ - struct ethertap_init *init = data; - - *init = ((struct ethertap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "ethertap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - if (init->dev_name == NULL) { - printk(KERN_ERR "ethertap_setup : Missing tap device name\n"); - return 0; - } - - return 1; -} - -static struct transport ethertap_transport = { - .list = LIST_HEAD_INIT(ethertap_transport.list), - .name = "ethertap", - .setup = ethertap_setup, - .user = ðertap_user_info, - .kern = ðertap_kern_info, - .private_size = sizeof(struct ethertap_data), - .setup_size = sizeof(struct ethertap_init), -}; - -static int register_ethertap(void) -{ - register_transport(ðertap_transport); - return 0; -} - -late_initcall(register_ethertap); diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c deleted file mode 100644 index b39b6696ac58..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_user.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include <stdio.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include "etap.h" -#include <os.h> -#include <net_user.h> -#include <um_malloc.h> - -#define MAX_PACKET ETH_MAX_PACKET - -static int etap_user_init(void *data, void *dev) -{ - struct ethertap_data *pri = data; - - pri->dev = dev; - return 0; -} - -struct addr_change { - enum { ADD_ADDR, DEL_ADDR } what; - unsigned char addr[4]; - unsigned char netmask[4]; -}; - -static void etap_change(int op, unsigned char *addr, unsigned char *netmask, - int fd) -{ - struct addr_change change; - char *output; - int n; - - change.what = op; - memcpy(change.addr, addr, sizeof(change.addr)); - memcpy(change.netmask, netmask, sizeof(change.netmask)); - CATCH_EINTR(n = write(fd, &change, sizeof(change))); - if (n != sizeof(change)) { - printk(UM_KERN_ERR "etap_change - request failed, err = %d\n", - errno); - return; - } - - output = uml_kmalloc(UM_KERN_PAGE_SIZE, UM_GFP_KERNEL); - if (output == NULL) - printk(UM_KERN_ERR "etap_change : Failed to allocate output " - "buffer\n"); - read_output(fd, output, UM_KERN_PAGE_SIZE); - if (output != NULL) { - printk("%s", output); - kfree(output); - } -} - -static void etap_open_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(ADD_ADDR, addr, netmask, *((int *) arg)); -} - -static void etap_close_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(DEL_ADDR, addr, netmask, *((int *) arg)); -} - -struct etap_pre_exec_data { - int control_remote; - int control_me; - int data_me; -}; - -static void etap_pre_exec(void *arg) -{ - struct etap_pre_exec_data *data = arg; - - dup2(data->control_remote, 1); - close(data->data_me); - close(data->control_me); -} - -static int etap_tramp(char *dev, char *gate, int control_me, - int control_remote, int data_me, int data_remote) -{ - struct etap_pre_exec_data pe_data; - int pid, err, n; - char version_buf[sizeof("nnnnn\0")]; - char data_fd_buf[sizeof("nnnnnn\0")]; - char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; - char *setup_args[] = { "uml_net", version_buf, "ethertap", dev, - data_fd_buf, gate_buf, NULL }; - char *nosetup_args[] = { "uml_net", version_buf, "ethertap", - dev, data_fd_buf, NULL }; - char **args, c; - - sprintf(data_fd_buf, "%d", data_remote); - sprintf(version_buf, "%d", UML_NET_VERSION); - if (gate != NULL) { - strcpy(gate_buf, gate); - args = setup_args; - } - else args = nosetup_args; - - err = 0; - pe_data.control_remote = control_remote; - pe_data.control_me = control_me; - pe_data.data_me = data_me; - pid = run_helper(etap_pre_exec, &pe_data, args); - - if (pid < 0) - err = pid; - close(data_remote); - close(control_remote); - CATCH_EINTR(n = read(control_me, &c, sizeof(c))); - if (n != sizeof(c)) { - err = -errno; - printk(UM_KERN_ERR "etap_tramp : read of status failed, " - "err = %d\n", -err); - return err; - } - if (c != 1) { - printk(UM_KERN_ERR "etap_tramp : uml_net failed\n"); - err = helper_wait(pid); - } - return err; -} - -static int etap_open(void *data) -{ - struct ethertap_data *pri = data; - char *output; - int data_fds[2], control_fds[2], err, output_len; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err) - return err; - - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, data_fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "etap_open - data socketpair failed - " - "err = %d\n", errno); - return err; - } - - err = socketpair(AF_UNIX, SOCK_STREAM, 0, control_fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "etap_open - control socketpair failed - " - "err = %d\n", errno); - goto out_close_data; - } - - err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0], - control_fds[1], data_fds[0], data_fds[1]); - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - read_output(control_fds[0], output, output_len); - - if (output == NULL) - printk(UM_KERN_ERR "etap_open : failed to allocate output " - "buffer\n"); - else { - printk("%s", output); - kfree(output); - } - - if (err < 0) { - printk(UM_KERN_ERR "etap_tramp failed - err = %d\n", -err); - goto out_close_control; - } - - pri->data_fd = data_fds[0]; - pri->control_fd = control_fds[0]; - iter_addresses(pri->dev, etap_open_addr, &pri->control_fd); - return data_fds[0]; - -out_close_control: - close(control_fds[0]); - close(control_fds[1]); -out_close_data: - close(data_fds[0]); - close(data_fds[1]); - return err; -} - -static void etap_close(int fd, void *data) -{ - struct ethertap_data *pri = data; - - iter_addresses(pri->dev, etap_close_addr, &pri->control_fd); - close(fd); - - if (shutdown(pri->data_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown data socket failed, " - "errno = %d\n", errno); - - if (shutdown(pri->control_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown control socket " - "failed, errno = %d\n", errno); - - close(pri->data_fd); - pri->data_fd = -1; - close(pri->control_fd); - pri->control_fd = -1; -} - -static void etap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if (pri->control_fd == -1) - return; - etap_open_addr(addr, netmask, &pri->control_fd); -} - -static void etap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - if (pri->control_fd == -1) - return; - - etap_close_addr(addr, netmask, &pri->control_fd); -} - -const struct net_user_info ethertap_user_info = { - .init = etap_user_init, - .open = etap_open, - .close = etap_close, - .remove = NULL, - .add_address = etap_add_addr, - .delete_address = etap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_ETHERTAP, -}; diff --git a/arch/um/os-Linux/drivers/tuntap.h b/arch/um/os-Linux/drivers/tuntap.h deleted file mode 100644 index 7367354ac8df..000000000000 --- a/arch/um/os-Linux/drivers/tuntap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __UM_TUNTAP_H -#define __UM_TUNTAP_H - -#include <net_user.h> - -struct tuntap_data { - char *dev_name; - int fixed_config; - char *gate_addr; - int fd; - void *dev; -}; - -extern const struct net_user_info tuntap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c deleted file mode 100644 index d9d56e5810fe..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_kern.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/netdevice.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <asm/errno.h> -#include <net_kern.h> -#include "tuntap.h" - -struct tuntap_init { - char *dev_name; - char *gate_addr; -}; - -static void tuntap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct tuntap_data *tpri; - struct tuntap_init *init = data; - - pri = netdev_priv(dev); - tpri = (struct tuntap_data *) pri->user; - tpri->dev_name = init->dev_name; - tpri->fixed_config = (init->dev_name != NULL); - tpri->gate_addr = init->gate_addr; - tpri->fd = -1; - tpri->dev = dev; - - printk(KERN_INFO "TUN/TAP backend - "); - if (tpri->gate_addr != NULL) - printk(KERN_CONT "IP = %s", tpri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int tuntap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_read(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int tuntap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_write(fd, skb->data, skb->len); -} - -const struct net_kern_info tuntap_kern_info = { - .init = tuntap_init, - .protocol = eth_protocol, - .read = tuntap_read, - .write = tuntap_write, -}; - -int tuntap_setup(char *str, char **mac_out, void *data) -{ - struct tuntap_init *init = data; - - *init = ((struct tuntap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "tuntap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - - return 1; -} - -static struct transport tuntap_transport = { - .list = LIST_HEAD_INIT(tuntap_transport.list), - .name = "tuntap", - .setup = tuntap_setup, - .user = &tuntap_user_info, - .kern = &tuntap_kern_info, - .private_size = sizeof(struct tuntap_data), - .setup_size = sizeof(struct tuntap_init), -}; - -static int register_tuntap(void) -{ - register_transport(&tuntap_transport); - return 0; -} - -late_initcall(register_tuntap); diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c deleted file mode 100644 index 14126d9176aa..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_user.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <stdio.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <linux/if_tun.h> -#include <net/if.h> -#include <sys/ioctl.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include <sys/uio.h> -#include <kern_util.h> -#include <os.h> -#include "tuntap.h" - -static int tuntap_user_init(void *data, void *dev) -{ - struct tuntap_data *pri = data; - - pri->dev = dev; - return 0; -} - -static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if ((pri->fd == -1) || pri->fixed_config) - return; - open_addr(addr, netmask, pri->dev_name); -} - -static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - if ((pri->fd == -1) || pri->fixed_config) - return; - close_addr(addr, netmask, pri->dev_name); -} - -struct tuntap_pre_exec_data { - int stdout; - int close_me; -}; - -static void tuntap_pre_exec(void *arg) -{ - struct tuntap_pre_exec_data *data = arg; - - dup2(data->stdout, 1); - close(data->close_me); -} - -static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote, - char *buffer, int buffer_len, int *used_out) -{ - struct tuntap_pre_exec_data data; - char version_buf[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate, - NULL }; - char buf[CMSG_SPACE(sizeof(*fd_out))]; - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int pid, n, err; - - sprintf(version_buf, "%d", UML_NET_VERSION); - - data.stdout = remote; - data.close_me = me; - - pid = run_helper(tuntap_pre_exec, &data, argv); - - if (pid < 0) - return -pid; - - close(remote); - - msg.msg_name = NULL; - msg.msg_namelen = 0; - if (buffer != NULL) { - iov = ((struct iovec) { buffer, buffer_len }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - } - else { - msg.msg_iov = NULL; - msg.msg_iovlen = 0; - } - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; - n = recvmsg(me, &msg, 0); - *used_out = n; - if (n < 0) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open_tramp : recvmsg failed - " - "errno = %d\n", errno); - return err; - } - helper_wait(pid); - - cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "message\n"); - return -EINVAL; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "descriptor\n"); - return -EINVAL; - } - *fd_out = ((int *) CMSG_DATA(cmsg))[0]; - os_set_exec_close(*fd_out); - return 0; -} - -static int tuntap_open(void *data) -{ - struct ifreq ifr; - struct tuntap_data *pri = data; - char *output, *buffer; - int err, fds[2], len, used; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err < 0) - return err; - - if (pri->fixed_config) { - pri->fd = os_open_file("/dev/net/tun", - of_cloexec(of_rdwr(OPENFLAGS())), 0); - if (pri->fd < 0) { - printk(UM_KERN_ERR "Failed to open /dev/net/tun, " - "err = %d\n", -pri->fd); - return pri->fd; - } - memset(&ifr, 0, sizeof(ifr)); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); - if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) { - err = -errno; - printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n", - errno); - close(pri->fd); - return err; - } - } - else { - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open : socketpair failed - " - "errno = %d\n", errno); - return err; - } - - buffer = get_output_buffer(&len); - if (buffer != NULL) - len--; - used = 0; - - err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0], - fds[1], buffer, len, &used); - - output = buffer; - if (err < 0) { - printk("%s", output); - free_output_buffer(buffer); - printk(UM_KERN_ERR "tuntap_open_tramp failed - " - "err = %d\n", -err); - return err; - } - - pri->dev_name = uml_strdup(buffer); - output += IFNAMSIZ; - printk("%s", output); - free_output_buffer(buffer); - - close(fds[0]); - iter_addresses(pri->dev, open_addr, pri->dev_name); - } - - return pri->fd; -} - -static void tuntap_close(int fd, void *data) -{ - struct tuntap_data *pri = data; - - if (!pri->fixed_config) - iter_addresses(pri->dev, close_addr, pri->dev_name); - close(fd); - pri->fd = -1; -} - -const struct net_user_info tuntap_user_info = { - .init = tuntap_user_init, - .open = tuntap_open, - .close = tuntap_close, - .remove = NULL, - .add_address = tuntap_add_addr, - .delete_address = tuntap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/os-Linux/elf_aux.c b/arch/um/os-Linux/elf_aux.c index 1a365ddc4d02..72f416edf252 100644 --- a/arch/um/os-Linux/elf_aux.c +++ b/arch/um/os-Linux/elf_aux.c @@ -1,7 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * arch/um/kernel/elf_aux.c * - * Scan the Elf auxiliary vector provided by the host to extract + * Scan the ELF auxiliary vector provided by the host to extract * information about vsyscall-page, etc. * * Copyright (C) 2004 Fujitsu Siemens Computers GmbH @@ -12,37 +13,27 @@ #include <init.h> #include <elf_user.h> #include <mem_user.h> +#include "internal.h" +#include <linux/swab.h> +#if __BITS_PER_LONG == 64 +typedef Elf64_auxv_t elf_auxv_t; +#else typedef Elf32_auxv_t elf_auxv_t; +#endif /* These are initialized very early in boot and never changed */ char * elf_aux_platform; -extern long elf_aux_hwcap; -unsigned long vsyscall_ehdr; -unsigned long vsyscall_end; -unsigned long __kernel_vsyscall; +long elf_aux_hwcap; __init void scan_elf_aux( char **envp) { - long page_size = 0; elf_auxv_t * auxv; while ( *envp++ != NULL) ; for ( auxv = (elf_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) { switch ( auxv->a_type ) { - case AT_SYSINFO: - __kernel_vsyscall = auxv->a_un.a_val; - /* See if the page is under TASK_SIZE */ - if (__kernel_vsyscall < (unsigned long) envp) - __kernel_vsyscall = 0; - break; - case AT_SYSINFO_EHDR: - vsyscall_ehdr = auxv->a_un.a_val; - /* See if the page is under TASK_SIZE */ - if (vsyscall_ehdr < (unsigned long) envp) - vsyscall_ehdr = 0; - break; case AT_HWCAP: elf_aux_hwcap = auxv->a_un.a_val; break; @@ -54,20 +45,6 @@ __init void scan_elf_aux( char **envp) elf_aux_platform = (char *) (long) auxv->a_un.a_val; break; - case AT_PAGESZ: - page_size = auxv->a_un.a_val; - break; } } - if ( ! __kernel_vsyscall || ! vsyscall_ehdr || - ! elf_aux_hwcap || ! elf_aux_platform || - ! page_size || (vsyscall_ehdr % page_size) ) { - __kernel_vsyscall = 0; - vsyscall_ehdr = 0; - elf_aux_hwcap = 0; - elf_aux_platform = "i586"; - } - else { - vsyscall_end = vsyscall_ehdr + page_size; - } } diff --git a/arch/um/os-Linux/execvp.c b/arch/um/os-Linux/execvp.c index 8fb25ca07c46..c09a5fd5e225 100644 --- a/arch/um/os-Linux/execvp.c +++ b/arch/um/os-Linux/execvp.c @@ -93,6 +93,7 @@ int execvp_noalloc(char *buf, const char *file, char *const argv[]) up finding no executable we can use, we want to diagnose that we did find one but were denied access. */ got_eacces = 1; + break; case ENOENT: case ESTALE: case ENOTDIR: @@ -136,7 +137,7 @@ int main(int argc, char**argv) int ret; argc--; if (!argc) { - fprintf(stderr, "Not enough arguments\n"); + os_warn("Not enough arguments\n"); return 1; } argv++; diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index c17bd6f7d674..21f0e50fb1df 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -1,18 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> #include <unistd.h> +#include <stdlib.h> +#include <string.h> #include <errno.h> #include <fcntl.h> #include <signal.h> +#include <linux/falloc.h> #include <sys/ioctl.h> #include <sys/mount.h> #include <sys/socket.h> #include <sys/stat.h> +#include <sys/sysmacros.h> #include <sys/un.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/eventfd.h> +#include <poll.h> #include <os.h> static void copy_stat(struct uml_stat *dst, const struct stat64 *src) @@ -98,21 +106,6 @@ int os_get_ifname(int fd, char* namebuf) return 0; } -int os_set_slip(int fd) -{ - int disc, sencap; - - disc = N_SLIP; - if (ioctl(fd, TIOCSETD, &disc) < 0) - return -errno; - - sencap = 0; - if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0) - return -errno; - - return 0; -} - int os_mode_fd(int fd, int mode) { int err; @@ -233,6 +226,16 @@ out: return err; } +int os_dup_file(int fd) +{ + int new_fd = dup(fd); + + if (new_fd < 0) + return -errno; + + return new_fd; +} + void os_close_file(int fd) { close(fd); @@ -257,6 +260,15 @@ int os_read_file(int fd, void *buf, int len) return n; } +int os_pread_file(int fd, void *buf, int len, unsigned long long offset) +{ + int n = pread(fd, buf, len, offset); + + if (n < 0) + return -errno; + return n; +} + int os_write_file(int fd, const void *buf, int len) { int n = write(fd, (void *) buf, len); @@ -266,6 +278,25 @@ int os_write_file(int fd, const void *buf, int len) return n; } +int os_sync_file(int fd) +{ + int n = fdatasync(fd); + + if (n < 0) + return -errno; + return n; +} + +int os_pwrite_file(int fd, const void *buf, int len, unsigned long long offset) +{ + int n = pwrite(fd, (void *) buf, len, offset); + + if (n < 0) + return -errno; + return n; +} + + int os_file_size(const char *file, unsigned long long *size_out) { struct uml_stat buf; @@ -304,7 +335,7 @@ int os_file_size(const char *file, unsigned long long *size_out) return 0; } -int os_file_modtime(const char *file, unsigned long *modtime) +int os_file_modtime(const char *file, long long *modtime) { struct uml_stat buf; int err; @@ -461,44 +492,51 @@ int os_shutdown_socket(int fd, int r, int w) return 0; } -int os_rcv_fd(int fd, int *helper_pid_out) +/** + * os_rcv_fd_msg - receive message with (optional) FDs + * @fd: the FD to receive from + * @fds: the array for FDs to write to + * @n_fds: number of FDs to receive (@fds array size) + * @data: the message buffer + * @data_len: the size of the message to receive + * + * Receive a message with FDs. + * + * Returns: the size of the received message, or an error code + */ +ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, + void *data, size_t data_len) { - int new, n; - char buf[CMSG_SPACE(sizeof(new))]; - struct msghdr msg; +#define MAX_RCV_FDS 2 + char buf[CMSG_SPACE(sizeof(*fds) * MAX_RCV_FDS)]; struct cmsghdr *cmsg; - struct iovec iov; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - iov = ((struct iovec) { .iov_base = helper_pid_out, - .iov_len = sizeof(*helper_pid_out) }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; + struct iovec iov = { + .iov_base = data, + .iov_len = data_len, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = CMSG_SPACE(sizeof(*fds) * n_fds), + }; + int n; + + if (n_fds > MAX_RCV_FDS) + return -EINVAL; n = recvmsg(fd, &msg, 0); if (n < 0) return -errno; - else if (n != iov.iov_len) - *helper_pid_out = -1; cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "rcv_fd didn't receive anything, " - "error = %d\n", errno); - return -1; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "rcv_fd didn't receive a descriptor\n"); - return -1; - } + if (!cmsg || + cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return n; - new = ((int *) CMSG_DATA(cmsg))[0]; - return new; + memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0)); + return n; } int os_create_unix_socket(const char *file, int len, int close_on_exec) @@ -574,3 +612,115 @@ unsigned long long os_makedev(unsigned major, unsigned minor) { return makedev(major, minor); } + +int os_falloc_punch(int fd, unsigned long long offset, int len) +{ + int n = fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, offset, len); + + if (n < 0) + return -errno; + return n; +} + +int os_falloc_zeroes(int fd, unsigned long long offset, int len) +{ + int n = fallocate(fd, FALLOC_FL_ZERO_RANGE|FALLOC_FL_KEEP_SIZE, offset, len); + + if (n < 0) + return -errno; + return n; +} + +int os_eventfd(unsigned int initval, int flags) +{ + int fd = eventfd(initval, flags); + + if (fd < 0) + return -errno; + return fd; +} + +int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, + unsigned int fds_num) +{ + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = len, + }; + union { + char control[CMSG_SPACE(sizeof(*fds) * OS_SENDMSG_MAX_FDS)]; + struct cmsghdr align; + } u; + unsigned int fds_size = sizeof(*fds) * fds_num; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = u.control, + .msg_controllen = CMSG_SPACE(fds_size), + }; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + int err; + + if (fds_num > OS_SENDMSG_MAX_FDS) + return -EINVAL; + memset(u.control, 0, sizeof(u.control)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fds_size); + memcpy(CMSG_DATA(cmsg), fds, fds_size); + err = sendmsg(fd, &msg, 0); + + if (err < 0) + return -errno; + return err; +} + +int os_poll(unsigned int n, const int *fds) +{ + /* currently need 2 FDs at most so avoid dynamic allocation */ + struct pollfd pollfds[2] = {}; + unsigned int i; + int ret; + + if (n > ARRAY_SIZE(pollfds)) + return -EINVAL; + + for (i = 0; i < n; i++) { + pollfds[i].fd = fds[i]; + pollfds[i].events = POLLIN; + } + + ret = poll(pollfds, n, -1); + if (ret < 0) + return -errno; + + /* Return the index of the available FD */ + for (i = 0; i < n; i++) { + if (pollfds[i].revents) + return i; + } + + return -EIO; +} + +void *os_mmap_rw_shared(int fd, size_t size) +{ + void *res = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (res == MAP_FAILED) + return NULL; + + return res; +} + +void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size) +{ + void *res; + + res = mremap(old_addr, old_size, new_size, MREMAP_MAYMOVE, NULL); + + if (res == MAP_FAILED) + return NULL; + + return res; +} diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index e3ee4a51ef63..89c2ad2a4e3a 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -1,12 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> +#include <string.h> #include <unistd.h> #include <errno.h> #include <sched.h> +#include <pthread.h> #include <linux/limits.h> #include <sys/socket.h> #include <sys/wait.h> @@ -45,7 +47,7 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) unsigned long stack, sp; int pid, fds[2], ret, n; - stack = alloc_stack(0, __cant_sleep()); + stack = alloc_stack(0, __uml_cant_sleep()); if (stack == 0) return -ENOMEM; @@ -64,12 +66,12 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) goto out_close; } - sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *); + sp = stack + UM_KERN_PAGE_SIZE; data.pre_exec = pre_exec; data.pre_data = pre_data; data.argv = argv; data.fd = fds[1]; - data.buf = __cant_sleep() ? uml_kmalloc(PATH_MAX, UM_GFP_ATOMIC) : + data.buf = __uml_cant_sleep() ? uml_kmalloc(PATH_MAX, UM_GFP_ATOMIC) : uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); pid = clone(helper_child, (void *) sp, CLONE_VM, &data); if (pid < 0) { @@ -96,9 +98,13 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) "ret = %d\n", -n); ret = n; } - CATCH_EINTR(waitpid(pid, NULL, __WCLONE)); + CATCH_EINTR(waitpid(pid, NULL, __WALL)); } + if (ret < 0) + printk(UM_KERN_ERR "run_helper : failed to exec %s on host: %s\n", + argv[0], strerror(-ret)); + out_free2: kfree(data.buf); out_close: @@ -116,11 +122,15 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long stack, sp; int pid, status, err; - stack = alloc_stack(0, __cant_sleep()); + /* To share memory space, use os_run_helper_thread() instead. */ + if (flags & CLONE_VM) + return -EINVAL; + + stack = alloc_stack(0, __uml_cant_sleep()); if (stack == 0) return -ENOMEM; - sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *); + sp = stack + UM_KERN_PAGE_SIZE; pid = clone(proc, (void *) sp, flags, arg); if (pid < 0) { err = -errno; @@ -129,7 +139,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, return err; } if (stack_out == NULL) { - CATCH_EINTR(pid = waitpid(pid, &status, __WCLONE)); + CATCH_EINTR(pid = waitpid(pid, &status, __WALL)); if (pid < 0) { err = -errno; printk(UM_KERN_ERR "run_helper_thread - wait failed, " @@ -148,7 +158,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, int helper_wait(int pid) { int ret, status; - int wflags = __WCLONE; + int wflags = __WALL; CATCH_EINTR(ret = waitpid(pid, &status, wflags)); if (ret < 0) { @@ -162,3 +172,65 @@ int helper_wait(int pid) } else return 0; } + +struct os_helper_thread { + pthread_t handle; +}; + +int os_run_helper_thread(struct os_helper_thread **td_out, + void *(*routine)(void *), void *arg) +{ + struct os_helper_thread *td; + sigset_t sigset, oset; + int err, flags; + + flags = __uml_cant_sleep() ? UM_GFP_ATOMIC : UM_GFP_KERNEL; + td = uml_kmalloc(sizeof(*td), flags); + if (!td) + return -ENOMEM; + + sigfillset(&sigset); + if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) { + err = -errno; + kfree(td); + return err; + } + + err = pthread_create(&td->handle, NULL, routine, arg); + + if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0) + panic("Failed to restore the signal mask: %d", errno); + + if (err != 0) + kfree(td); + else + *td_out = td; + + return -err; +} + +void os_kill_helper_thread(struct os_helper_thread *td) +{ + pthread_cancel(td->handle); + pthread_join(td->handle, NULL); + kfree(td); +} + +void os_fix_helper_thread_signals(void) +{ + sigset_t sigset; + + sigemptyset(&sigset); + + sigaddset(&sigset, SIGWINCH); + sigaddset(&sigset, SIGPIPE); + sigaddset(&sigset, SIGPROF); + sigaddset(&sigset, SIGINT); + sigaddset(&sigset, SIGTERM); + sigaddset(&sigset, SIGCHLD); + sigaddset(&sigset, SIGALRM); + sigaddset(&sigset, SIGIO); + sigaddset(&sigset, SIGUSR1); + + pthread_sigmask(SIG_SETMASK, &sigset, NULL); +} diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h index 0dc2c9f135f6..bac9fcc8c14c 100644 --- a/arch/um/os-Linux/internal.h +++ b/arch/um/os-Linux/internal.h @@ -1 +1,36 @@ -void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc); +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_OS_LINUX_INTERNAL_H +#define __UM_OS_LINUX_INTERNAL_H + +#include <mm_id.h> +#include <stub-data.h> +#include <signal.h> + +/* + * elf_aux.c + */ +void scan_elf_aux(char **envp); + +/* + * mem.c + */ +void check_tmpexec(void); + +/* + * signal.c + */ +extern __thread int signals_enabled; +int timer_alarm_pending(void); + +/* + * skas/process.c + */ +void wait_stub_done(int pid); +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys); + +/* + * smp.c + */ +#define IPI_SIGNAL SIGRTMIN + +#endif /* __UM_OS_LINUX_INTERNAL_H */ diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c index b9afb74b79ad..cf7e49c08b21 100644 --- a/arch/um/os-Linux/irq.c +++ b/arch/um/os-Linux/irq.c @@ -1,135 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2017 - Cambridge Greys Ltd + * Copyright (C) 2011 - 2014 Cisco Systems Inc * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> #include <errno.h> -#include <poll.h> +#include <sys/epoll.h> #include <signal.h> #include <string.h> #include <irq_user.h> #include <os.h> #include <um_malloc.h> +/* Epoll support */ + +static int epollfd = -1; + +#define MAX_EPOLL_EVENTS 64 + +static struct epoll_event epoll_events[MAX_EPOLL_EVENTS]; + +/* Helper to return an Epoll data pointer from an epoll event structure. + * We need to keep this one on the userspace side to keep includes separate + */ + +void *os_epoll_get_data_pointer(int index) +{ + return epoll_events[index].data.ptr; +} + +/* Helper to compare events versus the events in the epoll structure. + * Same as above - needs to be on the userspace side + */ + + +int os_epoll_triggered(int index, int events) +{ + return epoll_events[index].events & events; +} +/* Helper to set the event mask. + * The event mask is opaque to the kernel side, because it does not have + * access to the right includes/defines for EPOLL constants. + */ + +int os_event_mask(enum um_irq_type irq_type) +{ + if (irq_type == IRQ_READ) + return EPOLLIN | EPOLLPRI | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if (irq_type == IRQ_WRITE) + return EPOLLOUT; + return 0; +} + /* - * Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd - * and os_free_irq_by_cb, which are called under irq_lock. + * Initial Epoll Setup */ -static struct pollfd *pollfds = NULL; -static int pollfds_num = 0; -static int pollfds_size = 0; +int os_setup_epoll(void) +{ + epollfd = epoll_create(MAX_EPOLL_EVENTS); + return epollfd; +} -int os_waiting_for_events(struct irq_fd *active_fds) +/* + * Helper to run the actual epoll_wait + */ +int os_waiting_for_events_epoll(void) { - struct irq_fd *irq_fd; - int i, n, err; + int n, err; - n = poll(pollfds, pollfds_num, 0); + n = epoll_wait(epollfd, + (struct epoll_event *) &epoll_events, MAX_EPOLL_EVENTS, 0); if (n < 0) { err = -errno; if (errno != EINTR) - printk(UM_KERN_ERR "os_waiting_for_events:" - " poll returned %d, errno = %d\n", n, errno); + printk( + UM_KERN_ERR "os_waiting_for_events:" + " epoll returned %d, error = %s\n", n, + strerror(errno) + ); return err; } - - if (n == 0) - return 0; - - irq_fd = active_fds; - - for (i = 0; i < pollfds_num; i++) { - if (pollfds[i].revents != 0) { - irq_fd->current_events = pollfds[i].revents; - pollfds[i].fd = -1; - } - irq_fd = irq_fd->next; - } return n; } -int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds) -{ - if (pollfds_num == pollfds_size) { - if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) { - /* return min size needed for new pollfds area */ - return (pollfds_size + 1) * sizeof(pollfds[0]); - } - - if (pollfds != NULL) { - memcpy(tmp_pfd, pollfds, - sizeof(pollfds[0]) * pollfds_size); - /* remove old pollfds */ - kfree(pollfds); - } - pollfds = tmp_pfd; - pollfds_size++; - } else - kfree(tmp_pfd); /* remove not used tmp_pfd */ - - pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, - .events = events, - .revents = 0 }); - pollfds_num++; - - return 0; -} -void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, - struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2) +/* + * Helper to add a fd to epoll + */ +int os_add_epoll_fd(int events, int fd, void *data) { - struct irq_fd **prev; - int i = 0; - - prev = &active_fds; - while (*prev != NULL) { - if ((*test)(*prev, arg)) { - struct irq_fd *old_fd = *prev; - if ((pollfds[i].fd != -1) && - (pollfds[i].fd != (*prev)->fd)) { - printk(UM_KERN_ERR "os_free_irq_by_cb - " - "mismatch between active_fds and " - "pollfds, fd %d vs %d\n", - (*prev)->fd, pollfds[i].fd); - goto out; - } - - pollfds_num--; - - /* - * This moves the *whole* array after pollfds[i] - * (though it doesn't spot as such)! - */ - memmove(&pollfds[i], &pollfds[i + 1], - (pollfds_num - i) * sizeof(pollfds[0])); - if (*last_irq_ptr2 == &old_fd->next) - *last_irq_ptr2 = prev; - - *prev = (*prev)->next; - if (old_fd->type == IRQ_WRITE) - ignore_sigio_fd(old_fd->fd); - kfree(old_fd); - continue; - } - prev = &(*prev)->next; - i++; - } - out: - return; + struct epoll_event event; + int result; + + event.data.ptr = data; + event.events = events | EPOLLET; + result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event); + if ((result) && (errno == EEXIST)) + result = os_mod_epoll_fd(events, fd, data); + if (result) + printk("epollctl add err fd %d, %s\n", fd, strerror(errno)); + return result; } -int os_get_pollfd(int i) +/* + * Helper to mod the fd event mask and/or data backreference + */ +int os_mod_epoll_fd(int events, int fd, void *data) { - return pollfds[i].fd; + struct epoll_event event; + int result; + + event.data.ptr = data; + event.events = events; + result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event); + if (result) + printk(UM_KERN_ERR + "epollctl mod err fd %d, %s\n", fd, strerror(errno)); + return result; } -void os_set_pollfd(int i, int fd) +/* + * Helper to delete the epoll fd + */ +int os_del_epoll_fd(int fd) { - pollfds[i].fd = fd; + struct epoll_event event; + /* This is quiet as we use this as IO ON/OFF - so it is often + * invoked on a non-existent fd + */ + return epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event); } void os_set_ioignore(void) { signal(SIGIO, SIG_IGN); } + +void os_close_epoll_fd(void) +{ + /* Needed so we do not leak an fd when rebooting */ + os_close_file(epollfd); +} diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 749c96da7b99..7e114862a723 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -10,19 +11,17 @@ #include <signal.h> #include <string.h> #include <sys/resource.h> +#include <sys/personality.h> #include <as-layout.h> #include <init.h> #include <kern_util.h> #include <os.h> #include <um_malloc.h> +#include "internal.h" -#define PGD_BOUND (4 * 1024 * 1024) #define STACKSIZE (8 * 1024 * 1024) -#define THREAD_NAME_LEN (256) -long elf_aux_hwcap; - -static void set_stklim(void) +static void __init set_stklim(void) { struct rlimit lim; @@ -39,24 +38,13 @@ static void set_stklim(void) } } -static __init void do_uml_initcalls(void) -{ - initcall_t *call; - - call = &__uml_initcall_start; - while (call < &__uml_initcall_end) { - (*call)(); - call++; - } -} - static void last_ditch_exit(int sig) { uml_cleanup(); exit(1); } -static void install_fatal_handler(int sig) +static void __init install_fatal_handler(int sig) { struct sigaction action; @@ -73,15 +61,15 @@ static void install_fatal_handler(int sig) action.sa_restorer = NULL; action.sa_handler = last_ditch_exit; if (sigaction(sig, &action, NULL) < 0) { - printf("failed to install handler for signal %d - errno = %d\n", - sig, errno); + os_warn("failed to install handler for signal %d " + "- errno = %d\n", sig, errno); exit(1); } } #define UML_LIB_PATH ":" OS_LIB_PATH "/uml" -static void setup_env_path(void) +static void __init setup_env_path(void) { char *new_path = NULL; char *old_path = NULL; @@ -112,17 +100,32 @@ static void setup_env_path(void) } } -extern void scan_elf_aux( char **envp); - int __init main(int argc, char **argv, char **envp) { char **new_argv; int ret, i, err; + /* Disable randomization and re-exec if it was changed successfully */ + ret = personality(PER_LINUX | ADDR_NO_RANDOMIZE); + if (ret >= 0 && (ret & (PER_LINUX | ADDR_NO_RANDOMIZE)) != + (PER_LINUX | ADDR_NO_RANDOMIZE)) { + char buf[4096] = {}; + ssize_t ret; + + ret = readlink("/proc/self/exe", buf, sizeof(buf)); + if (ret < 0 || ret >= sizeof(buf)) { + perror("readlink failure"); + exit(1); + } + execve(buf, argv, envp); + } + set_stklim(); setup_env_path(); + setsid(); + new_argv = malloc((argc + 1) * sizeof(char *)); if (new_argv == NULL) { perror("Mallocing argv"); @@ -144,12 +147,10 @@ int __init main(int argc, char **argv, char **envp) install_fatal_handler(SIGINT); install_fatal_handler(SIGTERM); -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA scan_elf_aux(envp); -#endif - do_uml_initcalls(); - ret = linux_main(argc, argv); + change_sig(SIGPIPE, 0); + ret = linux_main(argc, argv, envp); /* * Disable SIGPROF - I have no idea why libc doesn't do this or turn @@ -160,18 +161,18 @@ int __init main(int argc, char **argv, char **envp) /* * This signal stuff used to be in the reboot case. However, - * sometimes a SIGVTALRM can come in when we're halting (reproducably + * sometimes a timer signal can come in when we're halting (reproducably * when writing out gcov information, presumably because that takes * some time) and cause a segfault. */ - /* stop timers and set SIGVTALRM to be ignored */ - disable_timer(); + /* stop timers and set timer signal to be ignored */ + os_timer_disable(0); /* disable SIGIO for the fds and set SIGIO to be ignored */ err = deactivate_all_fds(); if (err) - printf("deactivate_all_fds failed, errno = %d\n", -err); + os_warn("deactivate_all_fds failed, errno = %d\n", -err); /* * Let any pending signals fire now. This ensures @@ -180,18 +181,23 @@ int __init main(int argc, char **argv, char **envp) */ unblock_signals(); + os_info("\n"); /* Reboot */ if (ret) { - printf("\n"); execvp(new_argv[0], new_argv); perror("Failed to exec kernel"); ret = 1; } - printf("\n"); return uml_exitcode; } extern void *__real_malloc(int); +extern void __real_free(void *); + +/* workaround for -Wmissing-prototypes warnings */ +void *__wrap_malloc(int size); +void *__wrap_calloc(int n, int size); +void __wrap_free(void *ptr); void *__wrap_malloc(int size) { @@ -224,10 +230,6 @@ void *__wrap_calloc(int n, int size) return ptr; } -extern void __real_free(void *); - -extern unsigned long high_physmem; - void __wrap_free(void *ptr) { unsigned long addr = (unsigned long) ptr; diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index ba4398056fe9..72f302f4d197 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -12,189 +12,164 @@ #include <string.h> #include <sys/stat.h> #include <sys/mman.h> -#include <sys/param.h> +#include <sys/vfs.h> +#include <linux/magic.h> #include <init.h> +#include <kern_util.h> #include <os.h> - -/* Modified by which_tmpdir, which is called during early boot */ -static char *default_tmpdir = "/tmp"; +#include "internal.h" /* - * Modified when creating the physical memory file and when checking - * the tmp filesystem for usability, both happening during early boot. + * kasan_map_memory - maps memory from @start with a size of @len. + * The allocated memory is filled with zeroes upon success. + * @start: the start address of the memory to be mapped + * @len: the length of the memory to be mapped + * + * This function is used to map shadow memory for KASAN in uml */ -static char *tempdir = NULL; - -static void __init find_tempdir(void) +void kasan_map_memory(void *start, size_t len) { - const char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL }; - int i; - char *dir = NULL; + if (mmap(start, + len, + PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE|MAP_NORESERVE, + -1, + 0) == MAP_FAILED) { + os_info("Couldn't allocate shadow memory: %s\n.", + strerror(errno)); + exit(1); + } - if (tempdir != NULL) - /* We've already been called */ - return; - for (i = 0; dirs[i]; i++) { - dir = getenv(dirs[i]); - if ((dir != NULL) && (*dir != '\0')) - break; + if (madvise(start, len, MADV_DONTDUMP)) { + os_info("Couldn't set MAD_DONTDUMP on shadow memory: %s\n.", + strerror(errno)); + exit(1); } - if ((dir == NULL) || (*dir == '\0')) - dir = default_tmpdir; - tempdir = malloc(strlen(dir) + 2); - if (tempdir == NULL) { - fprintf(stderr, "Failed to malloc tempdir, " - "errno = %d\n", errno); - return; + if (madvise(start, len, MADV_DONTFORK)) { + os_info("Couldn't set MADV_DONTFORK on shadow memory: %s\n.", + strerror(errno)); + exit(1); } - strcpy(tempdir, dir); - strcat(tempdir, "/"); } -/* - * This will return 1, with the first character in buf being the - * character following the next instance of c in the file. This will - * read the file as needed. If there's an error, -errno is returned; - * if the end of the file is reached, 0 is returned. - */ -static int next(int fd, char *buf, size_t size, char c) -{ - ssize_t n; - size_t len; - char *ptr; +/* Set by make_tempfile() during early boot. */ +char *tempdir = NULL; - while ((ptr = strchr(buf, c)) == NULL) { - n = read(fd, buf, size - 1); - if (n == 0) - return 0; - else if (n < 0) - return -errno; - - buf[n] = '\0'; +/* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error. */ +static int __init check_tmpfs(const char *dir) +{ + struct statfs st; + + os_info("Checking if %s is on tmpfs...", dir); + if (statfs(dir, &st) < 0) { + os_info("%s\n", strerror(errno)); + } else if (st.f_type != TMPFS_MAGIC) { + os_info("no\n"); + } else { + os_info("OK\n"); + return 0; } - - ptr++; - len = strlen(ptr); - memmove(buf, ptr, len + 1); - - /* - * Refill the buffer so that if there's a partial string that we care - * about, it will be completed, and we can recognize it. - */ - n = read(fd, &buf[len], size - len - 1); - if (n < 0) - return -errno; - - buf[len + n] = '\0'; - return 1; + return -1; } -/* which_tmpdir is called only during early boot */ -static int checked_tmpdir = 0; - /* - * Look for a tmpfs mounted at /dev/shm. I couldn't find a cleaner - * way to do this than to parse /proc/mounts. statfs will return the - * same filesystem magic number and fs id for both /dev and /dev/shm - * when they are both tmpfs, so you can't tell if they are different - * filesystems. Also, there seems to be no other way of finding the - * mount point of a filesystem from within it. - * - * If a /dev/shm tmpfs entry is found, then we switch to using it. - * Otherwise, we stay with the default /tmp. + * Choose the tempdir to use. We want something on tmpfs so that our memory is + * not subject to the host's vm.dirty_ratio. If a tempdir is specified in the + * environment, we use that even if it's not on tmpfs, but we warn the user. + * Otherwise, we try common tmpfs locations, and if no tmpfs directory is found + * then we fall back to /tmp. */ -static void which_tmpdir(void) +static char * __init choose_tempdir(void) { - int fd, found; - char buf[128] = { '\0' }; - - if (checked_tmpdir) - return; - - checked_tmpdir = 1; - - printf("Checking for tmpfs mount on /dev/shm..."); - - fd = open("/proc/mounts", O_RDONLY); - if (fd < 0) { - printf("failed to open /proc/mounts, errno = %d\n", errno); - return; + static const char * const vars[] = { + "TMPDIR", + "TMP", + "TEMP", + NULL + }; + static const char fallback_dir[] = "/tmp"; + static const char * const tmpfs_dirs[] = { + "/dev/shm", + fallback_dir, + NULL + }; + int i; + const char *dir; + + os_info("Checking environment variables for a tempdir..."); + for (i = 0; vars[i]; i++) { + dir = getenv(vars[i]); + if ((dir != NULL) && (*dir != '\0')) { + os_info("%s\n", dir); + if (check_tmpfs(dir) >= 0) + goto done; + else + goto warn; + } } + os_info("none found\n"); - while (1) { - found = next(fd, buf, ARRAY_SIZE(buf), ' '); - if (found != 1) - break; - - if (!strncmp(buf, "/dev/shm", strlen("/dev/shm"))) - goto found; - - found = next(fd, buf, ARRAY_SIZE(buf), '\n'); - if (found != 1) - break; + for (i = 0; tmpfs_dirs[i]; i++) { + dir = tmpfs_dirs[i]; + if (check_tmpfs(dir) >= 0) + goto done; } -err: - if (found == 0) - printf("nothing mounted on /dev/shm\n"); - else if (found < 0) - printf("read returned errno %d\n", -found); - -out: - close(fd); - - return; - -found: - found = next(fd, buf, ARRAY_SIZE(buf), ' '); - if (found != 1) - goto err; - - if (strncmp(buf, "tmpfs", strlen("tmpfs"))) { - printf("not tmpfs\n"); - goto out; - } - - printf("OK\n"); - default_tmpdir = "/dev/shm"; - goto out; + dir = fallback_dir; +warn: + os_warn("Warning: tempdir %s is not on tmpfs\n", dir); +done: + /* Make a copy since getenv results may not remain valid forever. */ + return strdup(dir); } -static int __init make_tempfile(const char *template, char **out_tempname, - int do_unlink) +/* + * Create an unlinked tempfile in a suitable tempdir. template must be the + * basename part of the template with a leading '/'. + */ +static int __init make_tempfile(const char *template) { char *tempname; int fd; - which_tmpdir(); - tempname = malloc(MAXPATHLEN); + if (tempdir == NULL) { + tempdir = choose_tempdir(); + if (tempdir == NULL) { + os_warn("Failed to choose tempdir: %s\n", + strerror(errno)); + return -1; + } + } + +#ifdef O_TMPFILE + fd = open(tempdir, O_CLOEXEC | O_RDWR | O_EXCL | O_TMPFILE, 0700); + /* + * If the running system does not support O_TMPFILE flag then retry + * without it. + */ + if (fd != -1 || (errno != EINVAL && errno != EISDIR && + errno != EOPNOTSUPP)) + return fd; +#endif + + tempname = malloc(strlen(tempdir) + strlen(template) + 1); if (tempname == NULL) return -1; - find_tempdir(); - if ((tempdir == NULL) || (strlen(tempdir) >= MAXPATHLEN)) - goto out; - - if (template[0] != '/') - strcpy(tempname, tempdir); - else - tempname[0] = '\0'; - strncat(tempname, template, MAXPATHLEN-1-strlen(tempname)); + strcpy(tempname, tempdir); + strcat(tempname, template); fd = mkstemp(tempname); if (fd < 0) { - fprintf(stderr, "open - cannot create %s: %s\n", tempname, + os_warn("open - cannot create %s: %s\n", tempname, strerror(errno)); goto out; } - if (do_unlink && (unlink(tempname) < 0)) { + if (unlink(tempname) < 0) { perror("unlink"); goto close; } - if (out_tempname) { - *out_tempname = tempname; - } else - free(tempname); + free(tempname); return fd; close: close(fd); @@ -203,23 +178,17 @@ out: return -1; } -#define TEMPNAME_TEMPLATE "vm_file-XXXXXX" +#define TEMPNAME_TEMPLATE "/vm_file-XXXXXX" static int __init create_tmp_file(unsigned long long len) { int fd, err; char zero; - fd = make_tempfile(TEMPNAME_TEMPLATE, NULL, 1); + fd = make_tempfile(TEMPNAME_TEMPLATE); if (fd < 0) exit(1); - err = fchmod(fd, 0777); - if (err < 0) { - perror("fchmod"); - exit(1); - } - /* * Seek to len - 1 because writing a character there will * increase the file size by one byte, to the desired length. @@ -254,7 +223,6 @@ int __init create_mem_file(unsigned long long len) return fd; } - void __init check_tmpexec(void) { void *addr; @@ -262,17 +230,16 @@ void __init check_tmpexec(void) addr = mmap(NULL, UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, fd, 0); - printf("Checking PROT_EXEC mmap in %s...",tempdir); - fflush(stdout); + os_info("Checking PROT_EXEC mmap in %s...", tempdir); if (addr == MAP_FAILED) { err = errno; - perror("failed"); + os_warn("%s\n", strerror(err)); close(fd); if (err == EPERM) - printf("%s must be not mounted noexec\n",tempdir); + os_warn("%s must be not mounted noexec\n", tempdir); exit(1); } - printf("OK\n"); + os_info("OK\n"); munmap(addr, UM_KERN_PAGE_SIZE); close(fd); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index b8f34c9e53ae..3a2a84ab9325 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -1,119 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> +#include <stdlib.h> #include <unistd.h> #include <errno.h> #include <signal.h> #include <fcntl.h> +#include <limits.h> +#include <linux/futex.h> #include <sys/mman.h> #include <sys/ptrace.h> +#include <sys/prctl.h> #include <sys/wait.h> #include <asm/unistd.h> #include <init.h> #include <longjmp.h> #include <os.h> -#include <skas_ptrace.h> +#include <skas/skas.h> -#define ARBITRARY_ADDR -1 -#define FAILURE_PID -1 - -#define STAT_PATH_LEN sizeof("/proc/#######/stat\0") -#define COMM_SCANF "%*[^)])" - -unsigned long os_process_pc(int pid) +void os_alarm_process(int pid) { - char proc_stat[STAT_PATH_LEN], buf[256]; - unsigned long pc = ARBITRARY_ADDR; - int fd, err; + if (pid <= 0) + return; - sprintf(proc_stat, "/proc/%d/stat", pid); - fd = open(proc_stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't open '%s', " - "errno = %d\n", proc_stat, errno); - goto out; - } - CATCH_EINTR(err = read(fd, buf, sizeof(buf))); - if (err < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't read '%s', " - "err = %d\n", proc_stat, errno); - goto out_close; - } - os_close_file(fd); - pc = ARBITRARY_ADDR; - if (sscanf(buf, "%*d " COMM_SCANF " %*c %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %lu", &pc) != 1) - printk(UM_KERN_ERR "os_process_pc - couldn't find pc in '%s'\n", - buf); - out_close: - close(fd); - out: - return pc; + kill(pid, SIGALRM); } -int os_process_parent(int pid) +void os_kill_process(int pid, int reap_child) { - char stat[STAT_PATH_LEN]; - char data[256]; - int parent = FAILURE_PID, n, fd; - - if (pid == -1) - return parent; - - snprintf(stat, sizeof(stat), "/proc/%d/stat", pid); - fd = open(stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "Couldn't open '%s', errno = %d\n", stat, - errno); - return parent; - } + if (pid <= 0) + return; - CATCH_EINTR(n = read(fd, data, sizeof(data))); - close(fd); - - if (n < 0) { - printk(UM_KERN_ERR "Couldn't read '%s', errno = %d\n", stat, - errno); - return parent; - } - - parent = FAILURE_PID; - n = sscanf(data, "%*d " COMM_SCANF " %*c %d", &parent); - if (n != 1) - printk(UM_KERN_ERR "Failed to scan '%s'\n", data); + /* Block signals until child is reaped */ + block_signals(); - return parent; -} - -void os_stop_process(int pid) -{ - kill(pid, SIGSTOP); -} - -void os_kill_process(int pid, int reap_child) -{ kill(pid, SIGKILL); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); -} - -/* This is here uniquely to have access to the userspace errno, i.e. the one - * used by ptrace in case of error. - */ - -long os_ptrace_ldt(long pid, long addr, long data) -{ - int ret; - - ret = ptrace(PTRACE_LDT, pid, addr, data); - if (ret < 0) - return -errno; - return ret; + unblock_signals(); } /* Kill off a ptraced child by all means available. kill it normally first, @@ -123,11 +52,27 @@ long os_ptrace_ldt(long pid, long addr, long data) void os_kill_ptraced_process(int pid, int reap_child) { + if (pid <= 0) + return; + + /* Block signals until child is reaped */ + block_signals(); + kill(pid, SIGKILL); ptrace(PTRACE_KILL, pid); ptrace(PTRACE_CONT, pid); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); + + unblock_signals(); +} + +pid_t os_reap_child(void) +{ + int status; + + /* Try to reap a child */ + return waitpid(-1, &status, WNOHANG); } /* Don't use the glibc version, which caches the result in TLS. It misses some @@ -139,11 +84,6 @@ int os_getpid(void) return syscall(__NR_getpid); } -int os_getpgrp(void) -{ - return getpgrp(); -} - int os_map_memory(void *virt, int fd, unsigned long long off, unsigned long len, int r, int w, int x) { @@ -241,6 +181,31 @@ void init_new_thread_signals(void) set_handler(SIGBUS); signal(SIGHUP, SIG_IGN); set_handler(SIGIO); + /* We (currently) only use the child reaper IRQ in seccomp mode */ + if (using_seccomp) + set_handler(SIGCHLD); signal(SIGWINCH, SIG_IGN); - signal(SIGTERM, SIG_DFL); +} + +void os_set_pdeathsig(void) +{ + prctl(PR_SET_PDEATHSIG, SIGKILL); +} + +int os_futex_wait(void *uaddr, unsigned int val) +{ + int r; + + CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAIT, val, + NULL, NULL, 0)); + return r < 0 ? -errno : r; +} + +int os_futex_wake(void *uaddr) +{ + int r; + + CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAKE, INT_MAX, + NULL, NULL, 0)); + return r < 0 ? -errno : r; } diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c index 2ff8d4fe83c4..bfba2cbc9478 100644 --- a/arch/um/os-Linux/registers.c +++ b/arch/um/os-Linux/registers.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2004 PathScale, Inc * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <errno.h> @@ -10,33 +10,14 @@ #include <sysdep/ptrace.h> #include <sysdep/ptrace_user.h> #include <registers.h> - -int save_registers(int pid, struct uml_pt_regs *regs) -{ - int err; - - err = ptrace(PTRACE_GETREGS, pid, 0, regs->gp); - if (err < 0) - return -errno; - return 0; -} - -int restore_registers(int pid, struct uml_pt_regs *regs) -{ - int err; - - err = ptrace(PTRACE_SETREGS, pid, 0, regs->gp); - if (err < 0) - return -errno; - return 0; -} +#include <stdlib.h> /* This is set once at boot time and not changed thereafter */ -static unsigned long exec_regs[MAX_REG_NR]; -static unsigned long exec_fp_regs[FP_SIZE]; +unsigned long exec_regs[MAX_REG_NR]; +unsigned long *exec_fp_regs; -int init_registers(int pid) +int init_pid_registers(int pid) { int err; @@ -44,7 +25,11 @@ int init_registers(int pid) if (err < 0) return -errno; - arch_init_registers(pid); + err = arch_init_registers(pid); + if (err < 0) + return err; + + exec_fp_regs = malloc(host_fp_size); get_fp_registers(pid, exec_fp_regs); return 0; } @@ -54,5 +39,5 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs) memcpy(regs, exec_regs, sizeof(exec_regs)); if (fp_regs) - memcpy(fp_regs, exec_fp_regs, sizeof(exec_fp_regs)); + memcpy(fp_regs, exec_fp_regs, host_fp_size); } diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 8b61cc0e82c8..6de145f8fe3d 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <unistd.h> @@ -11,6 +11,8 @@ #include <sched.h> #include <signal.h> #include <string.h> +#include <sys/epoll.h> +#include <asm/unistd.h> #include <kern_util.h> #include <init.h> #include <os.h> @@ -21,366 +23,136 @@ * Protected by sigio_lock(), also used by sigio_cleanup, which is an * exitcall. */ -static int write_sigio_pid = -1; -static unsigned long write_sigio_stack; +static struct os_helper_thread *write_sigio_td; -/* - * These arrays are initialized before the sigio thread is started, and - * the descriptors closed after it is killed. So, it can't see them change. - * On the UML side, they are changed under the sigio_lock. - */ -#define SIGIO_FDS_INIT {-1, -1} - -static int write_sigio_fds[2] = SIGIO_FDS_INIT; -static int sigio_private[2] = SIGIO_FDS_INIT; +static int epollfd = -1; -struct pollfds { - struct pollfd *poll; - int size; - int used; -}; +#define MAX_EPOLL_EVENTS 64 -/* - * Protected by sigio_lock(). Used by the sigio thread, but the UML thread - * synchronizes with it. - */ -static struct pollfds current_poll; -static struct pollfds next_poll; -static struct pollfds all_sigio_fds; +static struct epoll_event epoll_events[MAX_EPOLL_EVENTS]; -static int write_sigio_thread(void *unused) +static void *write_sigio_thread(void *unused) { - struct pollfds *fds, tmp; - struct pollfd *p; - int i, n, respond_fd; - char c; + int pid = getpid(); + int r; + + os_fix_helper_thread_signals(); - signal(SIGWINCH, SIG_IGN); - fds = ¤t_poll; while (1) { - n = poll(fds->poll, fds->used, -1); - if (n < 0) { + r = epoll_wait(epollfd, epoll_events, MAX_EPOLL_EVENTS, -1); + if (r < 0) { if (errno == EINTR) continue; - printk(UM_KERN_ERR "write_sigio_thread : poll returned " - "%d, errno = %d\n", n, errno); - } - for (i = 0; i < fds->used; i++) { - p = &fds->poll[i]; - if (p->revents == 0) - continue; - if (p->fd == sigio_private[1]) { - CATCH_EINTR(n = read(sigio_private[1], &c, - sizeof(c))); - if (n != sizeof(c)) - printk(UM_KERN_ERR - "write_sigio_thread : " - "read on socket failed, " - "err = %d\n", errno); - tmp = current_poll; - current_poll = next_poll; - next_poll = tmp; - respond_fd = sigio_private[1]; - } - else { - respond_fd = write_sigio_fds[1]; - fds->used--; - memmove(&fds->poll[i], &fds->poll[i + 1], - (fds->used - i) * sizeof(*fds->poll)); - } - - CATCH_EINTR(n = write(respond_fd, &c, sizeof(c))); - if (n != sizeof(c)) - printk(UM_KERN_ERR "write_sigio_thread : " - "write on socket failed, err = %d\n", - errno); + printk(UM_KERN_ERR "%s: epoll_wait failed, errno = %d\n", + __func__, errno); } - } - - return 0; -} - -static int need_poll(struct pollfds *polls, int n) -{ - struct pollfd *new; - if (n <= polls->size) - return 0; - - new = uml_kmalloc(n * sizeof(struct pollfd), UM_GFP_ATOMIC); - if (new == NULL) { - printk(UM_KERN_ERR "need_poll : failed to allocate new " - "pollfds\n"); - return -ENOMEM; + CATCH_EINTR(r = syscall(__NR_tgkill, pid, pid, SIGIO)); + if (r < 0) + printk(UM_KERN_ERR "%s: tgkill failed, errno = %d\n", + __func__, errno); } - memcpy(new, polls->poll, polls->used * sizeof(struct pollfd)); - kfree(polls->poll); - - polls->poll = new; - polls->size = n; - return 0; + return NULL; } -/* - * Must be called with sigio_lock held, because it's needed by the marked - * critical section. - */ -static void update_thread(void) +int __add_sigio_fd(int fd) { - unsigned long flags; - int n; - char c; - - flags = set_signals(0); - CATCH_EINTR(n = write(sigio_private[0], &c, sizeof(c))); - if (n != sizeof(c)) { - printk(UM_KERN_ERR "update_thread : write failed, err = %d\n", - errno); - goto fail; - } - - CATCH_EINTR(n = read(sigio_private[0], &c, sizeof(c))); - if (n != sizeof(c)) { - printk(UM_KERN_ERR "update_thread : read failed, err = %d\n", - errno); - goto fail; - } - - set_signals(flags); - return; - fail: - /* Critical section start */ - if (write_sigio_pid != -1) { - os_kill_process(write_sigio_pid, 1); - free_stack(write_sigio_stack, 0); - } - write_sigio_pid = -1; - close(sigio_private[0]); - close(sigio_private[1]); - close(write_sigio_fds[0]); - close(write_sigio_fds[1]); - /* Critical section end */ - set_signals(flags); + struct epoll_event event = { + .data.fd = fd, + .events = EPOLLIN | EPOLLET, + }; + int r; + + CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event)); + return r < 0 ? -errno : 0; } int add_sigio_fd(int fd) { - struct pollfd *p; - int err = 0, i, n; + int err; sigio_lock(); - for (i = 0; i < all_sigio_fds.used; i++) { - if (all_sigio_fds.poll[i].fd == fd) - break; - } - if (i == all_sigio_fds.used) - goto out; - - p = &all_sigio_fds.poll[i]; + err = __add_sigio_fd(fd); + sigio_unlock(); - for (i = 0; i < current_poll.used; i++) { - if (current_poll.poll[i].fd == fd) - goto out; - } + return err; +} - n = current_poll.used; - err = need_poll(&next_poll, n + 1); - if (err) - goto out; +int __ignore_sigio_fd(int fd) +{ + struct epoll_event event; + int r; - memcpy(next_poll.poll, current_poll.poll, - current_poll.used * sizeof(struct pollfd)); - next_poll.poll[n] = *p; - next_poll.used = n + 1; - update_thread(); - out: - sigio_unlock(); - return err; + CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event)); + return r < 0 ? -errno : 0; } int ignore_sigio_fd(int fd) { - struct pollfd *p; - int err = 0, i, n = 0; - - /* - * This is called from exitcalls elsewhere in UML - if - * sigio_cleanup has already run, then update_thread will hang - * or fail because the thread is no longer running. - */ - if (write_sigio_pid == -1) - return -EIO; + int err; sigio_lock(); - for (i = 0; i < current_poll.used; i++) { - if (current_poll.poll[i].fd == fd) - break; - } - if (i == current_poll.used) - goto out; - - err = need_poll(&next_poll, current_poll.used - 1); - if (err) - goto out; - - for (i = 0; i < current_poll.used; i++) { - p = ¤t_poll.poll[i]; - if (p->fd != fd) - next_poll.poll[n++] = *p; - } - next_poll.used = current_poll.used - 1; - - update_thread(); - out: + err = __ignore_sigio_fd(fd); sigio_unlock(); - return err; -} - -static struct pollfd *setup_initial_poll(int fd) -{ - struct pollfd *p; - p = uml_kmalloc(sizeof(struct pollfd), UM_GFP_KERNEL); - if (p == NULL) { - printk(UM_KERN_ERR "setup_initial_poll : failed to allocate " - "poll\n"); - return NULL; - } - *p = ((struct pollfd) { .fd = fd, - .events = POLLIN, - .revents = 0 }); - return p; + return err; } static void write_sigio_workaround(void) { - struct pollfd *p; int err; - int l_write_sigio_fds[2]; - int l_sigio_private[2]; - int l_write_sigio_pid; - /* We call this *tons* of times - and most ones we must just fail. */ sigio_lock(); - l_write_sigio_pid = write_sigio_pid; - sigio_unlock(); - - if (l_write_sigio_pid != -1) - return; + if (write_sigio_td) + goto out; - err = os_pipe(l_write_sigio_fds, 1, 1); - if (err < 0) { - printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 1 failed, " - "err = %d\n", -err); - return; + epollfd = epoll_create(MAX_EPOLL_EVENTS); + if (epollfd < 0) { + printk(UM_KERN_ERR "%s: epoll_create failed, errno = %d\n", + __func__, errno); + goto out; } - err = os_pipe(l_sigio_private, 1, 1); + + err = os_run_helper_thread(&write_sigio_td, write_sigio_thread, NULL); if (err < 0) { - printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 2 failed, " - "err = %d\n", -err); - goto out_close1; + printk(UM_KERN_ERR "%s: os_run_helper_thread failed, errno = %d\n", + __func__, -err); + close(epollfd); + epollfd = -1; + goto out; } - p = setup_initial_poll(l_sigio_private[1]); - if (!p) - goto out_close2; - - sigio_lock(); - - /* - * Did we race? Don't try to optimize this, please, it's not so likely - * to happen, and no more than once at the boot. - */ - if (write_sigio_pid != -1) - goto out_free; - - current_poll = ((struct pollfds) { .poll = p, - .used = 1, - .size = 1 }); - - if (write_sigio_irq(l_write_sigio_fds[0])) - goto out_clear_poll; - - memcpy(write_sigio_fds, l_write_sigio_fds, sizeof(l_write_sigio_fds)); - memcpy(sigio_private, l_sigio_private, sizeof(l_sigio_private)); - - write_sigio_pid = run_helper_thread(write_sigio_thread, NULL, - CLONE_FILES | CLONE_VM, - &write_sigio_stack); - - if (write_sigio_pid < 0) - goto out_clear; - - sigio_unlock(); - return; - -out_clear: - write_sigio_pid = -1; - write_sigio_fds[0] = -1; - write_sigio_fds[1] = -1; - sigio_private[0] = -1; - sigio_private[1] = -1; -out_clear_poll: - current_poll = ((struct pollfds) { .poll = NULL, - .size = 0, - .used = 0 }); -out_free: +out: sigio_unlock(); - kfree(p); -out_close2: - close(l_sigio_private[0]); - close(l_sigio_private[1]); -out_close1: - close(l_write_sigio_fds[0]); - close(l_write_sigio_fds[1]); } -void sigio_broken(int fd, int read) +void sigio_broken(void) { - int err; - write_sigio_workaround(); - - sigio_lock(); - err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1); - if (err) { - printk(UM_KERN_ERR "maybe_sigio_broken - failed to add pollfd " - "for descriptor %d\n", fd); - goto out; - } - - all_sigio_fds.poll[all_sigio_fds.used++] = - ((struct pollfd) { .fd = fd, - .events = read ? POLLIN : POLLOUT, - .revents = 0 }); -out: - sigio_unlock(); } /* Changed during early boot */ static int pty_output_sigio; -static int pty_close_sigio; -void maybe_sigio_broken(int fd, int read) +void maybe_sigio_broken(int fd) { if (!isatty(fd)) return; - if ((read || pty_output_sigio) && (!read || pty_close_sigio)) + if (pty_output_sigio) return; - sigio_broken(fd, read); + sigio_broken(); } static void sigio_cleanup(void) { - if (write_sigio_pid == -1) + if (!write_sigio_td) return; - os_kill_process(write_sigio_pid, 1); - free_stack(write_sigio_stack, 0); - write_sigio_pid = -1; + os_kill_helper_thread(write_sigio_td); + write_sigio_td = NULL; } __uml_exitcall(sigio_cleanup); @@ -514,19 +286,6 @@ static void tty_output(int master, int slave) printk(UM_KERN_CONT "tty_output : read failed, err = %d\n", n); } -static void tty_close(int master, int slave) -{ - printk(UM_KERN_INFO "Checking that host ptys support SIGIO on " - "close..."); - - close(slave); - if (got_sigio) { - printk(UM_KERN_CONT "Yes\n"); - pty_close_sigio = 1; - } else - printk(UM_KERN_CONT "No, enabling workaround\n"); -} - static void __init check_sigio(void) { if ((access("/dev/ptmx", R_OK) < 0) && @@ -536,7 +295,6 @@ static void __init check_sigio(void) return; } check_one_sigio(tty_output); - check_one_sigio(tty_close); } /* Here because it only does the SIGIO testing for now */ diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 9d9f1b4bf826..327fb3c52fc7 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -1,31 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2004 PathScale, Inc * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> #include <stdarg.h> +#include <stdbool.h> #include <errno.h> #include <signal.h> +#include <string.h> #include <strings.h> #include <as-layout.h> #include <kern_util.h> #include <os.h> #include <sysdep/mcontext.h> +#include <um_malloc.h> +#include <sys/ucontext.h> +#include <timetravel.h> #include "internal.h" -void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { +void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = { [SIGTRAP] = relay_signal, [SIGFPE] = relay_signal, [SIGILL] = relay_signal, [SIGWINCH] = winch, - [SIGBUS] = bus_handler, + [SIGBUS] = relay_signal, [SIGSEGV] = segv_handler, [SIGIO] = sigio_handler, - [SIGVTALRM] = timer_handler }; + [SIGCHLD] = sigchld_handler, +}; -static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc) +static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) { struct uml_pt_regs r; int save_errno = errno; @@ -38,10 +46,10 @@ static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc) } /* enable signals if sig isn't IRQ signal */ - if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM)) - unblock_signals(); + if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGCHLD)) + unblock_signals_trace(); - (*sig_info[sig])(sig, si, &r); + (*sig_info[sig])(sig, si, &r, mc); errno = save_errno; } @@ -55,72 +63,131 @@ static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc) #define SIGIO_BIT 0 #define SIGIO_MASK (1 << SIGIO_BIT) -#define SIGVTALRM_BIT 1 -#define SIGVTALRM_MASK (1 << SIGVTALRM_BIT) +#define SIGALRM_BIT 1 +#define SIGALRM_MASK (1 << SIGALRM_BIT) -static int signals_enabled; -static unsigned int signals_pending; +#define SIGCHLD_BIT 2 +#define SIGCHLD_MASK (1 << SIGCHLD_BIT) -void sig_handler(int sig, siginfo_t *si, mcontext_t *mc) +__thread int signals_enabled; +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) +static int signals_blocked, signals_blocked_pending; +#endif +static __thread unsigned int signals_pending; +static __thread unsigned int signals_active; + +static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) { - int enabled; + int enabled = signals_enabled; + +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) + if ((signals_blocked || + __atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) && + (sig == SIGIO)) { + /* increment so unblock will do another round */ + __atomic_add_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST); + return; + } +#endif - enabled = signals_enabled; if (!enabled && (sig == SIGIO)) { - signals_pending |= SIGIO_MASK; + /* + * In TT_MODE_EXTERNAL, need to still call time-travel + * handlers. This will mark signals_pending by itself + * (only if necessary.) + * Note we won't get here if signals are hard-blocked + * (which is handled above), in that case the hard- + * unblock will handle things. + */ + if (time_travel_mode == TT_MODE_EXTERNAL) + sigio_run_timetravel_handlers(); + else + signals_pending |= SIGIO_MASK; return; } - block_signals(); + if (!enabled && (sig == SIGCHLD)) { + signals_pending |= SIGCHLD_MASK; + return; + } + + block_signals_trace(); sig_handler_common(sig, si, mc); - set_signals(enabled); + um_set_signals_trace(enabled); } -static void real_alarm_handler(mcontext_t *mc) +static void timer_real_alarm_handler(mcontext_t *mc) { struct uml_pt_regs regs; if (mc != NULL) get_regs_from_mc(®s, mc); - regs.is_user = 0; - unblock_signals(); - timer_handler(SIGVTALRM, NULL, ®s); + else + memset(®s, 0, sizeof(regs)); + timer_handler(SIGALRM, NULL, ®s); } -void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) +static void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) { int enabled; enabled = signals_enabled; if (!signals_enabled) { - signals_pending |= SIGVTALRM_MASK; + signals_pending |= SIGALRM_MASK; return; } - block_signals(); + block_signals_trace(); + + signals_active |= SIGALRM_MASK; + + timer_real_alarm_handler(mc); - real_alarm_handler(mc); - set_signals(enabled); + signals_active &= ~SIGALRM_MASK; + + um_set_signals_trace(enabled); +} + +void deliver_alarm(void) { + timer_alarm_handler(SIGALRM, NULL, NULL); } -void timer_init(void) +void timer_set_signal_handler(void) { - set_handler(SIGVTALRM); + set_handler(SIGALRM); +} + +int timer_alarm_pending(void) +{ + return !!(signals_pending & SIGALRM_MASK); } void set_sigstack(void *sig_stack, int size) { - stack_t stack = ((stack_t) { .ss_flags = 0, - .ss_sp = (__ptr_t) sig_stack, - .ss_size = size - sizeof(void *) }); + stack_t stack = { + .ss_flags = 0, + .ss_sp = sig_stack, + .ss_size = size + }; if (sigaltstack(&stack, NULL) != 0) panic("enabling signal stack failed, errno = %d\n", errno); } -static void (*handlers[_NSIG])(int sig, siginfo_t *si, mcontext_t *mc) = { +static void sigusr1_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) +{ + uml_pm_wake(); +} + +void register_pm_wake_signal(void) +{ + set_handler(SIGUSR1); +} + +static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGSEGV] = sig_handler, [SIGBUS] = sig_handler, [SIGILL] = sig_handler, @@ -129,51 +196,19 @@ static void (*handlers[_NSIG])(int sig, siginfo_t *si, mcontext_t *mc) = { [SIGIO] = sig_handler, [SIGWINCH] = sig_handler, - [SIGVTALRM] = alarm_handler -}; + /* SIGCHLD is only actually registered in seccomp mode. */ + [SIGCHLD] = sig_handler, + [SIGALRM] = timer_alarm_handler, + [SIGUSR1] = sigusr1_handler, +}; static void hard_handler(int sig, siginfo_t *si, void *p) { - struct ucontext *uc = p; + ucontext_t *uc = p; mcontext_t *mc = &uc->uc_mcontext; - unsigned long pending = 1UL << sig; - do { - int nested, bail; - - /* - * pending comes back with one bit set for each - * interrupt that arrived while setting up the stack, - * plus a bit for this interrupt, plus the zero bit is - * set if this is a nested interrupt. - * If bail is true, then we interrupted another - * handler setting up the stack. In this case, we - * have to return, and the upper handler will deal - * with this interrupt. - */ - bail = to_irq_stack(&pending); - if (bail) - return; - - nested = pending & 1; - pending &= ~1; - - while ((sig = ffs(pending)) != 0){ - sig--; - pending &= ~(1 << sig); - (*handlers[sig])(sig, si, mc); - } - - /* - * Again, pending comes back with a mask of signals - * that arrived while tearing down the stack. If this - * is non-zero, we just go back, set up the stack - * again, and handle the new interrupts. - */ - if (!nested) - pending = from_irq_stack(nested); - } while (pending); + (*handlers[sig])(sig, (struct siginfo *)si, mc); } void set_handler(int sig) @@ -186,9 +221,9 @@ void set_handler(int sig) /* block irq ones */ sigemptyset(&action.sa_mask); - sigaddset(&action.sa_mask, SIGVTALRM); sigaddset(&action.sa_mask, SIGIO); sigaddset(&action.sa_mask, SIGWINCH); + sigaddset(&action.sa_mask, SIGALRM); if (sig == SIGSEGV) flags |= SA_NODEFER; @@ -207,6 +242,11 @@ void set_handler(int sig) panic("sigprocmask failed - errno = %d\n", errno); } +void send_sigio_to_self(void) +{ + kill(os_getpid(), SIGIO); +} + int change_sig(int signal, int on) { sigset_t sigset; @@ -219,9 +259,29 @@ int change_sig(int signal, int on) return 0; } -void block_signals(void) +static inline void __block_signals(void) { + if (!signals_enabled) + return; + + os_local_ipi_disable(); + barrier(); signals_enabled = 0; +} + +static inline void __unblock_signals(void) +{ + if (signals_enabled) + return; + + signals_enabled = 1; + barrier(); + os_local_ipi_enable(); +} + +void block_signals(void) +{ + __block_signals(); /* * This must return with signals disabled, so this barrier * ensures that writes are flushed out before the return. @@ -238,6 +298,12 @@ void unblock_signals(void) if (signals_enabled == 1) return; + __unblock_signals(); + +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) + deliver_time_travel_irqs(); +#endif + /* * We loop because the IRQ handler returns with interrupts off. So, * interrupts may have arrived and we need to re-enable them and @@ -247,12 +313,9 @@ void unblock_signals(void) /* * Save and reset save_pending after enabling signals. This * way, signals_pending won't be changed while we're reading it. - */ - signals_enabled = 1; - - /* + * * Setting signals_enabled and reading signals_pending must - * happen in this order. + * happen in this order, so have the barrier here. */ barrier(); @@ -265,10 +328,13 @@ void unblock_signals(void) /* * We have pending interrupts, so disable signals, as the * handlers expect them off when they are called. They will - * be enabled again above. + * be enabled again above. We need to trace this, as we're + * expected to be enabling interrupts already, but any more + * tracing that happens inside the handlers we call for the + * pending signals will mess up the tracing state. */ - - signals_enabled = 0; + __block_signals(); + um_trace_signals_off(); /* * Deal with SIGIO first because the alarm handler might @@ -281,17 +347,34 @@ void unblock_signals(void) if (save_pending & SIGIO_MASK) sig_handler_common(SIGIO, NULL, NULL); - if (save_pending & SIGVTALRM_MASK) - real_alarm_handler(NULL); + if (save_pending & SIGCHLD_MASK) { + struct uml_pt_regs regs = {}; + + sigchld_handler(SIGCHLD, NULL, ®s, NULL); + } + + /* Do not reenter the handler */ + + if ((save_pending & SIGALRM_MASK) && (!(signals_active & SIGALRM_MASK))) + timer_real_alarm_handler(NULL); + + /* Rerun the loop only if there is still pending SIGIO and not in TIMER handler */ + + if (!(signals_pending & SIGIO_MASK) && (signals_active & SIGALRM_MASK)) + return; + + /* Re-enable signals and trace that we're doing so. */ + um_trace_signals_on(); + __unblock_signals(); } } -int get_signals(void) +int um_get_signals(void) { return signals_enabled; } -int set_signals(int enable) +int um_set_signals(int enable) { int ret; if (signals_enabled == enable) @@ -304,3 +387,117 @@ int set_signals(int enable) return ret; } + +int um_set_signals_trace(int enable) +{ + int ret; + if (signals_enabled == enable) + return enable; + + ret = signals_enabled; + if (enable) + unblock_signals_trace(); + else + block_signals_trace(); + + return ret; +} + +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) +void mark_sigio_pending(void) +{ + /* + * It would seem that this should be atomic so + * it isn't a read-modify-write with a signal + * that could happen in the middle, losing the + * value set by the signal. + * + * However, this function is only called when in + * time-travel=ext simulation mode, in which case + * the only signal ever pending is SIGIO, which + * is blocked while this can be called, and the + * timer signal (SIGALRM) cannot happen. + */ + signals_pending |= SIGIO_MASK; +} + +void block_signals_hard(void) +{ + signals_blocked++; + barrier(); +} + +void unblock_signals_hard(void) +{ + static bool unblocking; + + if (!signals_blocked) + panic("unblocking signals while not blocked"); + + if (--signals_blocked) + return; + /* + * Must be set to 0 before we check pending so the + * SIGIO handler will run as normal unless we're still + * going to process signals_blocked_pending. + */ + barrier(); + + /* + * Note that block_signals_hard()/unblock_signals_hard() can be called + * within the unblock_signals()/sigio_run_timetravel_handlers() below. + * This would still be prone to race conditions since it's actually a + * call _within_ e.g. vu_req_read_message(), where we observed this + * issue, which loops. Thus, if the inner call handles the recorded + * pending signals, we can get out of the inner call with the real + * signal hander no longer blocked, and still have a race. Thus don't + * handle unblocking in the inner call, if it happens, but only in + * the outermost call - 'unblocking' serves as an ownership for the + * signals_blocked_pending decrement. + */ + if (unblocking) + return; + unblocking = true; + + while (__atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) { + if (signals_enabled) { + /* signals are enabled so we can touch this */ + signals_pending |= SIGIO_MASK; + /* + * this is a bit inefficient, but that's + * not really important + */ + block_signals(); + unblock_signals(); + } else { + /* + * we need to run time-travel handlers even + * if not enabled + */ + sigio_run_timetravel_handlers(); + } + + /* + * The decrement of signals_blocked_pending must be atomic so + * that the signal handler will either happen before or after + * the decrement, not during a read-modify-write: + * - If it happens before, it can increment it and we'll + * decrement it and do another round in the loop. + * - If it happens after it'll see 0 for both signals_blocked + * and signals_blocked_pending and thus run the handler as + * usual (subject to signals_enabled, but that's unrelated.) + * + * Note that a call to unblock_signals_hard() within the calls + * to unblock_signals() or sigio_run_timetravel_handlers() above + * will do nothing due to the 'unblocking' state, so this cannot + * underflow as the only one decrementing will be the outermost + * one. + */ + if (__atomic_sub_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST) < 0) + panic("signals_blocked_pending underflow"); + } + + unblocking = false; +} +#endif diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile index d2ea3409e072..75f11989d2e9 100644 --- a/arch/um/os-Linux/skas/Makefile +++ b/arch/um/os-Linux/skas/Makefile @@ -1,10 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) -# Licensed under the GPL # obj-y := mem.o process.o USER_OBJS := $(obj-y) -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 689b18db798f..8b9921ac3ef8 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stddef.h> @@ -12,16 +13,47 @@ #include <as-layout.h> #include <mm_id.h> #include <os.h> -#include <proc_mm.h> #include <ptrace_user.h> #include <registers.h> #include <skas.h> #include <sysdep/ptrace.h> #include <sysdep/stub.h> +#include "../internal.h" -extern unsigned long batch_syscall_stub, __syscall_stub_start; +extern char __syscall_stub_start[]; -extern void wait_stub_done(int pid); +void syscall_stub_dump_error(struct mm_id *mm_idp) +{ + struct stub_data *proc_data = (void *)mm_idp->stack; + struct stub_syscall *sc; + + if (proc_data->syscall_data_len < 0 || + proc_data->syscall_data_len >= ARRAY_SIZE(proc_data->syscall_data)) + panic("Syscall data was corrupted by stub (len is: %d, expected maximum: %d)!", + proc_data->syscall_data_len, + mm_idp->syscall_data_len); + + sc = &proc_data->syscall_data[proc_data->syscall_data_len]; + + printk(UM_KERN_ERR "%s : length = %d, last offset = %d", + __func__, mm_idp->syscall_data_len, + proc_data->syscall_data_len); + printk(UM_KERN_ERR "%s : stub syscall type %d failed, return value = 0x%lx\n", + __func__, sc->syscall, proc_data->err); + + print_hex_dump(UM_KERN_ERR, " syscall data: ", 0, + 16, 4, sc, sizeof(*sc), 0); + + if (using_seccomp) { + printk(UM_KERN_ERR "%s: FD map num: %d", __func__, + mm_idp->syscall_fd_num); + print_hex_dump(UM_KERN_ERR, + " FD map: ", 0, 16, + sizeof(mm_idp->syscall_fd_map[0]), + mm_idp->syscall_fd_map, + sizeof(mm_idp->syscall_fd_map), 0); + } +} static inline unsigned long *check_init_stack(struct mm_id * mm_idp, unsigned long *stack) @@ -38,246 +70,215 @@ static unsigned long syscall_regs[MAX_REG_NR]; static int __init init_syscall_regs(void) { get_safe_registers(syscall_regs, NULL); + syscall_regs[REGS_IP_INDEX] = STUB_CODE + - ((unsigned long) &batch_syscall_stub - - (unsigned long) &__syscall_stub_start); + ((unsigned long) stub_syscall_handler - + (unsigned long) __syscall_stub_start); + syscall_regs[REGS_SP_INDEX] = STUB_DATA + + offsetof(struct stub_data, sigstack) + + sizeof(((struct stub_data *) 0)->sigstack) - + sizeof(void *); + return 0; } __initcall(init_syscall_regs); -extern int proc_mm; - -static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) +static inline long do_syscall_stub(struct mm_id *mm_idp) { + struct stub_data *proc_data = (void *)mm_idp->stack; int n, i; - long ret, offset; - unsigned long * data; - unsigned long * syscall; - int err, pid = mm_idp->u.pid; - - if (proc_mm) - /* FIXME: Need to look up userspace_pid by cpu */ - pid = userspace_pid[0]; - - n = ptrace_setregs(pid, syscall_regs); - if (n < 0) { - printk(UM_KERN_ERR "Registers - \n"); - for (i = 0; i < MAX_REG_NR; i++) - printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); - panic("do_syscall_stub : PTRACE_SETREGS failed, errno = %d\n", - -n); - } + int err, pid = mm_idp->pid; + + /* Inform process how much we have filled in. */ + proc_data->syscall_data_len = mm_idp->syscall_data_len; + + if (using_seccomp) { + proc_data->restart_wait = 1; + wait_stub_done_seccomp(mm_idp, 0, 1); + } else { + n = ptrace_setregs(pid, syscall_regs); + if (n < 0) { + printk(UM_KERN_ERR "Registers -\n"); + for (i = 0; i < MAX_REG_NR; i++) + printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); + panic("%s : PTRACE_SETREGS failed, errno = %d\n", + __func__, -n); + } - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) - panic("Failed to continue stub, pid = %d, errno = %d\n", pid, - errno); + err = ptrace(PTRACE_CONT, pid, 0, 0); + if (err) + panic("Failed to continue stub, pid = %d, errno = %d\n", + pid, errno); - wait_stub_done(pid); + wait_stub_done(pid); + } /* - * When the stub stops, we find the following values on the - * beginning of the stack: - * (long )return_value - * (long )offset to failed sycall-data (0, if no error) + * proc_data->err will be negative if there was an (unexpected) error. + * In that case, syscall_data_len points to the last executed syscall, + * otherwise it will be zero (but we do not need to rely on that). */ - ret = *((unsigned long *) mm_idp->stack); - offset = *((unsigned long *) mm_idp->stack + 1); - if (offset) { - data = (unsigned long *)(mm_idp->stack + offset - STUB_DATA); - printk(UM_KERN_ERR "do_syscall_stub : ret = %ld, offset = %ld, " - "data = %p\n", ret, offset, data); - syscall = (unsigned long *)((unsigned long)data + data[0]); - printk(UM_KERN_ERR "do_syscall_stub: syscall %ld failed, " - "return value = 0x%lx, expected return value = 0x%lx\n", - syscall[0], ret, syscall[7]); - printk(UM_KERN_ERR " syscall parameters: " - "0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - syscall[1], syscall[2], syscall[3], - syscall[4], syscall[5], syscall[6]); - for (n = 1; n < data[0]/sizeof(long); n++) { - if (n == 1) - printk(UM_KERN_ERR " additional syscall " - "data:"); - if (n % 4 == 1) - printk("\n" UM_KERN_ERR " "); - printk(" 0x%lx", data[n]); - } - if (n > 1) - printk("\n"); + if (proc_data->err < 0) { + syscall_stub_dump_error(mm_idp); + + /* Store error code in case someone tries to add more syscalls */ + mm_idp->syscall_data_len = proc_data->err; + } else { + mm_idp->syscall_data_len = 0; } - else ret = 0; - *addr = check_init_stack(mm_idp, NULL); + if (using_seccomp) + mm_idp->syscall_fd_num = 0; - return ret; + return mm_idp->syscall_data_len; } -long run_syscall_stub(struct mm_id * mm_idp, int syscall, - unsigned long *args, long expected, void **addr, - int done) +int syscall_stub_flush(struct mm_id *mm_idp) { - unsigned long *stack = check_init_stack(mm_idp, *addr); - - *stack += sizeof(long); - stack += *stack / sizeof(long); - - *stack++ = syscall; - *stack++ = args[0]; - *stack++ = args[1]; - *stack++ = args[2]; - *stack++ = args[3]; - *stack++ = args[4]; - *stack++ = args[5]; - *stack++ = expected; - *stack = 0; - - if (!done && ((((unsigned long) stack) & ~UM_KERN_PAGE_MASK) < - UM_KERN_PAGE_SIZE - 10 * sizeof(long))) { - *addr = stack; + int res; + + if (mm_idp->syscall_data_len == 0) return 0; + + /* If an error happened already, report it and reset the state. */ + if (mm_idp->syscall_data_len < 0) { + res = mm_idp->syscall_data_len; + mm_idp->syscall_data_len = 0; + return res; } - return do_syscall_stub(mm_idp, addr); + res = do_syscall_stub(mm_idp); + mm_idp->syscall_data_len = 0; + + return res; } -long syscall_stub_data(struct mm_id * mm_idp, - unsigned long *data, int data_count, - void **addr, void **stub_addr) +struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp) { - unsigned long *stack; - int ret = 0; - - /* - * If *addr still is uninitialized, it *must* contain NULL. - * Thus in this case do_syscall_stub correctly won't be called. - */ - if ((((unsigned long) *addr) & ~UM_KERN_PAGE_MASK) >= - UM_KERN_PAGE_SIZE - (10 + data_count) * sizeof(long)) { - ret = do_syscall_stub(mm_idp, addr); - /* in case of error, don't overwrite data on stack */ - if (ret) - return ret; + struct stub_syscall *sc; + struct stub_data *proc_data = (struct stub_data *) mm_idp->stack; + + if (mm_idp->syscall_data_len > 0 && + mm_idp->syscall_data_len == ARRAY_SIZE(proc_data->syscall_data)) + do_syscall_stub(mm_idp); + + if (mm_idp->syscall_data_len < 0) { + /* Return dummy to retain error state. */ + sc = &proc_data->syscall_data[0]; + } else { + sc = &proc_data->syscall_data[mm_idp->syscall_data_len]; + mm_idp->syscall_data_len += 1; } + memset(sc, 0, sizeof(*sc)); - stack = check_init_stack(mm_idp, *addr); - *addr = stack; + return sc; +} - *stack = data_count * sizeof(long); +static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp, + int syscall_type, + unsigned long virt) +{ + if (mm_idp->syscall_data_len > 0) { + struct stub_data *proc_data = (void *) mm_idp->stack; + struct stub_syscall *sc; - memcpy(stack + 1, data, data_count * sizeof(long)); + sc = &proc_data->syscall_data[mm_idp->syscall_data_len - 1]; - *stub_addr = (void *)(((unsigned long)(stack + 1) & - ~UM_KERN_PAGE_MASK) + STUB_DATA); + if (sc->syscall == syscall_type && + sc->mem.addr + sc->mem.length == virt) + return sc; + } - return 0; + return NULL; } -int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, int prot, - int phys_fd, unsigned long long offset, int done, void **data) +static int get_stub_fd(struct mm_id *mm_idp, int fd) { - int ret; - - if (proc_mm) { - struct proc_mm_op map; - int fd = mm_idp->u.mm_fd; - - map = ((struct proc_mm_op) { .op = MM_MMAP, - .u = - { .mmap = - { .addr = virt, - .len = len, - .prot = prot, - .flags = MAP_SHARED | - MAP_FIXED, - .fd = phys_fd, - .offset= offset - } } } ); - CATCH_EINTR(ret = write(fd, &map, sizeof(map))); - if (ret != sizeof(map)) { - ret = -errno; - printk(UM_KERN_ERR "map : /proc/mm map failed, " - "err = %d\n", -ret); + int i; + + /* Find an FD slot (or flush and use first) */ + if (!using_seccomp) + return fd; + + /* Already crashed, value does not matter */ + if (mm_idp->syscall_data_len < 0) + return 0; + + /* Find existing FD in map if we can allocate another syscall */ + if (mm_idp->syscall_data_len < + ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) { + for (i = 0; i < mm_idp->syscall_fd_num; i++) { + if (mm_idp->syscall_fd_map[i] == fd) + return i; } - else ret = 0; - } - else { - unsigned long args[] = { virt, len, prot, - MAP_SHARED | MAP_FIXED, phys_fd, - MMAP_OFFSET(offset) }; - ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt, - data, done); + if (mm_idp->syscall_fd_num < STUB_MAX_FDS) { + i = mm_idp->syscall_fd_num; + mm_idp->syscall_fd_map[i] = fd; + + mm_idp->syscall_fd_num++; + + return i; + } } - return ret; + /* FD map full or no syscall space available, continue after flush */ + do_syscall_stub(mm_idp); + mm_idp->syscall_fd_map[0] = fd; + mm_idp->syscall_fd_num = 1; + + return 0; } -int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - int done, void **data) +int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - int ret; - - if (proc_mm) { - struct proc_mm_op unmap; - int fd = mm_idp->u.mm_fd; - - unmap = ((struct proc_mm_op) { .op = MM_MUNMAP, - .u = - { .munmap = - { .addr = - (unsigned long) addr, - .len = len } } } ); - CATCH_EINTR(ret = write(fd, &unmap, sizeof(unmap))); - if (ret != sizeof(unmap)) { - ret = -errno; - printk(UM_KERN_ERR "unmap - proc_mm write returned " - "%d\n", ret); + struct stub_syscall *sc; + + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt); + if (sc && sc->mem.prot == prot && + sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) { + int prev_fd = sc->mem.fd; + + if (using_seccomp) + prev_fd = mm_idp->syscall_fd_map[sc->mem.fd]; + + if (phys_fd == prev_fd) { + sc->mem.length += len; + return 0; } - else ret = 0; } - else { - unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0, - 0 }; - ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0, - data, done); - } + phys_fd = get_stub_fd(mm_idp, phys_fd); - return ret; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MMAP; + sc->mem.addr = virt; + sc->mem.length = len; + sc->mem.prot = prot; + sc->mem.fd = phys_fd; + sc->mem.offset = MMAP_OFFSET(offset); + + return 0; } -int protect(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - unsigned int prot, int done, void **data) +int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) { - struct proc_mm_op protect; - int ret; - - if (proc_mm) { - int fd = mm_idp->u.mm_fd; - - protect = ((struct proc_mm_op) { .op = MM_MPROTECT, - .u = - { .mprotect = - { .addr = - (unsigned long) addr, - .len = len, - .prot = prot } } } ); - - CATCH_EINTR(ret = write(fd, &protect, sizeof(protect))); - if (ret != sizeof(protect)) { - ret = -errno; - printk(UM_KERN_ERR "protect failed, err = %d", -ret); - } - else ret = 0; - } - else { - unsigned long args[] = { addr, len, prot, 0, 0, 0 }; + struct stub_syscall *sc; - ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0, - data, done); + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MUNMAP, addr); + if (sc) { + sc->mem.length += len; + return 0; } - return ret; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MUNMAP; + sc->mem.addr = addr; + sc->mem.length = len; + + return 0; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 4625949bf1e4..d6c22f8aa06d 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -1,33 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> +#include <stdbool.h> #include <unistd.h> #include <sched.h> #include <errno.h> #include <string.h> +#include <fcntl.h> +#include <mem_user.h> #include <sys/mman.h> #include <sys/wait.h> +#include <sys/stat.h> +#include <sys/socket.h> #include <asm/unistd.h> #include <as-layout.h> #include <init.h> #include <kern_util.h> #include <mem.h> #include <os.h> -#include <proc_mm.h> #include <ptrace_user.h> #include <registers.h> #include <skas.h> -#include <skas_ptrace.h> #include <sysdep/stub.h> +#include <sysdep/mcontext.h> +#include <linux/futex.h> +#include <linux/threads.h> +#include <timetravel.h> +#include <asm-generic/rwonce.h> +#include "../internal.h" int is_skas_winch(int pid, int fd, void *data) { return pid == getpgrp(); } +static const char *ptrace_reg_name(int idx) +{ +#define R(n) case HOST_##n: return #n + + switch (idx) { +#ifdef __x86_64__ + R(BX); + R(CX); + R(DI); + R(SI); + R(DX); + R(BP); + R(AX); + R(R8); + R(R9); + R(R10); + R(R11); + R(R12); + R(R13); + R(R14); + R(R15); + R(ORIG_AX); + R(CS); + R(SS); + R(EFLAGS); +#elif defined(__i386__) + R(IP); + R(SP); + R(EFLAGS); + R(AX); + R(BX); + R(CX); + R(DX); + R(SI); + R(DI); + R(BP); + R(CS); + R(SS); + R(DS); + R(FS); + R(ES); + R(GS); + R(ORIG_AX); +#endif + } + return ""; +} + static int ptrace_dump_regs(int pid) { unsigned long regs[MAX_REG_NR]; @@ -37,8 +96,11 @@ static int ptrace_dump_regs(int pid) return -errno; printk(UM_KERN_ERR "Stub registers -\n"); - for (i = 0; i < ARRAY_SIZE(regs); i++) - printk(UM_KERN_ERR "\t%d - %lx\n", i, regs[i]); + for (i = 0; i < ARRAY_SIZE(regs); i++) { + const char *regname = ptrace_reg_name(i); + + printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); + } return 0; } @@ -47,7 +109,7 @@ static int ptrace_dump_regs(int pid) * Signals that are OK to receive in the stub - we'll just continue it. * SIGWINCH will happen when UML is inside a detached screen. */ -#define STUB_SIG_MASK ((1 << SIGVTALRM) | (1 << SIGWINCH)) +#define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) /* Signals that the stub will finish with - anything else is an error */ #define STUB_DONE_MASK (1 << SIGTRAP) @@ -66,8 +128,8 @@ void wait_stub_done(int pid) err = ptrace(PTRACE_CONT, pid, 0, 0); if (err) { - printk(UM_KERN_ERR "wait_stub_done : continue failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } } @@ -78,384 +140,647 @@ void wait_stub_done(int pid) bad_wait: err = ptrace_dump_regs(pid); if (err) - printk(UM_KERN_ERR "Failed to get registers from stub, " - "errno = %d\n", -err); - printk(UM_KERN_ERR "wait_stub_done : failed to wait for SIGTRAP, " - "pid = %d, n = %d, errno = %d, status = 0x%x\n", pid, n, errno, - status); + printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", + -err); + printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", + __func__, pid, n, errno, status); fatal_sigsegv(); } -extern unsigned long current_stub_stack(void); - -static void get_skas_faultinfo(int pid, struct faultinfo *fi) +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys) { - int err; + struct stub_data *data = (void *)mm_idp->stack; + int ret; - if (ptrace_faultinfo) { - err = ptrace(PTRACE_FAULTINFO, pid, 0, fi); - if (err) { - printk(UM_KERN_ERR "get_skas_faultinfo - " - "PTRACE_FAULTINFO failed, errno = %d\n", errno); - fatal_sigsegv(); + do { + const char byte = 0; + struct iovec iov = { + .iov_base = (void *)&byte, + .iov_len = sizeof(byte), + }; + union { + char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))]; + struct cmsghdr align; + } ctrl; + struct msghdr msgh = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + if (!running) { + if (mm_idp->syscall_fd_num) { + unsigned int fds_size = + sizeof(int) * mm_idp->syscall_fd_num; + struct cmsghdr *cmsg; + + msgh.msg_control = ctrl.data; + msgh.msg_controllen = CMSG_SPACE(fds_size); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fds_size); + memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map, + fds_size); + + CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock, + &msgh, 0)); + } + + data->signal = 0; + data->futex = FUTEX_IN_CHILD; + CATCH_EINTR(syscall(__NR_futex, &data->futex, + FUTEX_WAKE, 1, NULL, NULL, 0)); } - /* Special handling for i386, which has different structs */ - if (sizeof(struct ptrace_faultinfo) < sizeof(struct faultinfo)) - memset((char *)fi + sizeof(struct ptrace_faultinfo), 0, - sizeof(struct faultinfo) - - sizeof(struct ptrace_faultinfo)); - } - else { - unsigned long fpregs[FP_SIZE]; + do { + /* + * We need to check whether the child is still alive + * before and after the FUTEX_WAIT call. Before, in + * case it just died but we still updated data->futex + * to FUTEX_IN_CHILD. And after, in case it died while + * we were waiting (and SIGCHLD woke us up, see the + * IRQ handler in mmu.c). + * + * Either way, if PID is negative, then we have no + * choice but to kill the task. + */ + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + ret = syscall(__NR_futex, &data->futex, + FUTEX_WAIT, FUTEX_IN_CHILD, + NULL, NULL, 0); + if (ret < 0 && errno != EINTR && errno != EAGAIN) { + printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (data->futex == FUTEX_IN_CHILD); - err = get_fp_registers(pid, fpregs); - if (err < 0) { - printk(UM_KERN_ERR "save_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } - err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); - if (err) { - printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " - "errno = %d\n", pid, errno); - fatal_sigsegv(); - } - wait_stub_done(pid); + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; - /* - * faultinfo is prepared by the stub-segv-handler at start of - * the stub stack page. We just have to copy it. - */ - memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); + running = 0; - err = put_fp_registers(pid, fpregs); - if (err < 0) { - printk(UM_KERN_ERR "put_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } + /* We may receive a SIGALRM before SIGSYS, iterate again. */ + } while (wait_sigsys && data->signal == SIGALRM); + + if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) { + printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__); + goto out_kill; } + + if (wait_sigsys && data->signal != SIGSYS) { + printk(UM_KERN_ERR "%s : expected SIGSYS but got %d", + __func__, data->signal); + goto out_kill; + } + + return; + +out_kill: + printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n", + __func__, mm_idp->pid, errno); + /* This is not true inside start_userspace */ + if (current_mm_id() == mm_idp) + fatal_sigsegv(); } -static void handle_segv(int pid, struct uml_pt_regs * regs) +extern unsigned long current_stub_stack(void); + +static void get_skas_faultinfo(int pid, struct faultinfo *fi) { - get_skas_faultinfo(pid, ®s->faultinfo); - segv(regs->faultinfo, 0, 1, NULL); + int err; + + err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); + if (err) { + printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " + "errno = %d\n", pid, errno); + fatal_sigsegv(); + } + wait_stub_done(pid); + + /* + * faultinfo is prepared by the stub_segv_handler at start of + * the stub stack page. We just have to copy it. + */ + memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); } -/* - * To use the same value of using_sysemu as the caller, ask it that value - * (in local_using_sysemu - */ -static void handle_trap(int pid, struct uml_pt_regs *regs, - int local_using_sysemu) +static void handle_trap(struct uml_pt_regs *regs) { - int err, status; - if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) fatal_sigsegv(); - /* Mark this as a syscall */ - UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp); + handle_syscall(regs); +} - if (!local_using_sysemu) - { - err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, - __NR_getpid); - if (err < 0) { - printk(UM_KERN_ERR "handle_trap - nullifying syscall " - "failed, errno = %d\n", errno); - fatal_sigsegv(); - } +extern char __syscall_stub_start[]; - err = ptrace(PTRACE_SYSCALL, pid, 0, 0); - if (err < 0) { - printk(UM_KERN_ERR "handle_trap - continuing to end of " - "syscall failed, errno = %d\n", errno); - fatal_sigsegv(); - } +static int stub_exe_fd; - CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if ((err < 0) || !WIFSTOPPED(status) || - (WSTOPSIG(status) != SIGTRAP + 0x80)) { - err = ptrace_dump_regs(pid); - if (err) - printk(UM_KERN_ERR "Failed to get registers " - "from process, errno = %d\n", -err); - printk(UM_KERN_ERR "handle_trap - failed to wait at " - "end of syscall, errno = %d, status = %d\n", - errno, status); - fatal_sigsegv(); - } +struct tramp_data { + struct stub_data *stub_data; + /* 0 is inherited, 1 is the kernel side */ + int sockpair[2]; +}; + +#ifndef CLOSE_RANGE_CLOEXEC +#define CLOSE_RANGE_CLOEXEC (1U << 2) +#endif + +static int userspace_tramp(void *data) +{ + struct tramp_data *tramp_data = data; + char *const argv[] = { "uml-userspace", NULL }; + unsigned long long offset; + struct stub_init_data init_data = { + .seccomp = using_seccomp, + .stub_start = STUB_START, + }; + int ret; + + if (using_seccomp) { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_signal_interrupt - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = STUB_CODE + + (unsigned long) stub_signal_restorer - + (unsigned long) __syscall_stub_start; + } else { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_segv_handler - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = 0; } - handle_syscall(regs); + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), + &offset); + init_data.stub_code_offset = MMAP_OFFSET(offset); + + init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data), + &offset); + init_data.stub_data_offset = MMAP_OFFSET(offset); + + /* + * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs + * and then unsetting it on all memory related FDs. + * This is not strictly necessary from a safety perspective. + */ + syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); + + fcntl(init_data.stub_data_fd, F_SETFD, 0); + + /* dup2 signaling FD/socket to STDIN */ + if (dup2(tramp_data->sockpair[0], 0) < 0) + exit(3); + close(tramp_data->sockpair[0]); + + /* Write init_data and close write side */ + ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data)); + close(tramp_data->sockpair[1]); + + if (ret != sizeof(init_data)) + exit(4); + + /* Raw execveat for compatibility with older libc versions */ + syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", + (unsigned long)argv, NULL, AT_EMPTY_PATH); + + exit(5); } -extern int __syscall_stub_start; +extern char stub_exe_start[]; +extern char stub_exe_end[]; + +extern char *tempdir; -static int userspace_tramp(void *stack) +#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" + +#ifndef MFD_EXEC +#define MFD_EXEC 0x0010U +#endif + +static int __init init_stub_exe_fd(void) { - void *addr; - int err; + size_t written = 0; + char *tmpfile = NULL; - ptrace(PTRACE_TRACEME, 0, 0, 0); + stub_exe_fd = memfd_create("uml-userspace", + MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); - signal(SIGTERM, SIG_DFL); - signal(SIGWINCH, SIG_IGN); - err = set_interval(); - if (err) { - printk(UM_KERN_ERR "userspace_tramp - setting timer failed, " - "errno = %d\n", err); - exit(1); + if (stub_exe_fd < 0) { + printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); + + tmpfile = malloc(strlen(tempdir) + + strlen(STUB_EXE_NAME_TEMPLATE) + 1); + if (tmpfile == NULL) + panic("Failed to allocate memory for stub binary name"); + + strcpy(tmpfile, tempdir); + strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); + + stub_exe_fd = mkstemp(tmpfile); + if (stub_exe_fd < 0) + panic("Could not create temporary file for stub binary: %d", + -errno); } - if (!proc_mm) { - /* - * This has a pte, but it can't be mapped in with the usual - * tlb_flush mechanism because this is part of that mechanism - */ - int fd; - unsigned long long offset; - fd = phys_mapping(to_phys(&__syscall_stub_start), &offset); - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); - if (addr == MAP_FAILED) { - printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, " - "errno = %d\n", STUB_CODE, errno); - exit(1); - } + while (written < stub_exe_end - stub_exe_start) { + ssize_t res = write(stub_exe_fd, stub_exe_start + written, + stub_exe_end - stub_exe_start - written); + if (res < 0) { + if (errno == EINTR) + continue; - if (stack != NULL) { - fd = phys_mapping(to_phys(stack), &offset); - addr = mmap((void *) STUB_DATA, - UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, fd, offset); - if (addr == MAP_FAILED) { - printk(UM_KERN_ERR "mapping segfault stack " - "at 0x%lx failed, errno = %d\n", - STUB_DATA, errno); - exit(1); - } + if (tmpfile) + unlink(tmpfile); + panic("Failed write stub binary: %d", -errno); } + + written += res; } - if (!ptrace_faultinfo && (stack != NULL)) { - struct sigaction sa; - - unsigned long v = STUB_CODE + - (unsigned long) stub_segv_handler - - (unsigned long) &__syscall_stub_start; - - set_sigstack((void *) STUB_DATA, UM_KERN_PAGE_SIZE); - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; - sa.sa_sigaction = (void *) v; - sa.sa_restorer = NULL; - if (sigaction(SIGSEGV, &sa, NULL) < 0) { - printk(UM_KERN_ERR "userspace_tramp - setting SIGSEGV " - "handler failed - errno = %d\n", errno); - exit(1); + + if (!tmpfile) { + fcntl(stub_exe_fd, F_ADD_SEALS, + F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); + } else { + if (fchmod(stub_exe_fd, 00500) < 0) { + unlink(tmpfile); + panic("Could not make stub binary executable: %d", + -errno); + } + + close(stub_exe_fd); + stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); + if (stub_exe_fd < 0) { + unlink(tmpfile); + panic("Could not reopen stub binary: %d", -errno); } + + unlink(tmpfile); + free(tmpfile); } - kill(os_getpid(), SIGSTOP); return 0; } - -/* Each element set once, and only accessed by a single processor anyway */ -#undef NR_CPUS -#define NR_CPUS 1 -int userspace_pid[NR_CPUS]; - -int start_userspace(unsigned long stub_stack) +__initcall(init_stub_exe_fd); + +int using_seccomp; + +/** + * start_userspace() - prepare a new userspace process + * @mm_id: The corresponding struct mm_id + * + * Setups a new temporary stack page that is used while userspace_tramp() runs + * Clones the kernel process into a new userspace process, with FDs only. + * + * Return: When positive: the process id of the new userspace process, + * when negative: an error number. + * FIXME: can PIDs become negative?! + */ +int start_userspace(struct mm_id *mm_id) { + struct stub_data *proc_data = (void *)mm_id->stack; + struct tramp_data tramp_data = { + .stub_data = proc_data, + }; void *stack; unsigned long sp; - int pid, status, n, flags, err; + int status, n, err; + /* setup a temporary stack page */ stack = mmap(NULL, UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (stack == MAP_FAILED) { err = -errno; - printk(UM_KERN_ERR "start_userspace : mmap failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", + __func__, errno); return err; } - sp = (unsigned long) stack + UM_KERN_PAGE_SIZE - sizeof(void *); + /* set stack pointer to the end of the stack page, so it can grow downwards */ + sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; - flags = CLONE_FILES; - if (proc_mm) - flags |= CLONE_VM; - else - flags |= SIGCHLD; - - pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); - if (pid < 0) { + /* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) { err = -errno; - printk(UM_KERN_ERR "start_userspace : clone failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n", + __func__, errno); return err; } - do { - CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); - if (n < 0) { - err = -errno; - printk(UM_KERN_ERR "start_userspace : wait failed, " - "errno = %d\n", errno); - goto out_kill; - } - } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM)); + if (using_seccomp) + proc_data->futex = FUTEX_IN_CHILD; - if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { - err = -EINVAL; - printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got " - "status = %d\n", status); - goto out_kill; + mm_id->pid = clone(userspace_tramp, (void *) sp, + CLONE_VFORK | CLONE_VM | SIGCHLD, + (void *)&tramp_data); + if (mm_id->pid < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", + __func__, errno); + goto out_close; } - if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, - (void *) PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "start_userspace : PTRACE_OLDSETOPTIONS " - "failed, errno = %d\n", errno); - goto out_kill; + if (using_seccomp) { + wait_stub_done_seccomp(mm_id, 1, 1); + } else { + do { + CATCH_EINTR(n = waitpid(mm_id->pid, &status, + WUNTRACED | __WALL)); + if (n < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); + + if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { + err = -EINVAL; + printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", + __func__, status); + goto out_kill; + } + + if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL, + (void *) PTRACE_O_TRACESYSGOOD) < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", + __func__, errno); + goto out_kill; + } } if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { err = -errno; - printk(UM_KERN_ERR "start_userspace : munmap failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", + __func__, errno); goto out_kill; } - return pid; + close(tramp_data.sockpair[0]); + if (using_seccomp) + mm_id->sock = tramp_data.sockpair[1]; + else + close(tramp_data.sockpair[1]); + + return 0; + +out_kill: + os_kill_ptraced_process(mm_id->pid, 1); +out_close: + close(tramp_data.sockpair[0]); + close(tramp_data.sockpair[1]); + + mm_id->pid = -1; - out_kill: - os_kill_ptraced_process(pid, 1); return err; } +static int unscheduled_userspace_iterations; +extern unsigned long tt_extra_sched_jiffies; + void userspace(struct uml_pt_regs *regs) { - struct itimerval timer; - unsigned long long nsecs, now; - int err, status, op, pid = userspace_pid[0]; - /* To prevent races if using_sysemu changes under us.*/ - int local_using_sysemu; - siginfo_t si; + int err, status, op; + siginfo_t si_local; + siginfo_t *si; + int sig; /* Handle any immediate reschedules or signals */ interrupt_end(); - if (getitimer(ITIMER_VIRTUAL, &timer)) - printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno); - nsecs = timer.it_value.tv_sec * UM_NSEC_PER_SEC + - timer.it_value.tv_usec * UM_NSEC_PER_USEC; - nsecs += os_nsecs(); - while (1) { + struct mm_id *mm_id = current_mm_id(); + /* - * This can legitimately fail if the process loads a - * bogus value into a segment register. It will - * segfault and PTRACE_GETREGS will read that value - * out of the process. However, PTRACE_SETREGS will - * fail. In this case, there is nothing to do but - * just kill the process. + * At any given time, only one CPU thread can enter the + * turnstile to operate on the same stub process, including + * executing stub system calls (mmap and munmap). */ - if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) - fatal_sigsegv(); + enter_turnstile(mm_id); - if (put_fp_registers(pid, regs->fp)) - fatal_sigsegv(); + /* + * When we are in time-travel mode, userspace can theoretically + * do a *lot* of work without being scheduled. The problem with + * this is that it will prevent kernel bookkeeping (primarily + * the RCU) from running and this can for example cause OOM + * situations. + * + * This code accounts a jiffie against the scheduling clock + * after the defined userspace iterations in the same thread. + * By doing so the situation is effectively prevented. + */ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { +#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS + if (CONFIG_UML_MAX_USERSPACE_ITERATIONS && + unscheduled_userspace_iterations++ > + CONFIG_UML_MAX_USERSPACE_ITERATIONS) { + tt_extra_sched_jiffies += 1; + unscheduled_userspace_iterations = 0; + } +#endif + } - /* Now we set local_using_sysemu to be used for one loop */ - local_using_sysemu = get_using_sysemu(); + time_travel_print_bc_msg(); - op = SELECT_PTRACE_OPERATION(local_using_sysemu, - singlestepping(NULL)); + current_mm_sync(); - if (ptrace(op, pid, 0, 0)) { - printk(UM_KERN_ERR "userspace - ptrace continue " - "failed, op = %d, errno = %d\n", op, errno); - fatal_sigsegv(); - } + if (using_seccomp) { + struct stub_data *proc_data = (void *) mm_id->stack; - CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if (err < 0) { - printk(UM_KERN_ERR "userspace - wait failed, " - "errno = %d\n", errno); - fatal_sigsegv(); - } + err = set_stub_state(regs, proc_data, singlestepping()); + if (err) { + printk(UM_KERN_ERR "%s - failed to set regs: %d", + __func__, err); + fatal_sigsegv(); + } - regs->is_user = 1; - if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "userspace - PTRACE_GETREGS failed, " - "errno = %d\n", errno); - fatal_sigsegv(); - } + /* Must have been reset by the syscall caller */ + if (proc_data->restart_wait != 0) + panic("Programming error: Flag to only run syscalls in child was not cleared!"); - if (get_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "userspace - get_fp_registers failed, " - "errno = %d\n", errno); - fatal_sigsegv(); - } + /* Mark pending syscalls for flushing */ + proc_data->syscall_data_len = mm_id->syscall_data_len; - UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + wait_stub_done_seccomp(mm_id, 0, 0); - if (WIFSTOPPED(status)) { - int sig = WSTOPSIG(status); + sig = proc_data->signal; - ptrace(PTRACE_GETSIGINFO, pid, 0, &si); + if (sig == SIGTRAP && proc_data->err != 0) { + printk(UM_KERN_ERR "%s - Error flushing stub syscalls", + __func__); + syscall_stub_dump_error(mm_id); + mm_id->syscall_data_len = proc_data->err; + fatal_sigsegv(); + } - switch (sig) { - case SIGSEGV: - if (PTRACE_FULL_FAULTINFO || - !ptrace_faultinfo) { + mm_id->syscall_data_len = 0; + mm_id->syscall_fd_num = 0; + + err = get_stub_state(regs, proc_data, NULL); + if (err) { + printk(UM_KERN_ERR "%s - failed to get regs: %d", + __func__, err); + fatal_sigsegv(); + } + + if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si)) + panic("%s - Invalid siginfo offset from child", __func__); + + si = &si_local; + memcpy(si, &proc_data->sigstack[proc_data->si_offset], sizeof(*si)); + + regs->is_user = 1; + + /* Fill in ORIG_RAX and extract fault information */ + PT_SYSCALL_NR(regs->gp) = si->si_syscall; + if (sig == SIGSEGV) { + mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset]; + + GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext); + } + } else { + int pid = mm_id->pid; + + /* Flush out any pending syscalls */ + err = syscall_stub_flush(mm_id); + if (err) { + if (err == -ENOMEM) + report_enomem(); + + printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", + __func__, -err); + fatal_sigsegv(); + } + + /* + * This can legitimately fail if the process loads a + * bogus value into a segment register. It will + * segfault and PTRACE_GETREGS will read that value + * out of the process. However, PTRACE_SETREGS will + * fail. In this case, there is nothing to do but + * just kill the process. + */ + if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (put_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (singlestepping()) + op = PTRACE_SYSEMU_SINGLESTEP; + else + op = PTRACE_SYSEMU; + + if (ptrace(op, pid, 0, 0)) { + printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", + __func__, op, errno); + fatal_sigsegv(); + } + + CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); + if (err < 0) { + printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + regs->is_user = 1; + if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (get_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (WIFSTOPPED(status)) { + sig = WSTOPSIG(status); + + /* + * These signal handlers need the si argument + * and SIGSEGV needs the faultinfo. + * The SIGIO and SIGALARM handlers which constitute + * the majority of invocations, do not use it. + */ + switch (sig) { + case SIGSEGV: get_skas_faultinfo(pid, ®s->faultinfo); - (*sig_info[SIGSEGV])(SIGSEGV, &si, - regs); + fallthrough; + case SIGTRAP: + case SIGILL: + case SIGBUS: + case SIGFPE: + case SIGWINCH: + ptrace(PTRACE_GETSIGINFO, pid, 0, + (struct siginfo *)&si_local); + si = &si_local; + break; + default: + si = NULL; + break; } - else handle_segv(pid, regs); + } else { + sig = 0; + } + } + + exit_turnstile(mm_id); + + UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + + if (sig) { + switch (sig) { + case SIGSEGV: + if (using_seccomp || PTRACE_FULL_FAULTINFO) + (*sig_info[SIGSEGV])(SIGSEGV, + (struct siginfo *)si, + regs, NULL); + else + segv(regs->faultinfo, 0, 1, NULL, NULL); + + break; + case SIGSYS: + handle_syscall(regs); break; case SIGTRAP + 0x80: - handle_trap(pid, regs, local_using_sysemu); + handle_trap(regs); break; case SIGTRAP: - relay_signal(SIGTRAP, &si, regs); + relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL); break; - case SIGVTALRM: - now = os_nsecs(); - if (now < nsecs) - break; - block_signals(); - (*sig_info[sig])(sig, &si, regs); - unblock_signals(); - nsecs = timer.it_value.tv_sec * - UM_NSEC_PER_SEC + - timer.it_value.tv_usec * - UM_NSEC_PER_USEC; - nsecs += os_nsecs(); + case SIGALRM: break; case SIGIO: case SIGILL: case SIGBUS: case SIGFPE: case SIGWINCH: - block_signals(); - (*sig_info[sig])(sig, &si, regs); - unblock_signals(); + block_signals_trace(); + (*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL); + unblock_signals_trace(); break; default: - printk(UM_KERN_ERR "userspace - child stopped " - "with signal %d\n", sig); + printk(UM_KERN_ERR "%s - child stopped with signal %d\n", + __func__, sig); fatal_sigsegv(); } - pid = userspace_pid[0]; interrupt_end(); /* Avoid -ERESTARTSYS handling in host */ @@ -465,173 +790,6 @@ void userspace(struct uml_pt_regs *regs) } } -static unsigned long thread_regs[MAX_REG_NR]; -static unsigned long thread_fp_regs[FP_SIZE]; - -static int __init init_thread_regs(void) -{ - get_safe_registers(thread_regs, thread_fp_regs); - /* Set parent's instruction pointer to start of clone-stub */ - thread_regs[REGS_IP_INDEX] = STUB_CODE + - (unsigned long) stub_clone_handler - - (unsigned long) &__syscall_stub_start; - thread_regs[REGS_SP_INDEX] = STUB_DATA + UM_KERN_PAGE_SIZE - - sizeof(void *); -#ifdef __SIGNAL_FRAMESIZE - thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE; -#endif - return 0; -} - -__initcall(init_thread_regs); - -int copy_context_skas0(unsigned long new_stack, int pid) -{ - struct timeval tv = { .tv_sec = 0, .tv_usec = UM_USEC_PER_SEC / UM_HZ }; - int err; - unsigned long current_stack = current_stub_stack(); - struct stub_data *data = (struct stub_data *) current_stack; - struct stub_data *child_data = (struct stub_data *) new_stack; - unsigned long long new_offset; - int new_fd = phys_mapping(to_phys((void *)new_stack), &new_offset); - - /* - * prepare offset and fd of child's stack as argument for parent's - * and child's mmap2 calls - */ - *data = ((struct stub_data) { .offset = MMAP_OFFSET(new_offset), - .fd = new_fd, - .timer = ((struct itimerval) - { .it_value = tv, - .it_interval = tv }) }); - - err = ptrace_setregs(pid, thread_regs); - if (err < 0) { - err = -errno; - printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_SETREGS " - "failed, pid = %d, errno = %d\n", pid, -err); - return err; - } - - err = put_fp_registers(pid, thread_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "copy_context_skas0 : put_fp_registers " - "failed, pid = %d, err = %d\n", pid, err); - return err; - } - - /* set a well known return code for detection of child write failure */ - child_data->err = 12345678; - - /* - * Wait, until parent has finished its work: read child's pid from - * parent's stack, and check, if bad result. - */ - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) { - err = -errno; - printk(UM_KERN_ERR "Failed to continue new process, pid = %d, " - "errno = %d\n", pid, errno); - return err; - } - - wait_stub_done(pid); - - pid = data->err; - if (pid < 0) { - printk(UM_KERN_ERR "copy_context_skas0 - stub-parent reports " - "error %d\n", -pid); - return pid; - } - - /* - * Wait, until child has finished too: read child's result from - * child's stack and check it. - */ - wait_stub_done(pid); - if (child_data->err != STUB_DATA) { - printk(UM_KERN_ERR "copy_context_skas0 - stub-child reports " - "error %ld\n", child_data->err); - err = child_data->err; - goto out_kill; - } - - if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, - (void *)PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_OLDSETOPTIONS " - "failed, errno = %d\n", errno); - goto out_kill; - } - - return pid; - - out_kill: - os_kill_ptraced_process(pid, 1); - return err; -} - -/* - * This is used only, if stub pages are needed, while proc_mm is - * available. Opening /proc/mm creates a new mm_context, which lacks - * the stub-pages. Thus, we map them using /proc/mm-fd - */ -int map_stub_pages(int fd, unsigned long code, unsigned long data, - unsigned long stack) -{ - struct proc_mm_op mmop; - int n; - unsigned long long code_offset; - int code_fd = phys_mapping(to_phys((void *) &__syscall_stub_start), - &code_offset); - - mmop = ((struct proc_mm_op) { .op = MM_MMAP, - .u = - { .mmap = - { .addr = code, - .len = UM_KERN_PAGE_SIZE, - .prot = PROT_EXEC, - .flags = MAP_FIXED | MAP_PRIVATE, - .fd = code_fd, - .offset = code_offset - } } }); - CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop))); - if (n != sizeof(mmop)) { - n = errno; - printk(UM_KERN_ERR "mmap args - addr = 0x%lx, fd = %d, " - "offset = %llx\n", code, code_fd, - (unsigned long long) code_offset); - printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for code " - "failed, err = %d\n", n); - return -n; - } - - if (stack) { - unsigned long long map_offset; - int map_fd = phys_mapping(to_phys((void *)stack), &map_offset); - mmop = ((struct proc_mm_op) - { .op = MM_MMAP, - .u = - { .mmap = - { .addr = data, - .len = UM_KERN_PAGE_SIZE, - .prot = PROT_READ | PROT_WRITE, - .flags = MAP_FIXED | MAP_SHARED, - .fd = map_fd, - .offset = map_offset - } } }); - CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop))); - if (n != sizeof(mmop)) { - n = errno; - printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for " - "data failed, err = %d\n", n); - return -n; - } - } - - return 0; -} - void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) { (*buf)[0].JB_IP = (unsigned long) handler; @@ -646,16 +804,17 @@ void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) void switch_threads(jmp_buf *me, jmp_buf *you) { + unscheduled_userspace_iterations = 0; + if (UML_SETJMP(me) == 0) UML_LONGJMP(you, 1); } static jmp_buf initial_jmpbuf; -/* XXX Make these percpu */ -static void (*cb_proc)(void *arg); -static void *cb_arg; -static jmp_buf *cb_back; +static __thread void (*cb_proc)(void *arg); +static __thread void *cb_arg; +static __thread jmp_buf *cb_back; int start_idle_thread(void *stack, jmp_buf *switch_buf) { @@ -674,7 +833,7 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf) n = setjmp(initial_jmpbuf); switch (n) { case INIT_JMP_NEW_THREAD: - (*switch_buf)[0].JB_IP = (unsigned long) new_thread_handler; + (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; (*switch_buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - sizeof(void *); break; @@ -689,11 +848,16 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf) kmalloc_ok = 0; return 1; default: - printk(UM_KERN_ERR "Bad sigsetjmp return in " - "start_idle_thread - %d\n", n); + printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", + __func__, n); fatal_sigsegv(); } longjmp(*switch_buf, 1); + + /* unreachable */ + printk(UM_KERN_ERR "impossible long jump!"); + fatal_sigsegv(); + return 0; } void initial_thread_cb_skas(void (*proc)(void *), void *arg) @@ -704,10 +868,10 @@ void initial_thread_cb_skas(void (*proc)(void *), void *arg) cb_arg = arg; cb_back = &here; - block_signals(); + initial_jmpbuf_lock(); if (UML_SETJMP(&here) == 0) UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); - unblock_signals(); + initial_jmpbuf_unlock(); cb_proc = NULL; cb_arg = NULL; @@ -716,29 +880,29 @@ void initial_thread_cb_skas(void (*proc)(void *), void *arg) void halt_skas(void) { - block_signals(); + initial_jmpbuf_lock(); UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); + /* unreachable */ } -void reboot_skas(void) +static bool noreboot; + +static int __init noreboot_cmd_param(char *str, int *add) { - block_signals(); - UML_LONGJMP(&initial_jmpbuf, INIT_JMP_REBOOT); + *add = 0; + noreboot = true; + return 0; } -void __switch_mm(struct mm_id *mm_idp) -{ - int err; +__uml_setup("noreboot", noreboot_cmd_param, +"noreboot\n" +" Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" +" This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" +" crashes in CI\n\n"); - /* FIXME: need cpu pid in __switch_mm */ - if (proc_mm) { - err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0, - mm_idp->u.mm_fd); - if (err) { - printk(UM_KERN_ERR "__switch_mm - PTRACE_SWITCH_MM " - "failed, errno = %d\n", errno); - fatal_sigsegv(); - } - } - else userspace_pid[0] = mm_idp->u.pid; +void reboot_skas(void) +{ + initial_jmpbuf_lock(); + UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); + /* unreachable */ } diff --git a/arch/um/os-Linux/smp.c b/arch/um/os-Linux/smp.c new file mode 100644 index 000000000000..18d3858a7cd2 --- /dev/null +++ b/arch/um/os-Linux/smp.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + */ + +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <kern_util.h> +#include <um_malloc.h> +#include <init.h> +#include <os.h> +#include <smp.h> +#include "internal.h" + +struct cpu_thread_data { + int cpu; + sigset_t sigset; +}; + +static __thread int __curr_cpu; + +int uml_curr_cpu(void) +{ + return __curr_cpu; +} + +static pthread_t cpu_threads[CONFIG_NR_CPUS]; + +static void *cpu_thread(void *arg) +{ + struct cpu_thread_data *data = arg; + + __curr_cpu = data->cpu; + + uml_start_secondary(data); + + return NULL; +} + +int os_start_cpu_thread(int cpu) +{ + struct cpu_thread_data *data; + sigset_t sigset, oset; + int err; + + data = uml_kmalloc(sizeof(*data), UM_GFP_ATOMIC); + if (!data) + return -ENOMEM; + + sigfillset(&sigset); + if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) { + err = errno; + goto err; + } + + data->cpu = cpu; + data->sigset = oset; + + err = pthread_create(&cpu_threads[cpu], NULL, cpu_thread, data); + if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0) + panic("Failed to restore the signal mask, errno = %d", errno); + if (err != 0) + goto err; + + return 0; + +err: + kfree(data); + return -err; +} + +void os_start_secondary(void *arg, jmp_buf *switch_buf) +{ + struct cpu_thread_data *data = arg; + + sigaddset(&data->sigset, IPI_SIGNAL); + sigaddset(&data->sigset, SIGIO); + + if (sigprocmask(SIG_SETMASK, &data->sigset, NULL) < 0) + panic("Failed to restore the signal mask, errno = %d", errno); + + kfree(data); + longjmp(*switch_buf, 1); + + /* unreachable */ + printk(UM_KERN_ERR "impossible long jump!"); + fatal_sigsegv(); +} + +int os_send_ipi(int cpu, int vector) +{ + union sigval value = { .sival_int = vector }; + + return pthread_sigqueue(cpu_threads[cpu], IPI_SIGNAL, value); +} + +static void __local_ipi_set(int enable) +{ + sigset_t sigset; + + sigemptyset(&sigset); + sigaddset(&sigset, IPI_SIGNAL); + + if (sigprocmask(enable ? SIG_UNBLOCK : SIG_BLOCK, &sigset, NULL) < 0) + panic("%s: sigprocmask failed, errno = %d", __func__, errno); +} + +void os_local_ipi_enable(void) +{ + __local_ipi_set(1); +} + +void os_local_ipi_disable(void) +{ + __local_ipi_set(0); +} + +static void ipi_sig_handler(int sig, siginfo_t *si, void *uc) +{ + int save_errno = errno; + + signals_enabled = 0; + um_trace_signals_off(); + + uml_ipi_handler(si->si_value.sival_int); + + um_trace_signals_on(); + signals_enabled = 1; + + errno = save_errno; +} + +void __init os_init_smp(void) +{ + struct sigaction action = { + .sa_sigaction = ipi_sig_handler, + .sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART, + }; + + sigfillset(&action.sa_mask); + + if (sigaction(IPI_SIGNAL, &action, NULL) < 0) + panic("%s: sigaction failed, errno = %d", __func__, errno); + + cpu_threads[0] = pthread_self(); +} diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 337518c5042a..054ac03bbf5e 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -17,14 +18,24 @@ #include <sys/wait.h> #include <sys/time.h> #include <sys/resource.h> +#include <asm/ldt.h> #include <asm/unistd.h> #include <init.h> #include <os.h> +#include <smp.h> +#include <kern_util.h> #include <mem_user.h> #include <ptrace_user.h> +#include <stdbool.h> +#include <stub-data.h> +#include <sys/prctl.h> +#include <linux/seccomp.h> +#include <linux/filter.h> +#include <sysdep/mcontext.h> +#include <sysdep/stub.h> #include <registers.h> #include <skas.h> -#include <skas_ptrace.h> +#include "internal.h" static void ptrace_child(void) { @@ -95,6 +106,8 @@ static int start_ptraced_child(void) { int pid, n, status; + fflush(stdout); + pid = fork(); if (pid == 0) ptrace_child(); @@ -111,140 +124,32 @@ static int start_ptraced_child(void) return pid; } -/* When testing for SYSEMU support, if it is one of the broken versions, we - * must just avoid using sysemu, not panic, but only if SYSEMU features are - * broken. - * So only for SYSEMU features we test mustpanic, while normal host features - * must work anyway! - */ -static int stop_ptraced_child(int pid, int exitcode, int mustexit) +static void stop_ptraced_child(int pid, int exitcode) { - int status, n, ret = 0; + int status, n; + + if (ptrace(PTRACE_CONT, pid, 0, 0) < 0) + fatal_perror("stop_ptraced_child : ptrace failed"); - if (ptrace(PTRACE_CONT, pid, 0, 0) < 0) { - perror("stop_ptraced_child : ptrace failed"); - return -1; - } CATCH_EINTR(n = waitpid(pid, &status, 0)); if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) { int exit_with = WEXITSTATUS(status); - if (exit_with == 2) - non_fatal("check_ptrace : child exited with status 2. " - "\nDisabling SYSEMU support.\n"); - non_fatal("check_ptrace : child exited with exitcode %d, while " - "expecting %d; status 0x%x\n", exit_with, - exitcode, status); - if (mustexit) - exit(1); - ret = -1; + fatal("stop_ptraced_child : child exited with exitcode %d, " + "while expecting %d; status 0x%x\n", exit_with, + exitcode, status); } - - return ret; -} - -/* Changed only during early boot */ -int ptrace_faultinfo; -static int disable_ptrace_faultinfo; - -int ptrace_ldt; -static int disable_ptrace_ldt; - -int proc_mm; -static int disable_proc_mm; - -int have_switch_mm; -static int disable_switch_mm; - -int skas_needs_stub; - -static int __init skas0_cmd_param(char *str, int* add) -{ - disable_ptrace_faultinfo = 1; - disable_ptrace_ldt = 1; - disable_proc_mm = 1; - disable_switch_mm = 1; - - return 0; } -/* The two __uml_setup would conflict, without this stupid alias. */ - -static int __init mode_skas0_cmd_param(char *str, int* add) - __attribute__((alias("skas0_cmd_param"))); - -__uml_setup("skas0", skas0_cmd_param, -"skas0\n" -" Disables SKAS3 and SKAS4 usage, so that SKAS0 is used\n\n"); - -__uml_setup("mode=skas0", mode_skas0_cmd_param, -"mode=skas0\n" -" Disables SKAS3 and SKAS4 usage, so that SKAS0 is used.\n\n"); - -/* Changed only during early boot */ -static int force_sysemu_disabled = 0; - -static int __init nosysemu_cmd_param(char *str, int* add) -{ - force_sysemu_disabled = 1; - return 0; -} - -__uml_setup("nosysemu", nosysemu_cmd_param, -"nosysemu\n" -" Turns off syscall emulation patch for ptrace (SYSEMU) on.\n" -" SYSEMU is a performance-patch introduced by Laurent Vivier. It changes\n" -" behaviour of ptrace() and helps reducing host context switch rate.\n" -" To make it working, you need a kernel patch for your host, too.\n" -" See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n" -" information.\n\n"); - static void __init check_sysemu(void) { - unsigned long regs[MAX_REG_NR]; int pid, n, status, count=0; - non_fatal("Checking syscall emulation patch for ptrace..."); - sysemu_supported = 0; + os_info("Checking syscall emulation for ptrace..."); pid = start_ptraced_child(); - if (ptrace(PTRACE_SYSEMU, pid, 0, 0) < 0) - goto fail; - - CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); - if (n < 0) - fatal_perror("check_sysemu : wait failed"); - if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) - fatal("check_sysemu : expected SIGTRAP, got status = %d\n", - status); - - if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) - fatal_perror("check_sysemu : PTRACE_GETREGS failed"); - if (PT_SYSCALL_NR(regs) != __NR_getpid) { - non_fatal("check_sysemu got system call number %d, " - "expected %d...", PT_SYSCALL_NR(regs), __NR_getpid); - goto fail; - } - - n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, os_getpid()); - if (n < 0) { - non_fatal("check_sysemu : failed to modify system call " - "return"); - goto fail; - } - - if (stop_ptraced_child(pid, 0, 0) < 0) - goto fail_stopped; - - sysemu_supported = 1; - non_fatal("OK\n"); - set_using_sysemu(!force_sysemu_disabled); - - non_fatal("Checking advanced syscall emulation patch for ptrace..."); - pid = start_ptraced_child(); - - if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0, + if ((ptrace(PTRACE_SETOPTIONS, pid, 0, (void *) PTRACE_O_TRACESYSGOOD) < 0)) - fatal_perror("check_sysemu: PTRACE_OLDSETOPTIONS failed"); + fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed"); while (1) { count++; @@ -277,32 +182,27 @@ static void __init check_sysemu(void) goto fail; } } - if (stop_ptraced_child(pid, 0, 0) < 0) - goto fail_stopped; + stop_ptraced_child(pid, 0); - sysemu_supported = 2; - non_fatal("OK\n"); + os_info("OK\n"); - if (!force_sysemu_disabled) - set_using_sysemu(sysemu_supported); return; fail: - stop_ptraced_child(pid, 1, 0); -fail_stopped: - non_fatal("missing\n"); + stop_ptraced_child(pid, 1); + fatal("missing\n"); } static void __init check_ptrace(void) { int pid, syscall, n, status; - non_fatal("Checking that ptrace can change system call numbers..."); + os_info("Checking that ptrace can change system call numbers..."); pid = start_ptraced_child(); - if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0, + if ((ptrace(PTRACE_SETOPTIONS, pid, 0, (void *) PTRACE_O_TRACESYSGOOD) < 0)) - fatal_perror("check_ptrace: PTRACE_OLDSETOPTIONS failed"); + fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed"); while (1) { if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) @@ -328,215 +228,268 @@ static void __init check_ptrace(void) break; } } - stop_ptraced_child(pid, 0, 1); - non_fatal("OK\n"); + stop_ptraced_child(pid, 0); + os_info("OK\n"); check_sysemu(); } -extern void check_tmpexec(void); +extern unsigned long host_fp_size; +extern unsigned long exec_regs[MAX_REG_NR]; +extern unsigned long *exec_fp_regs; -static void __init check_coredump_limit(void) +__initdata static struct stub_data *seccomp_test_stub_data; + +static void __init sigsys_handler(int sig, siginfo_t *info, void *p) { - struct rlimit lim; - int err = getrlimit(RLIMIT_CORE, &lim); + ucontext_t *uc = p; - if (err) { - perror("Getting core dump limit"); - return; - } + /* Stow away the location of the mcontext in the stack */ + seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - + (unsigned long)&seccomp_test_stub_data->sigstack[0]; - printf("Core dump limits :\n\tsoft - "); - if (lim.rlim_cur == RLIM_INFINITY) - printf("NONE\n"); - else printf("%lu\n", lim.rlim_cur); + /* Prevent libc from clearing memory (mctx_offset in particular) */ + syscall(__NR_exit, 0); +} - printf("\thard - "); - if (lim.rlim_max == RLIM_INFINITY) - printf("NONE\n"); - else printf("%lu\n", lim.rlim_max); +static int __init seccomp_helper(void *data) +{ + static struct sock_filter filter[] = { + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + }; + static struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + struct sigaction sa; + + /* close_range is needed for the stub */ + if (stub_syscall3(__NR_close_range, 1, ~0U, 0)) + exit(1); + + set_sigstack(seccomp_test_stub_data->sigstack, + sizeof(seccomp_test_stub_data->sigstack)); + + sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; + sa.sa_sigaction = (void *) sigsys_handler; + sa.sa_restorer = NULL; + if (sigaction(SIGSYS, &sa, NULL) < 0) + exit(2); + + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) + exit(3); + + sleep(0); + + /* Never reached. */ + _exit(4); } -void __init os_early_checks(void) +static bool __init init_seccomp(void) { int pid; + int status; + int n; + unsigned long sp; - /* Print out the core dump limits early */ - check_coredump_limit(); + /* + * We check that we can install a seccomp filter and then exit(0) + * from a trapped syscall. + * + * Note that we cannot verify that no seccomp filter already exists + * for a syscall that results in the process/thread to be killed. + */ - check_ptrace(); + os_info("Checking that seccomp filters can be installed..."); - /* Need to check this early because mmapping happens before the - * kernel is running. - */ - check_tmpexec(); + seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANON, 0, 0); - pid = start_ptraced_child(); - if (init_registers(pid)) - fatal("Failed to initialize default registers"); - stop_ptraced_child(pid, 1, 1); -} + /* Use the syscall data area as stack, we just need something */ + sp = (unsigned long)&seccomp_test_stub_data->syscall_data + + sizeof(seccomp_test_stub_data->syscall_data) - + sizeof(void *); + pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); -static int __init noprocmm_cmd_param(char *str, int* add) -{ - disable_proc_mm = 1; - return 0; -} + if (pid < 0) + fatal_perror("check_seccomp : clone failed"); -__uml_setup("noprocmm", noprocmm_cmd_param, -"noprocmm\n" -" Turns off usage of /proc/mm, even if host supports it.\n" -" To support /proc/mm, the host needs to be patched using\n" -" the current skas3 patch.\n\n"); + CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); + if (n < 0) + fatal_perror("check_seccomp : waitpid failed"); -static int __init noptracefaultinfo_cmd_param(char *str, int* add) -{ - disable_ptrace_faultinfo = 1; - return 0; -} + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + struct uml_pt_regs *regs; + unsigned long fp_size; + int r; -__uml_setup("noptracefaultinfo", noptracefaultinfo_cmd_param, -"noptracefaultinfo\n" -" Turns off usage of PTRACE_FAULTINFO, even if host supports\n" -" it. To support PTRACE_FAULTINFO, the host needs to be patched\n" -" using the current skas3 patch.\n\n"); + /* Fill in the host_fp_size from the mcontext. */ + regs = calloc(1, sizeof(struct uml_pt_regs)); + get_stub_state(regs, seccomp_test_stub_data, &fp_size); + host_fp_size = fp_size; + free(regs); -static int __init noptraceldt_cmd_param(char *str, int* add) -{ - disable_ptrace_ldt = 1; - return 0; -} + /* Repeat with the correct size */ + regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); + r = get_stub_state(regs, seccomp_test_stub_data, NULL); -__uml_setup("noptraceldt", noptraceldt_cmd_param, -"noptraceldt\n" -" Turns off usage of PTRACE_LDT, even if host supports it.\n" -" To support PTRACE_LDT, the host needs to be patched using\n" -" the current skas3 patch.\n\n"); + /* Store as the default startup registers */ + exec_fp_regs = malloc(host_fp_size); + memcpy(exec_regs, regs->gp, sizeof(exec_regs)); + memcpy(exec_fp_regs, regs->fp, host_fp_size); -static inline void check_skas3_ptrace_faultinfo(void) -{ - struct ptrace_faultinfo fi; - int pid, n; + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); - non_fatal(" - PTRACE_FAULTINFO..."); - pid = start_ptraced_child(); + free(regs); + + if (r) { + os_info("failed to fetch registers: %d\n", r); + return false; + } - n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi); - if (n < 0) { - if (errno == EIO) - non_fatal("not found\n"); - else - perror("not found"); - } else if (disable_ptrace_faultinfo) - non_fatal("found but disabled on command line\n"); - else { - ptrace_faultinfo = 1; - non_fatal("found\n"); + os_info("OK\n"); + return true; } - stop_ptraced_child(pid, 1, 1); + if (WIFEXITED(status) && WEXITSTATUS(status) == 2) + os_info("missing\n"); + else + os_info("error\n"); + + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); + return false; } -static inline void check_skas3_ptrace_ldt(void) + +static void __init check_coredump_limit(void) { -#ifdef PTRACE_LDT - int pid, n; - unsigned char ldtbuf[40]; - struct ptrace_ldt ldt_op = (struct ptrace_ldt) { - .func = 2, /* read default ldt */ - .ptr = ldtbuf, - .bytecount = sizeof(ldtbuf)}; - - non_fatal(" - PTRACE_LDT..."); - pid = start_ptraced_child(); + struct rlimit lim; + int err = getrlimit(RLIMIT_CORE, &lim); - n = ptrace(PTRACE_LDT, pid, 0, (unsigned long) &ldt_op); - if (n < 0) { - if (errno == EIO) - non_fatal("not found\n"); - else - perror("not found"); - } else if (disable_ptrace_ldt) - non_fatal("found, but use is disabled\n"); - else { - ptrace_ldt = 1; - non_fatal("found\n"); + if (err) { + perror("Getting core dump limit"); + return; } - stop_ptraced_child(pid, 1, 1); -#endif + os_info("Core dump limits :\n\tsoft - "); + if (lim.rlim_cur == RLIM_INFINITY) + os_info("NONE\n"); + else + os_info("%llu\n", (unsigned long long)lim.rlim_cur); + + os_info("\thard - "); + if (lim.rlim_max == RLIM_INFINITY) + os_info("NONE\n"); + else + os_info("%llu\n", (unsigned long long)lim.rlim_max); } -static inline void check_skas3_proc_mm(void) +void __init get_host_cpu_features( + void (*flags_helper_func)(char *line), + void (*cache_helper_func)(char *line)) { - non_fatal(" - /proc/mm..."); - if (access("/proc/mm", W_OK) < 0) - perror("not found"); - else if (disable_proc_mm) - non_fatal("found but disabled on command line\n"); - else { - proc_mm = 1; - non_fatal("found\n"); + FILE *cpuinfo; + char *line = NULL; + size_t len = 0; + int done_parsing = 0; + + cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo == NULL) { + os_info("Failed to get host CPU features\n"); + } else { + while ((getline(&line, &len, cpuinfo)) != -1) { + if (strstr(line, "flags")) { + flags_helper_func(line); + done_parsing++; + } + if (strstr(line, "cache_alignment")) { + cache_helper_func(line); + done_parsing++; + } + free(line); + line = NULL; + if (done_parsing > 1) + break; + } + fclose(cpuinfo); } } -void can_do_skas(void) -{ - non_fatal("Checking for the skas3 patch in the host:\n"); +static int seccomp_config __initdata; - check_skas3_proc_mm(); - check_skas3_ptrace_faultinfo(); - check_skas3_ptrace_ldt(); +static int __init uml_seccomp_config(char *line, int *add) +{ + *add = 0; + + if (strcmp(line, "off") == 0) + seccomp_config = 0; + else if (strcmp(line, "auto") == 0) + seccomp_config = 1; + else if (strcmp(line, "on") == 0) + seccomp_config = 2; + else + fatal("Invalid seccomp option '%s', expected on/auto/off\n", + line); - if (!proc_mm || !ptrace_faultinfo || !ptrace_ldt) - skas_needs_stub = 1; + return 0; } -int __init parse_iomem(char *str, int *add) +__uml_setup("seccomp=", uml_seccomp_config, +"seccomp=<on/auto/off>\n" +" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" +" processes work collaboratively with the kernel instead of being\n" +" traced using ptrace. All syscalls from the application are caught and\n" +" redirected using a signal. This signal handler in turn is permitted to\n" +" do the selected set of syscalls to communicate with the UML kernel and\n" +" do the required memory management.\n" +"\n" +" This method is overall faster than the ptrace based userspace, primarily\n" +" because it reduces the number of context switches for (minor) page faults.\n" +"\n" +" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" +" userspace from reading and writing all physical memory. Userspace\n" +" processes could also trick the stub into disabling SIGALRM which\n" +" prevents it from being interrupted for scheduling purposes.\n" +"\n" +" This is insecure and should only be used with a trusted userspace\n\n" +); + +void __init os_early_checks(void) { - struct iomem_region *new; - struct stat64 buf; - char *file, *driver; - int fd, size; - - driver = str; - file = strchr(str,','); - if (file == NULL) { - fprintf(stderr, "parse_iomem : failed to parse iomem\n"); - goto out; - } - *file = '\0'; - file++; - fd = open(file, O_RDWR, 0); - if (fd < 0) { - perror("parse_iomem - Couldn't open io file"); - goto out; - } + int pid; - if (fstat64(fd, &buf) < 0) { - perror("parse_iomem - cannot stat_fd file"); - goto out_close; - } + /* Print out the core dump limits early */ + check_coredump_limit(); + + /* Need to check this early because mmapping happens before the + * kernel is running. + */ + check_tmpexec(); + + if (seccomp_config) { + if (init_seccomp()) { + using_seccomp = 1; + return; + } - new = malloc(sizeof(*new)); - if (new == NULL) { - perror("Couldn't allocate iomem_region struct"); - goto out_close; + if (seccomp_config == 2) + fatal("SECCOMP userspace requested but not functional!\n"); } - size = (buf.st_size + UM_KERN_PAGE_SIZE) & ~(UM_KERN_PAGE_SIZE - 1); + if (uml_ncpus > 1) + fatal("SMP is not supported with PTRACE userspace.\n"); - *new = ((struct iomem_region) { .next = iomem_regions, - .driver = driver, - .fd = fd, - .size = size, - .phys = 0, - .virt = 0 }); - iomem_regions = new; - iomem_size += new->size + UM_KERN_PAGE_SIZE; + using_seccomp = 0; + check_ptrace(); - return 0; - out_close: - close(fd); - out: - return 1; + pid = start_ptraced_child(); + if (init_pid_registers(pid)) + fatal("Failed to initialize default registers"); + stop_ptraced_child(pid, 1); } diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index e9824d5dd7d5..13ebc86918d4 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -1,186 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2012-2014 Cisco Systems * Copyright (C) 2000 - 2007 Jeff Dike (jdike{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stddef.h> +#include <unistd.h> #include <errno.h> #include <signal.h> #include <time.h> +#include <sys/signalfd.h> #include <sys/time.h> #include <kern_util.h> #include <os.h> +#include <smp.h> +#include <string.h> #include "internal.h" -int set_interval(void) -{ - int usec = UM_USEC_PER_SEC / UM_HZ; - struct itimerval interval = ((struct itimerval) { { 0, usec }, - { 0, usec } }); - - if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1) - return -errno; +static timer_t event_high_res_timer[CONFIG_NR_CPUS] = { 0 }; - return 0; +static inline long long timespec_to_ns(const struct timespec *ts) +{ + return ((long long) ts->tv_sec * UM_NSEC_PER_SEC) + ts->tv_nsec; } -int timer_one_shot(int ticks) +long long os_persistent_clock_emulation(void) { - unsigned long usec = ticks * UM_USEC_PER_SEC / UM_HZ; - unsigned long sec = usec / UM_USEC_PER_SEC; - struct itimerval interval; - - usec %= UM_USEC_PER_SEC; - interval = ((struct itimerval) { { 0, 0 }, { sec, usec } }); + struct timespec realtime_tp; - if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1) - return -errno; - - return 0; + clock_gettime(CLOCK_REALTIME, &realtime_tp); + return timespec_to_ns(&realtime_tp); } +#ifndef sigev_notify_thread_id +#define sigev_notify_thread_id _sigev_un._tid +#endif + /** - * timeval_to_ns - Convert timeval to nanoseconds - * @ts: pointer to the timeval variable to be converted - * - * Returns the scalar nanosecond representation of the timeval - * parameter. - * - * Ripped from linux/time.h because it's a kernel header, and thus - * unusable from here. + * os_timer_create() - create an new posix (interval) timer */ -static inline long long timeval_to_ns(const struct timeval *tv) +int os_timer_create(void) { - return ((long long) tv->tv_sec * UM_NSEC_PER_SEC) + - tv->tv_usec * UM_NSEC_PER_USEC; + int cpu = uml_curr_cpu(); + timer_t *t = &event_high_res_timer[cpu]; + struct sigevent sev = { + .sigev_notify = SIGEV_THREAD_ID, + .sigev_signo = SIGALRM, + .sigev_value.sival_ptr = t, + .sigev_notify_thread_id = gettid(), + }; + + if (timer_create(CLOCK_MONOTONIC, &sev, t) == -1) + return -1; + + return 0; } -long long disable_timer(void) +int os_timer_set_interval(int cpu, unsigned long long nsecs) { - struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } }); - long long remain, max = UM_NSEC_PER_SEC / UM_HZ; + struct itimerspec its; + + its.it_value.tv_sec = nsecs / UM_NSEC_PER_SEC; + its.it_value.tv_nsec = nsecs % UM_NSEC_PER_SEC; - if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0) - printk(UM_KERN_ERR "disable_timer - setitimer failed, " - "errno = %d\n", errno); + its.it_interval.tv_sec = nsecs / UM_NSEC_PER_SEC; + its.it_interval.tv_nsec = nsecs % UM_NSEC_PER_SEC; - remain = timeval_to_ns(&time.it_value); - if (remain > max) - remain = max; + if (timer_settime(event_high_res_timer[cpu], 0, &its, NULL) == -1) + return -errno; - return remain; + return 0; } -long long os_nsecs(void) +int os_timer_one_shot(int cpu, unsigned long long nsecs) { - struct timeval tv; + struct itimerspec its = { + .it_value.tv_sec = nsecs / UM_NSEC_PER_SEC, + .it_value.tv_nsec = nsecs % UM_NSEC_PER_SEC, - gettimeofday(&tv, NULL); - return timeval_to_ns(&tv); -} + .it_interval.tv_sec = 0, + .it_interval.tv_nsec = 0, // we cheat here + }; -#ifdef UML_CONFIG_NO_HZ_COMMON -static int after_sleep_interval(struct timespec *ts) -{ + timer_settime(event_high_res_timer[cpu], 0, &its, NULL); return 0; } -static void deliver_alarm(void) +/** + * os_timer_disable() - disable the posix (interval) timer + * @cpu: the CPU for which the timer is to be disabled + */ +void os_timer_disable(int cpu) { - alarm_handler(SIGVTALRM, NULL, NULL); -} + struct itimerspec its; -static unsigned long long sleep_time(unsigned long long nsecs) -{ - return nsecs; + memset(&its, 0, sizeof(struct itimerspec)); + timer_settime(event_high_res_timer[cpu], 0, &its, NULL); } -#else -unsigned long long last_tick; -unsigned long long skew; - -static void deliver_alarm(void) +long long os_nsecs(void) { - unsigned long long this_tick = os_nsecs(); - int one_tick = UM_NSEC_PER_SEC / UM_HZ; - - /* Protection against the host's time going backwards */ - if ((last_tick != 0) && (this_tick < last_tick)) - this_tick = last_tick; - - if (last_tick == 0) - last_tick = this_tick - one_tick; - - skew += this_tick - last_tick; - - while (skew >= one_tick) { - alarm_handler(SIGVTALRM, NULL, NULL); - skew -= one_tick; - } + struct timespec ts; - last_tick = this_tick; + clock_gettime(CLOCK_MONOTONIC,&ts); + return timespec_to_ns(&ts); } -static unsigned long long sleep_time(unsigned long long nsecs) -{ - return nsecs > skew ? nsecs - skew : 0; -} +static __thread int wake_signals; -static inline long long timespec_to_us(const struct timespec *ts) +void os_idle_prepare(void) { - return ((long long) ts->tv_sec * UM_USEC_PER_SEC) + - ts->tv_nsec / UM_NSEC_PER_USEC; -} + sigset_t set; -static int after_sleep_interval(struct timespec *ts) -{ - int usec = UM_USEC_PER_SEC / UM_HZ; - long long start_usecs = timespec_to_us(ts); - struct timeval tv; - struct itimerval interval; + sigemptyset(&set); + sigaddset(&set, SIGALRM); + sigaddset(&set, IPI_SIGNAL); /* - * It seems that rounding can increase the value returned from - * setitimer to larger than the one passed in. Over time, - * this will cause the remaining time to be greater than the - * tick interval. If this happens, then just reduce the first - * tick to the interval value. + * We need to use signalfd rather than sigsuspend in idle sleep + * because the IPI signal is a real-time signal that carries data, + * and unlike handling SIGALRM, we cannot simply flag it in + * signals_pending. */ - if (start_usecs > usec) - start_usecs = usec; - - start_usecs -= skew / UM_NSEC_PER_USEC; - if (start_usecs < 0) - start_usecs = 0; - - tv = ((struct timeval) { .tv_sec = start_usecs / UM_USEC_PER_SEC, - .tv_usec = start_usecs % UM_USEC_PER_SEC }); - interval = ((struct itimerval) { { 0, usec }, tv }); - - if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1) - return -errno; - - return 0; + wake_signals = signalfd(-1, &set, SFD_CLOEXEC); + if (wake_signals < 0) + panic("Failed to create signal FD, errno = %d", errno); } -#endif -void idle_sleep(unsigned long long nsecs) +/** + * os_idle_sleep() - sleep until interrupted + */ +void os_idle_sleep(void) { - struct timespec ts; + sigset_t set; /* - * nsecs can come in as zero, in which case, this starts a - * busy loop. To prevent this, reset nsecs to the tick - * interval if it is zero. + * Block SIGALRM while performing the need_resched check. + * Note that, because IRQs are disabled, the IPI signal is + * already blocked. */ - if (nsecs == 0) - nsecs = UM_NSEC_PER_SEC / UM_HZ; + sigemptyset(&set); + sigaddset(&set, SIGALRM); + sigprocmask(SIG_BLOCK, &set, NULL); - nsecs = sleep_time(nsecs); - ts = ((struct timespec) { .tv_sec = nsecs / UM_NSEC_PER_SEC, - .tv_nsec = nsecs % UM_NSEC_PER_SEC }); + /* + * Because disabling IRQs does not block SIGALRM, it is also + * necessary to check for any pending timer alarms. + */ + if (!uml_need_resched() && !timer_alarm_pending()) + os_poll(1, &wake_signals); - if (nanosleep(&ts, &ts) == 0) - deliver_alarm(); - after_sleep_interval(&ts); + /* Restore the signal mask. */ + sigprocmask(SIG_UNBLOCK, &set, NULL); } diff --git a/arch/um/os-Linux/tty.c b/arch/um/os-Linux/tty.c index 721d8afa329b..f784db83e026 100644 --- a/arch/um/os-Linux/tty.c +++ b/arch/um/os-Linux/tty.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index c1dc89261f67..eb523ab1e218 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -35,11 +35,12 @@ static int __init make_uml_dir(void) err = -ENOENT; if (home == NULL) { - printk(UM_KERN_ERR "make_uml_dir : no value in " - "environment for $HOME\n"); + printk(UM_KERN_ERR + "%s: no value in environment for $HOME\n", + __func__); goto err; } - strlcpy(dir, home, sizeof(dir)); + strscpy(dir, home); uml_dir++; } strlcat(dir, uml_dir, sizeof(dir)); @@ -50,13 +51,15 @@ static int __init make_uml_dir(void) err = -ENOMEM; uml_dir = malloc(strlen(dir) + 1); if (uml_dir == NULL) { - printf("make_uml_dir : malloc failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s : malloc failed, errno = %d\n", + __func__, errno); goto err; } strcpy(uml_dir, dir); if ((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)) { - printf("Failed to mkdir '%s': %s\n", uml_dir, strerror(errno)); + printk(UM_KERN_ERR "Failed to mkdir '%s': %s\n", + uml_dir, strerror(errno)); err = -errno; goto err_free; } @@ -94,7 +97,7 @@ static int remove_files_and_dir(char *dir) while ((ent = readdir(directory)) != NULL) { if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) continue; - len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1; + len = strlen(dir) + strlen("/") + strlen(ent->d_name) + 1; if (len > sizeof(file)) { ret = -E2BIG; goto out; @@ -132,18 +135,16 @@ out: */ static inline int is_umdir_used(char *dir) { - char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; - char pid[sizeof("nnnnn\0")], *end; - int dead, fd, p, n, err; - - n = snprintf(file, sizeof(file), "%s/pid", dir); - if (n >= sizeof(file)) { - printk(UM_KERN_ERR "is_umdir_used - pid filename too long\n"); - err = -E2BIG; - goto out; - } + char pid[sizeof("nnnnnnnnn")], *end, *file; + int fd, p, n, err; + size_t filelen = strlen(dir) + sizeof("/pid") + 1; + + file = malloc(filelen); + if (!file) + return -ENOMEM; + + snprintf(file, filelen, "%s/pid", dir); - dead = 0; fd = open(file, O_RDONLY); if (fd < 0) { fd = -errno; @@ -182,6 +183,7 @@ static inline int is_umdir_used(char *dir) out_close: close(fd); out: + free(file); return 0; } @@ -207,18 +209,22 @@ static int umdir_take_if_dead(char *dir) static void __init create_pid_file(void) { - char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; - char pid[sizeof("nnnnn\0")]; + char pid[sizeof("nnnnnnnnn")], *file; int fd, n; - if (umid_file_name("pid", file, sizeof(file))) + n = strlen(uml_dir) + UMID_LEN + sizeof("/pid"); + file = malloc(n); + if (!file) return; + if (umid_file_name("pid", file, n)) + goto out; + fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644); if (fd < 0) { printk(UM_KERN_ERR "Open of machine pid file \"%s\" failed: " "%s\n", file, strerror(errno)); - return; + goto out; } snprintf(pid, sizeof(pid), "%d\n", getpid()); @@ -228,6 +234,8 @@ static void __init create_pid_file(void) errno); close(fd); +out: + free(file); } int __init set_umid(char *name) @@ -235,7 +243,7 @@ int __init set_umid(char *name) if (strlen(name) > UMID_LEN - 1) return -E2BIG; - strlcpy(umid, name, sizeof(umid)); + strscpy(umid, name); return 0; } @@ -254,7 +262,7 @@ static int __init make_umid(void) make_uml_dir(); if (*umid == '\0') { - strlcpy(tmp, uml_dir, sizeof(tmp)); + strscpy(tmp, uml_dir); strlcat(tmp, "XXXXXX", sizeof(tmp)); fd = mkstemp(tmp); if (fd < 0) { @@ -350,8 +358,10 @@ char *get_umid(void) static int __init set_uml_dir(char *name, int *add) { + *add = 0; + if (*name == '\0') { - printf("uml_dir can't be an empty string\n"); + os_warn("uml_dir can't be an empty string\n"); return 0; } @@ -362,7 +372,7 @@ static int __init set_uml_dir(char *name, int *add) uml_dir = malloc(strlen(name) + 2); if (uml_dir == NULL) { - printf("Failed to malloc uml_dir - error = %d\n", errno); + os_warn("Failed to malloc uml_dir - error = %d\n", errno); /* * Return 0 here because do_initcalls doesn't look at @@ -382,13 +392,19 @@ __uml_setup("uml_dir=", set_uml_dir, static void remove_umid_dir(void) { - char dir[strlen(uml_dir) + UMID_LEN + 1], err; + char *dir, err; + + dir = malloc(strlen(uml_dir) + UMID_LEN + 1); + if (!dir) + return; sprintf(dir, "%s%s", uml_dir, umid); err = remove_files_and_dir(dir); if (err) - printf("remove_umid_dir - remove_files_and_dir failed with " - "err = %d\n", err); + os_warn("%s - remove_files_and_dir failed with err = %d\n", + __func__, err); + + free(dir); } __uml_exitcall(remove_umid_dir); diff --git a/arch/um/os-Linux/user_syms.c b/arch/um/os-Linux/user_syms.c index db4a034aeee1..67f6112318b6 100644 --- a/arch/um/os-Linux/user_syms.c +++ b/arch/um/os-Linux/user_syms.c @@ -1,120 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +#define __NO_FORTIFY #include <linux/types.h> #include <linux/module.h> -/* Some of this are builtin function (some are not but could in the future), - * so I *must* declare good prototypes for them and then EXPORT them. - * The kernel code uses the macro defined by include/linux/string.h, - * so I undef macros; the userspace code does not include that and I - * add an EXPORT for the glibc one. +/* + * This file exports some critical string functions and compiler + * built-in functions (where calls are emitted by the compiler + * itself that we cannot avoid even in kernel code) to modules. + * + * "_user.c" code that previously used exports here such as hostfs + * really should be considered part of the 'hypervisor' and define + * its own API boundary like hostfs does now; don't add exports to + * this file for such cases. */ -#undef strlen -#undef strstr -#undef memcpy -#undef memset - -extern size_t strlen(const char *); -extern void *memmove(void *, const void *, size_t); -extern void *memset(void *, int, size_t); -extern int printf(const char *, ...); - /* If it's not defined, the export is included in lib/string.c.*/ #ifdef __HAVE_ARCH_STRSTR +#undef strstr EXPORT_SYMBOL(strstr); #endif #ifndef __x86_64__ +#undef memcpy extern void *memcpy(void *, const void *, size_t); EXPORT_SYMBOL(memcpy); -#endif - +extern void *memmove(void *, const void *, size_t); EXPORT_SYMBOL(memmove); +#undef memset +extern void *memset(void *, int, size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(printf); - -/* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms. - * However, the modules will use the CRC defined *here*, no matter if it is - * good; so the versions of these symbols will always match - */ -#define EXPORT_SYMBOL_PROTO(sym) \ - int sym(void); \ - EXPORT_SYMBOL(sym); - -extern void readdir64(void) __attribute__((weak)); -EXPORT_SYMBOL(readdir64); -extern void truncate64(void) __attribute__((weak)); -EXPORT_SYMBOL(truncate64); - -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA -EXPORT_SYMBOL(vsyscall_ehdr); -EXPORT_SYMBOL(vsyscall_end); #endif -EXPORT_SYMBOL_PROTO(__errno_location); - -EXPORT_SYMBOL_PROTO(access); -EXPORT_SYMBOL_PROTO(open); -EXPORT_SYMBOL_PROTO(open64); -EXPORT_SYMBOL_PROTO(close); -EXPORT_SYMBOL_PROTO(read); -EXPORT_SYMBOL_PROTO(write); -EXPORT_SYMBOL_PROTO(dup2); -EXPORT_SYMBOL_PROTO(__xstat); -EXPORT_SYMBOL_PROTO(__lxstat); -EXPORT_SYMBOL_PROTO(__lxstat64); -EXPORT_SYMBOL_PROTO(__fxstat64); -EXPORT_SYMBOL_PROTO(lseek); -EXPORT_SYMBOL_PROTO(lseek64); -EXPORT_SYMBOL_PROTO(chown); -EXPORT_SYMBOL_PROTO(fchown); -EXPORT_SYMBOL_PROTO(truncate); -EXPORT_SYMBOL_PROTO(ftruncate64); -EXPORT_SYMBOL_PROTO(utime); -EXPORT_SYMBOL_PROTO(utimes); -EXPORT_SYMBOL_PROTO(futimes); -EXPORT_SYMBOL_PROTO(chmod); -EXPORT_SYMBOL_PROTO(fchmod); -EXPORT_SYMBOL_PROTO(rename); -EXPORT_SYMBOL_PROTO(__xmknod); - -EXPORT_SYMBOL_PROTO(symlink); -EXPORT_SYMBOL_PROTO(link); -EXPORT_SYMBOL_PROTO(unlink); -EXPORT_SYMBOL_PROTO(readlink); - -EXPORT_SYMBOL_PROTO(mkdir); -EXPORT_SYMBOL_PROTO(rmdir); -EXPORT_SYMBOL_PROTO(opendir); -EXPORT_SYMBOL_PROTO(readdir); -EXPORT_SYMBOL_PROTO(closedir); -EXPORT_SYMBOL_PROTO(seekdir); -EXPORT_SYMBOL_PROTO(telldir); - -EXPORT_SYMBOL_PROTO(ioctl); - -EXPORT_SYMBOL_PROTO(pread64); -EXPORT_SYMBOL_PROTO(pwrite64); - -EXPORT_SYMBOL_PROTO(statfs); -EXPORT_SYMBOL_PROTO(statfs64); - -EXPORT_SYMBOL_PROTO(getuid); - -EXPORT_SYMBOL_PROTO(fsync); -EXPORT_SYMBOL_PROTO(fdatasync); - -EXPORT_SYMBOL_PROTO(lstat64); -EXPORT_SYMBOL_PROTO(fstat64); -EXPORT_SYMBOL_PROTO(mknod); - -/* Export symbols used by GCC for the stack protector. */ -extern void __stack_smash_handler(void *) __attribute__((weak)); -EXPORT_SYMBOL(__stack_smash_handler); - -extern long __guard __attribute__((weak)); -EXPORT_SYMBOL(__guard); - #ifdef _FORTIFY_SOURCE -extern int __sprintf_chk(char *str, int flag, size_t strlen, const char *format); +extern int __sprintf_chk(char *str, int flag, size_t len, const char *format); EXPORT_SYMBOL(__sprintf_chk); #endif diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 492ef5e6e166..e3ad71a0d13c 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -1,8 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -10,15 +11,16 @@ #include <signal.h> #include <string.h> #include <termios.h> -#include <wait.h> +#include <sys/wait.h> #include <sys/mman.h> #include <sys/utsname.h> +#include <sys/random.h> +#include <init.h> #include <os.h> void stack_protections(unsigned long address) { - if (mprotect((void *) address, UM_THREAD_SIZE, - PROT_READ | PROT_WRITE | PROT_EXEC) < 0) + if (mprotect((void *) address, UM_THREAD_SIZE, PROT_READ | PROT_WRITE) < 0) panic("protecting stack failed, errno = %d", errno); } @@ -49,8 +51,8 @@ void setup_machinename(char *machine_out) struct utsname host; uname(&host); -#ifdef UML_CONFIG_UML_X86 -# ifndef UML_CONFIG_64BIT +#if IS_ENABLED(CONFIG_UML_X86) +# if !IS_ENABLED(CONFIG_64BIT) if (!strcmp(host.machine, "x86_64")) { strcpy(machine_out, "i686"); return; @@ -94,6 +96,21 @@ static inline void __attribute__ ((noreturn)) uml_abort(void) exit(127); } +ssize_t os_getrandom(void *buf, size_t len, unsigned int flags) +{ + return getrandom(buf, len, flags); +} + +/* + * UML helper threads must not handle SIGWINCH/INT/TERM + */ +void os_fix_helper_signals(void) +{ + signal(SIGWINCH, SIG_IGN); + signal(SIGINT, SIG_DFL); + signal(SIGTERM, SIG_DFL); +} + void os_dump_core(void) { int pid; @@ -142,3 +159,51 @@ void um_early_printk(const char *s, unsigned int n) { printf("%.*s", n, s); } + +static int quiet_info; + +static int __init quiet_cmd_param(char *str, int *add) +{ + quiet_info = 1; + return 0; +} + +__uml_setup("quiet", quiet_cmd_param, +"quiet\n" +" Turns off information messages during boot.\n\n"); + +/* + * The os_info/os_warn functions will be called by helper threads. These + * have a very limited stack size and using the libc formatting functions + * may overflow the stack. + * So pull in the kernel vscnprintf and use that instead with a fixed + * on-stack buffer. + */ +int vscnprintf(char *buf, size_t size, const char *fmt, va_list args); + +void os_info(const char *fmt, ...) +{ + char buf[256]; + va_list list; + int len; + + if (quiet_info) + return; + + va_start(list, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, list); + fwrite(buf, len, 1, stderr); + va_end(list); +} + +void os_warn(const char *fmt, ...) +{ + char buf[256]; + va_list list; + int len; + + va_start(list, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, list); + fwrite(buf, len, 1, stderr); + va_end(list); +} |
