summary refs log tree commit diff
path: root/arch/um/os-Linux
diff options
context:
space:
mode:
Diffstat (limited to 'arch/um/os-Linux')
-rw-r--r-- arch/um/os-Linux/Makefile 25
-rw-r--r-- arch/um/os-Linux/aio.c 391
-rw-r--r-- arch/um/os-Linux/drivers/Makefile 13
-rw-r--r-- arch/um/os-Linux/drivers/etap.h 21
-rw-r--r-- arch/um/os-Linux/drivers/ethertap_kern.c 100
-rw-r--r-- arch/um/os-Linux/drivers/ethertap_user.c 248
-rw-r--r-- arch/um/os-Linux/drivers/tuntap.h 21
-rw-r--r-- arch/um/os-Linux/drivers/tuntap_kern.c 86
-rw-r--r-- arch/um/os-Linux/drivers/tuntap_user.c 215
-rw-r--r-- arch/um/os-Linux/elf_aux.c 41
-rw-r--r-- arch/um/os-Linux/execvp.c 3
-rw-r--r-- arch/um/os-Linux/file.c 242
-rw-r--r-- arch/um/os-Linux/helper.c 90
-rw-r--r-- arch/um/os-Linux/internal.h 37
-rw-r--r-- arch/um/os-Linux/irq.c 202
-rw-r--r-- arch/um/os-Linux/main.c 76
-rw-r--r-- arch/um/os-Linux/mem.c 283
-rw-r--r-- arch/um/os-Linux/process.c 153
-rw-r--r-- arch/um/os-Linux/registers.c 37
-rw-r--r-- arch/um/os-Linux/sigio.c 374
-rw-r--r-- arch/um/os-Linux/signal.c 369
-rw-r--r-- arch/um/os-Linux/skas/Makefile 4
-rw-r--r-- arch/um/os-Linux/skas/mem.c 391
-rw-r--r-- arch/um/os-Linux/skas/process.c 1104
-rw-r--r-- arch/um/os-Linux/smp.c 148
-rw-r--r-- arch/um/os-Linux/start_up.c 541
-rw-r--r-- arch/um/os-Linux/time.c 223
-rw-r--r-- arch/um/os-Linux/tty.c 2
-rw-r--r-- arch/um/os-Linux/umid.c 74
-rw-r--r-- arch/um/os-Linux/user_syms.c 117
-rw-r--r-- arch/um/os-Linux/util.c 77
31 files changed, 2485 insertions, 3223 deletions
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile
index 08ff5094fcdd..f8d672d570d9 100644
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -1,20 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
-# Licensed under the GPL
#
-obj-y = aio.o execvp.o file.o helper.o irq.o main.o mem.o process.o \
+# Don't instrument UML-specific code
+KCOV_INSTRUMENT := n
+
+obj-y = elf_aux.o execvp.o file.o helper.o irq.o main.o mem.o process.o \
registers.o sigio.o signal.o start_up.o time.o tty.o \
- umid.o user_syms.o util.o drivers/ skas/
+ umid.o user_syms.o util.o skas/
-obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o
+CFLAGS_signal.o += -Wframe-larger-than=4096
-USER_OBJS := $(user-objs-y) aio.o elf_aux.o execvp.o file.o helper.o irq.o \
- main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \
- tty.o umid.o util.o
+CFLAGS_main.o += -Wno-frame-larger-than
-HAVE_AIO_ABI := $(shell [ -r /usr/include/linux/aio_abi.h ] && \
- echo -DHAVE_AIO_ABI )
-CFLAGS_aio.o += $(HAVE_AIO_ABI)
+obj-$(CONFIG_SMP) += smp.o
+
+USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \
+ main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \
+ tty.o umid.o util.o smp.o
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c
deleted file mode 100644
index 3a6bc2af0961..000000000000
--- a/arch/um/os-Linux/aio.c
+++ /dev/null
@@ -1,391 +0,0 @@
-/*
- * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <unistd.h>
-#include <sched.h>
-#include <signal.h>
-#include <errno.h>
-#include <sys/time.h>
-#include <asm/unistd.h>
-#include <aio.h>
-#include <init.h>
-#include <kern_util.h>
-#include <os.h>
-
-struct aio_thread_req {
- enum aio_type type;
- int io_fd;
- unsigned long long offset;
- char *buf;
- int len;
- struct aio_context *aio;
-};
-
-#if defined(HAVE_AIO_ABI)
-#include <linux/aio_abi.h>
-
-/*
- * If we have the headers, we are going to build with AIO enabled.
- * If we don't have aio in libc, we define the necessary stubs here.
- */
-
-#if !defined(HAVE_AIO_LIBC)
-
-static long io_setup(int n, aio_context_t *ctxp)
-{
- return syscall(__NR_io_setup, n, ctxp);
-}
-
-static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
-{
- return syscall(__NR_io_submit, ctx, nr, iocbpp);
-}
-
-static long io_getevents(aio_context_t ctx_id, long min_nr, long nr,
- struct io_event *events, struct timespec *timeout)
-{
- return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout);
-}
-
-#endif
-
-/*
- * The AIO_MMAP cases force the mmapped page into memory here
- * rather than in whatever place first touches the data. I used
- * to do this by touching the page, but that's delicate because
- * gcc is prone to optimizing that away. So, what's done here
- * is we read from the descriptor from which the page was
- * mapped. The caller is required to pass an offset which is
- * inside the page that was mapped. Thus, when the read
- * returns, we know that the page is in the page cache, and
- * that it now backs the mmapped area.
- */
-
-static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf,
- int len, unsigned long long offset, struct aio_context *aio)
-{
- struct iocb *iocbp = & ((struct iocb) {
- .aio_data = (unsigned long) aio,
- .aio_fildes = fd,
- .aio_buf = (unsigned long) buf,
- .aio_nbytes = len,
- .aio_offset = offset
- });
- char c;
-
- switch (type) {
- case AIO_READ:
- iocbp->aio_lio_opcode = IOCB_CMD_PREAD;
- break;
- case AIO_WRITE:
- iocbp->aio_lio_opcode = IOCB_CMD_PWRITE;
- break;
- case AIO_MMAP:
- iocbp->aio_lio_opcode = IOCB_CMD_PREAD;
- iocbp->aio_buf = (unsigned long) &c;
- iocbp->aio_nbytes = sizeof(c);
- break;
- default:
- printk(UM_KERN_ERR "Bogus op in do_aio - %d\n", type);
- return -EINVAL;
- }
-
- return (io_submit(ctx, 1, &iocbp) > 0) ? 0 : -errno;
-}
-
-/* Initialized in an initcall and unchanged thereafter */
-static aio_context_t ctx = 0;
-
-static int aio_thread(void *arg)
-{
- struct aio_thread_reply reply;
- struct io_event event;
- int err, n, reply_fd;
-
- signal(SIGWINCH, SIG_IGN);
-
- while (1) {
- n = io_getevents(ctx, 1, 1, &event, NULL);
- if (n < 0) {
- if (errno == EINTR)
- continue;
- printk(UM_KERN_ERR "aio_thread - io_getevents failed, "
- "errno = %d\n", errno);
- }
- else {
- reply = ((struct aio_thread_reply)
- { .data = (void *) (long) event.data,
- .err = event.res });
- reply_fd = ((struct aio_context *) reply.data)->reply_fd;
- err = write(reply_fd, &reply, sizeof(reply));
- if (err != sizeof(reply))
- printk(UM_KERN_ERR "aio_thread - write failed, "
- "fd = %d, err = %d\n", reply_fd, errno);
- }
- }
- return 0;
-}
-
-#endif
-
-static int do_not_aio(struct aio_thread_req *req)
-{
- char c;
- unsigned long long actual;
- int n;
-
- actual = lseek64(req->io_fd, req->offset, SEEK_SET);
- if (actual != req->offset)
- return -errno;
-
- switch (req->type) {
- case AIO_READ:
- n = read(req->io_fd, req->buf, req->len);
- break;
- case AIO_WRITE:
- n = write(req->io_fd, req->buf, req->len);
- break;
- case AIO_MMAP:
- n = read(req->io_fd, &c, sizeof(c));
- break;
- default:
- printk(UM_KERN_ERR "do_not_aio - bad request type : %d\n",
- req->type);
- return -EINVAL;
- }
-
- if (n < 0)
- return -errno;
- return 0;
-}
-
-/* These are initialized in initcalls and not changed */
-static int aio_req_fd_r = -1;
-static int aio_req_fd_w = -1;
-static int aio_pid = -1;
-static unsigned long aio_stack;
-
-static int not_aio_thread(void *arg)
-{
- struct aio_thread_req req;
- struct aio_thread_reply reply;
- int err;
-
- signal(SIGWINCH, SIG_IGN);
- while (1) {
- err = read(aio_req_fd_r, &req, sizeof(req));
- if (err != sizeof(req)) {
- if (err < 0)
- printk(UM_KERN_ERR "not_aio_thread - "
- "read failed, fd = %d, err = %d\n",
- aio_req_fd_r,
- errno);
- else {
- printk(UM_KERN_ERR "not_aio_thread - short "
- "read, fd = %d, length = %d\n",
- aio_req_fd_r, err);
- }
- continue;
- }
- err = do_not_aio(&req);
- reply = ((struct aio_thread_reply) { .data = req.aio,
- .err = err });
- err = write(req.aio->reply_fd, &reply, sizeof(reply));
- if (err != sizeof(reply))
- printk(UM_KERN_ERR "not_aio_thread - write failed, "
- "fd = %d, err = %d\n", req.aio->reply_fd, errno);
- }
-
- return 0;
-}
-
-static int init_aio_24(void)
-{
- int fds[2], err;
-
- err = os_pipe(fds, 1, 1);
- if (err)
- goto out;
-
- aio_req_fd_w = fds[0];
- aio_req_fd_r = fds[1];
-
- err = os_set_fd_block(aio_req_fd_w, 0);
- if (err)
- goto out_close_pipe;
-
- err = run_helper_thread(not_aio_thread, NULL,
- CLONE_FILES | CLONE_VM, &aio_stack);
- if (err < 0)
- goto out_close_pipe;
-
- aio_pid = err;
- goto out;
-
-out_close_pipe:
- close(fds[0]);
- close(fds[1]);
- aio_req_fd_w = -1;
- aio_req_fd_r = -1;
-out:
-#ifndef HAVE_AIO_ABI
- printk(UM_KERN_INFO "/usr/include/linux/aio_abi.h not present during "
- "build\n");
-#endif
- printk(UM_KERN_INFO "2.6 host AIO support not used - falling back to "
- "I/O thread\n");
- return 0;
-}
-
-#ifdef HAVE_AIO_ABI
-#define DEFAULT_24_AIO 0
-static int init_aio_26(void)
-{
- int err;
-
- if (io_setup(256, &ctx)) {
- err = -errno;
- printk(UM_KERN_ERR "aio_thread failed to initialize context, "
- "err = %d\n", errno);
- return err;
- }
-
- err = run_helper_thread(aio_thread, NULL,
- CLONE_FILES | CLONE_VM, &aio_stack);
- if (err < 0)
- return err;
-
- aio_pid = err;
-
- printk(UM_KERN_INFO "Using 2.6 host AIO\n");
- return 0;
-}
-
-static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
- unsigned long long offset, struct aio_context *aio)
-{
- struct aio_thread_reply reply;
- int err;
-
- err = do_aio(ctx, type, io_fd, buf, len, offset, aio);
- if (err) {
- reply = ((struct aio_thread_reply) { .data = aio,
- .err = err });
- err = write(aio->reply_fd, &reply, sizeof(reply));
- if (err != sizeof(reply)) {
- err = -errno;
- printk(UM_KERN_ERR "submit_aio_26 - write failed, "
- "fd = %d, err = %d\n", aio->reply_fd, -err);
- }
- else err = 0;
- }
-
- return err;
-}
-
-#else
-#define DEFAULT_24_AIO 1
-static int init_aio_26(void)
-{
- return -ENOSYS;
-}
-
-static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
- unsigned long long offset, struct aio_context *aio)
-{
- return -ENOSYS;
-}
-#endif
-
-/* Initialized in an initcall and unchanged thereafter */
-static int aio_24 = DEFAULT_24_AIO;
-
-static int __init set_aio_24(char *name, int *add)
-{
- aio_24 = 1;
- return 0;
-}
-
-__uml_setup("aio=2.4", set_aio_24,
-"aio=2.4\n"
-" This is used to force UML to use 2.4-style AIO even when 2.6 AIO is\n"
-" available. 2.4 AIO is a single thread that handles one request at a\n"
-" time, synchronously. 2.6 AIO is a thread which uses the 2.6 AIO \n"
-" interface to handle an arbitrary number of pending requests. 2.6 AIO \n"
-" is not available in tt mode, on 2.4 hosts, or when UML is built with\n"
-" /usr/include/linux/aio_abi.h not available. Many distributions don't\n"
-" include aio_abi.h, so you will need to copy it from a kernel tree to\n"
-" your /usr/include/linux in order to build an AIO-capable UML\n\n"
-);
-
-static int init_aio(void)
-{
- int err;
-
- if (!aio_24) {
- err = init_aio_26();
- if (err && (errno == ENOSYS)) {
- printk(UM_KERN_INFO "2.6 AIO not supported on the "
- "host - reverting to 2.4 AIO\n");
- aio_24 = 1;
- }
- else return err;
- }
-
- if (aio_24)
- return init_aio_24();
-
- return 0;
-}
-
-/*
- * The reason for the __initcall/__uml_exitcall asymmetry is that init_aio
- * needs to be called when the kernel is running because it calls run_helper,
- * which needs get_free_page. exit_aio is a __uml_exitcall because the generic
- * kernel does not run __exitcalls on shutdown, and can't because many of them
- * break when called outside of module unloading.
- */
-__initcall(init_aio);
-
-static void exit_aio(void)
-{
- if (aio_pid != -1) {
- os_kill_process(aio_pid, 1);
- free_stack(aio_stack, 0);
- }
-}
-
-__uml_exitcall(exit_aio);
-
-static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len,
- unsigned long long offset, struct aio_context *aio)
-{
- struct aio_thread_req req = { .type = type,
- .io_fd = io_fd,
- .offset = offset,
- .buf = buf,
- .len = len,
- .aio = aio,
- };
- int err;
-
- err = write(aio_req_fd_w, &req, sizeof(req));
- if (err == sizeof(req))
- err = 0;
- else err = -errno;
-
- return err;
-}
-
-int submit_aio(enum aio_type type, int io_fd, char *buf, int len,
- unsigned long long offset, int reply_fd,
- struct aio_context *aio)
-{
- aio->reply_fd = reply_fd;
- if (aio_24)
- return submit_aio_24(type, io_fd, buf, len, offset, aio);
- else
- return submit_aio_26(type, io_fd, buf, len, offset, aio);
-}
diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile
deleted file mode 100644
index 6c546dc9222b..000000000000
--- a/arch/um/os-Linux/drivers/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
-# Licensed under the GPL
-#
-
-ethertap-objs := ethertap_kern.o ethertap_user.o
-tuntap-objs := tuntap_kern.o tuntap_user.o
-
-obj-y =
-obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o
-obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o
-
-include arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/drivers/etap.h b/arch/um/os-Linux/drivers/etap.h
deleted file mode 100644
index 54183a679fdd..000000000000
--- a/arch/um/os-Linux/drivers/etap.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#ifndef __DRIVERS_ETAP_H
-#define __DRIVERS_ETAP_H
-
-#include <net_user.h>
-
-struct ethertap_data {
- char *dev_name;
- char *gate_addr;
- int data_fd;
- int control_fd;
- void *dev;
-};
-
-extern const struct net_user_info ethertap_user_info;
-
-#endif
diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c
deleted file mode 100644
index f424600a583f..000000000000
--- a/arch/um/os-Linux/drivers/ethertap_kern.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 by various other people who didn't put their name here.
- * Licensed under the GPL.
- */
-
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include "etap.h"
-#include <net_kern.h>
-
-struct ethertap_init {
- char *dev_name;
- char *gate_addr;
-};
-
-static void etap_init(struct net_device *dev, void *data)
-{
- struct uml_net_private *pri;
- struct ethertap_data *epri;
- struct ethertap_init *init = data;
-
- pri = netdev_priv(dev);
- epri = (struct ethertap_data *) pri->user;
- epri->dev_name = init->dev_name;
- epri->gate_addr = init->gate_addr;
- epri->data_fd = -1;
- epri->control_fd = -1;
- epri->dev = dev;
-
- printk(KERN_INFO "ethertap backend - %s", epri->dev_name);
- if (epri->gate_addr != NULL)
- printk(KERN_CONT ", IP = %s", epri->gate_addr);
- printk(KERN_CONT "\n");
-}
-
-static int etap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
- int len;
-
- len = net_recvfrom(fd, skb_mac_header(skb),
- skb->dev->mtu + 2 + ETH_HEADER_ETHERTAP);
- if (len <= 0)
- return(len);
-
- skb_pull(skb, 2);
- len -= 2;
- return len;
-}
-
-static int etap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
- skb_push(skb, 2);
- return net_send(fd, skb->data, skb->len);
-}
-
-const struct net_kern_info ethertap_kern_info = {
- .init = etap_init,
- .protocol = eth_protocol,
- .read = etap_read,
- .write = etap_write,
-};
-
-int ethertap_setup(char *str, char **mac_out, void *data)
-{
- struct ethertap_init *init = data;
-
- *init = ((struct ethertap_init)
- { .dev_name = NULL,
- .gate_addr = NULL });
- if (tap_setup_common(str, "ethertap", &init->dev_name, mac_out,
- &init->gate_addr))
- return 0;
- if (init->dev_name == NULL) {
- printk(KERN_ERR "ethertap_setup : Missing tap device name\n");
- return 0;
- }
-
- return 1;
-}
-
-static struct transport ethertap_transport = {
- .list = LIST_HEAD_INIT(ethertap_transport.list),
- .name = "ethertap",
- .setup = ethertap_setup,
- .user = &ethertap_user_info,
- .kern = &ethertap_kern_info,
- .private_size = sizeof(struct ethertap_data),
- .setup_size = sizeof(struct ethertap_init),
-};
-
-static int register_ethertap(void)
-{
- register_transport(&ethertap_transport);
- return 0;
-}
-
-late_initcall(register_ethertap);
diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c
deleted file mode 100644
index b39b6696ac58..000000000000
--- a/arch/um/os-Linux/drivers/ethertap_user.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 by various other people who didn't put their name here.
- * Licensed under the GPL.
- */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/wait.h>
-#include "etap.h"
-#include <os.h>
-#include <net_user.h>
-#include <um_malloc.h>
-
-#define MAX_PACKET ETH_MAX_PACKET
-
-static int etap_user_init(void *data, void *dev)
-{
- struct ethertap_data *pri = data;
-
- pri->dev = dev;
- return 0;
-}
-
-struct addr_change {
- enum { ADD_ADDR, DEL_ADDR } what;
- unsigned char addr[4];
- unsigned char netmask[4];
-};
-
-static void etap_change(int op, unsigned char *addr, unsigned char *netmask,
- int fd)
-{
- struct addr_change change;
- char *output;
- int n;
-
- change.what = op;
- memcpy(change.addr, addr, sizeof(change.addr));
- memcpy(change.netmask, netmask, sizeof(change.netmask));
- CATCH_EINTR(n = write(fd, &change, sizeof(change)));
- if (n != sizeof(change)) {
- printk(UM_KERN_ERR "etap_change - request failed, err = %d\n",
- errno);
- return;
- }
-
- output = uml_kmalloc(UM_KERN_PAGE_SIZE, UM_GFP_KERNEL);
- if (output == NULL)
- printk(UM_KERN_ERR "etap_change : Failed to allocate output "
- "buffer\n");
- read_output(fd, output, UM_KERN_PAGE_SIZE);
- if (output != NULL) {
- printk("%s", output);
- kfree(output);
- }
-}
-
-static void etap_open_addr(unsigned char *addr, unsigned char *netmask,
- void *arg)
-{
- etap_change(ADD_ADDR, addr, netmask, *((int *) arg));
-}
-
-static void etap_close_addr(unsigned char *addr, unsigned char *netmask,
- void *arg)
-{
- etap_change(DEL_ADDR, addr, netmask, *((int *) arg));
-}
-
-struct etap_pre_exec_data {
- int control_remote;
- int control_me;
- int data_me;
-};
-
-static void etap_pre_exec(void *arg)
-{
- struct etap_pre_exec_data *data = arg;
-
- dup2(data->control_remote, 1);
- close(data->data_me);
- close(data->control_me);
-}
-
-static int etap_tramp(char *dev, char *gate, int control_me,
- int control_remote, int data_me, int data_remote)
-{
- struct etap_pre_exec_data pe_data;
- int pid, err, n;
- char version_buf[sizeof("nnnnn\0")];
- char data_fd_buf[sizeof("nnnnnn\0")];
- char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")];
- char *setup_args[] = { "uml_net", version_buf, "ethertap", dev,
- data_fd_buf, gate_buf, NULL };
- char *nosetup_args[] = { "uml_net", version_buf, "ethertap",
- dev, data_fd_buf, NULL };
- char **args, c;
-
- sprintf(data_fd_buf, "%d", data_remote);
- sprintf(version_buf, "%d", UML_NET_VERSION);
- if (gate != NULL) {
- strcpy(gate_buf, gate);
- args = setup_args;
- }
- else args = nosetup_args;
-
- err = 0;
- pe_data.control_remote = control_remote;
- pe_data.control_me = control_me;
- pe_data.data_me = data_me;
- pid = run_helper(etap_pre_exec, &pe_data, args);
-
- if (pid < 0)
- err = pid;
- close(data_remote);
- close(control_remote);
- CATCH_EINTR(n = read(control_me, &c, sizeof(c)));
- if (n != sizeof(c)) {
- err = -errno;
- printk(UM_KERN_ERR "etap_tramp : read of status failed, "
- "err = %d\n", -err);
- return err;
- }
- if (c != 1) {
- printk(UM_KERN_ERR "etap_tramp : uml_net failed\n");
- err = helper_wait(pid);
- }
- return err;
-}
-
-static int etap_open(void *data)
-{
- struct ethertap_data *pri = data;
- char *output;
- int data_fds[2], control_fds[2], err, output_len;
-
- err = tap_open_common(pri->dev, pri->gate_addr);
- if (err)
- return err;
-
- err = socketpair(AF_UNIX, SOCK_DGRAM, 0, data_fds);
- if (err) {
- err = -errno;
- printk(UM_KERN_ERR "etap_open - data socketpair failed - "
- "err = %d\n", errno);
- return err;
- }
-
- err = socketpair(AF_UNIX, SOCK_STREAM, 0, control_fds);
- if (err) {
- err = -errno;
- printk(UM_KERN_ERR "etap_open - control socketpair failed - "
- "err = %d\n", errno);
- goto out_close_data;
- }
-
- err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0],
- control_fds[1], data_fds[0], data_fds[1]);
- output_len = UM_KERN_PAGE_SIZE;
- output = uml_kmalloc(output_len, UM_GFP_KERNEL);
- read_output(control_fds[0], output, output_len);
-
- if (output == NULL)
- printk(UM_KERN_ERR "etap_open : failed to allocate output "
- "buffer\n");
- else {
- printk("%s", output);
- kfree(output);
- }
-
- if (err < 0) {
- printk(UM_KERN_ERR "etap_tramp failed - err = %d\n", -err);
- goto out_close_control;
- }
-
- pri->data_fd = data_fds[0];
- pri->control_fd = control_fds[0];
- iter_addresses(pri->dev, etap_open_addr, &pri->control_fd);
- return data_fds[0];
-
-out_close_control:
- close(control_fds[0]);
- close(control_fds[1]);
-out_close_data:
- close(data_fds[0]);
- close(data_fds[1]);
- return err;
-}
-
-static void etap_close(int fd, void *data)
-{
- struct ethertap_data *pri = data;
-
- iter_addresses(pri->dev, etap_close_addr, &pri->control_fd);
- close(fd);
-
- if (shutdown(pri->data_fd, SHUT_RDWR) < 0)
- printk(UM_KERN_ERR "etap_close - shutdown data socket failed, "
- "errno = %d\n", errno);
-
- if (shutdown(pri->control_fd, SHUT_RDWR) < 0)
- printk(UM_KERN_ERR "etap_close - shutdown control socket "
- "failed, errno = %d\n", errno);
-
- close(pri->data_fd);
- pri->data_fd = -1;
- close(pri->control_fd);
- pri->control_fd = -1;
-}
-
-static void etap_add_addr(unsigned char *addr, unsigned char *netmask,
- void *data)
-{
- struct ethertap_data *pri = data;
-
- tap_check_ips(pri->gate_addr, addr);
- if (pri->control_fd == -1)
- return;
- etap_open_addr(addr, netmask, &pri->control_fd);
-}
-
-static void etap_del_addr(unsigned char *addr, unsigned char *netmask,
- void *data)
-{
- struct ethertap_data *pri = data;
-
- if (pri->control_fd == -1)
- return;
-
- etap_close_addr(addr, netmask, &pri->control_fd);
-}
-
-const struct net_user_info ethertap_user_info = {
- .init = etap_user_init,
- .open = etap_open,
- .close = etap_close,
- .remove = NULL,
- .add_address = etap_add_addr,
- .delete_address = etap_del_addr,
- .mtu = ETH_MAX_PACKET,
- .max_packet = ETH_MAX_PACKET + ETH_HEADER_ETHERTAP,
-};
diff --git a/arch/um/os-Linux/drivers/tuntap.h b/arch/um/os-Linux/drivers/tuntap.h
deleted file mode 100644
index 7367354ac8df..000000000000
--- a/arch/um/os-Linux/drivers/tuntap.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#ifndef __UM_TUNTAP_H
-#define __UM_TUNTAP_H
-
-#include <net_user.h>
-
-struct tuntap_data {
- char *dev_name;
- int fixed_config;
- char *gate_addr;
- int fd;
- void *dev;
-};
-
-extern const struct net_user_info tuntap_user_info;
-
-#endif
diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c
deleted file mode 100644
index d9d56e5810fe..000000000000
--- a/arch/um/os-Linux/drivers/tuntap_kern.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/netdevice.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <asm/errno.h>
-#include <net_kern.h>
-#include "tuntap.h"
-
-struct tuntap_init {
- char *dev_name;
- char *gate_addr;
-};
-
-static void tuntap_init(struct net_device *dev, void *data)
-{
- struct uml_net_private *pri;
- struct tuntap_data *tpri;
- struct tuntap_init *init = data;
-
- pri = netdev_priv(dev);
- tpri = (struct tuntap_data *) pri->user;
- tpri->dev_name = init->dev_name;
- tpri->fixed_config = (init->dev_name != NULL);
- tpri->gate_addr = init->gate_addr;
- tpri->fd = -1;
- tpri->dev = dev;
-
- printk(KERN_INFO "TUN/TAP backend - ");
- if (tpri->gate_addr != NULL)
- printk(KERN_CONT "IP = %s", tpri->gate_addr);
- printk(KERN_CONT "\n");
-}
-
-static int tuntap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
- return net_read(fd, skb_mac_header(skb),
- skb->dev->mtu + ETH_HEADER_OTHER);
-}
-
-static int tuntap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
- return net_write(fd, skb->data, skb->len);
-}
-
-const struct net_kern_info tuntap_kern_info = {
- .init = tuntap_init,
- .protocol = eth_protocol,
- .read = tuntap_read,
- .write = tuntap_write,
-};
-
-int tuntap_setup(char *str, char **mac_out, void *data)
-{
- struct tuntap_init *init = data;
-
- *init = ((struct tuntap_init)
- { .dev_name = NULL,
- .gate_addr = NULL });
- if (tap_setup_common(str, "tuntap", &init->dev_name, mac_out,
- &init->gate_addr))
- return 0;
-
- return 1;
-}
-
-static struct transport tuntap_transport = {
- .list = LIST_HEAD_INIT(tuntap_transport.list),
- .name = "tuntap",
- .setup = tuntap_setup,
- .user = &tuntap_user_info,
- .kern = &tuntap_kern_info,
- .private_size = sizeof(struct tuntap_data),
- .setup_size = sizeof(struct tuntap_init),
-};
-
-static int register_tuntap(void)
-{
- register_transport(&tuntap_transport);
- return 0;
-}
-
-late_initcall(register_tuntap);
diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c
deleted file mode 100644
index 14126d9176aa..000000000000
--- a/arch/um/os-Linux/drivers/tuntap_user.c
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <linux/if_tun.h>
-#include <net/if.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <kern_util.h>
-#include <os.h>
-#include "tuntap.h"
-
-static int tuntap_user_init(void *data, void *dev)
-{
- struct tuntap_data *pri = data;
-
- pri->dev = dev;
- return 0;
-}
-
-static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask,
- void *data)
-{
- struct tuntap_data *pri = data;
-
- tap_check_ips(pri->gate_addr, addr);
- if ((pri->fd == -1) || pri->fixed_config)
- return;
- open_addr(addr, netmask, pri->dev_name);
-}
-
-static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask,
- void *data)
-{
- struct tuntap_data *pri = data;
-
- if ((pri->fd == -1) || pri->fixed_config)
- return;
- close_addr(addr, netmask, pri->dev_name);
-}
-
-struct tuntap_pre_exec_data {
- int stdout;
- int close_me;
-};
-
-static void tuntap_pre_exec(void *arg)
-{
- struct tuntap_pre_exec_data *data = arg;
-
- dup2(data->stdout, 1);
- close(data->close_me);
-}
-
-static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote,
- char *buffer, int buffer_len, int *used_out)
-{
- struct tuntap_pre_exec_data data;
- char version_buf[sizeof("nnnnn\0")];
- char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate,
- NULL };
- char buf[CMSG_SPACE(sizeof(*fd_out))];
- struct msghdr msg;
- struct cmsghdr *cmsg;
- struct iovec iov;
- int pid, n, err;
-
- sprintf(version_buf, "%d", UML_NET_VERSION);
-
- data.stdout = remote;
- data.close_me = me;
-
- pid = run_helper(tuntap_pre_exec, &data, argv);
-
- if (pid < 0)
- return -pid;
-
- close(remote);
-
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- if (buffer != NULL) {
- iov = ((struct iovec) { buffer, buffer_len });
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- }
- else {
- msg.msg_iov = NULL;
- msg.msg_iovlen = 0;
- }
- msg.msg_control = buf;
- msg.msg_controllen = sizeof(buf);
- msg.msg_flags = 0;
- n = recvmsg(me, &msg, 0);
- *used_out = n;
- if (n < 0) {
- err = -errno;
- printk(UM_KERN_ERR "tuntap_open_tramp : recvmsg failed - "
- "errno = %d\n", errno);
- return err;
- }
- helper_wait(pid);
-
- cmsg = CMSG_FIRSTHDR(&msg);
- if (cmsg == NULL) {
- printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a "
- "message\n");
- return -EINVAL;
- }
- if ((cmsg->cmsg_level != SOL_SOCKET) ||
- (cmsg->cmsg_type != SCM_RIGHTS)) {
- printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a "
- "descriptor\n");
- return -EINVAL;
- }
- *fd_out = ((int *) CMSG_DATA(cmsg))[0];
- os_set_exec_close(*fd_out);
- return 0;
-}
-
-static int tuntap_open(void *data)
-{
- struct ifreq ifr;
- struct tuntap_data *pri = data;
- char *output, *buffer;
- int err, fds[2], len, used;
-
- err = tap_open_common(pri->dev, pri->gate_addr);
- if (err < 0)
- return err;
-
- if (pri->fixed_config) {
- pri->fd = os_open_file("/dev/net/tun",
- of_cloexec(of_rdwr(OPENFLAGS())), 0);
- if (pri->fd < 0) {
- printk(UM_KERN_ERR "Failed to open /dev/net/tun, "
- "err = %d\n", -pri->fd);
- return pri->fd;
- }
- memset(&ifr, 0, sizeof(ifr));
- ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
- strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name));
- if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) {
- err = -errno;
- printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n",
- errno);
- close(pri->fd);
- return err;
- }
- }
- else {
- err = socketpair(AF_UNIX, SOCK_DGRAM, 0, fds);
- if (err) {
- err = -errno;
- printk(UM_KERN_ERR "tuntap_open : socketpair failed - "
- "errno = %d\n", errno);
- return err;
- }
-
- buffer = get_output_buffer(&len);
- if (buffer != NULL)
- len--;
- used = 0;
-
- err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0],
- fds[1], buffer, len, &used);
-
- output = buffer;
- if (err < 0) {
- printk("%s", output);
- free_output_buffer(buffer);
- printk(UM_KERN_ERR "tuntap_open_tramp failed - "
- "err = %d\n", -err);
- return err;
- }
-
- pri->dev_name = uml_strdup(buffer);
- output += IFNAMSIZ;
- printk("%s", output);
- free_output_buffer(buffer);
-
- close(fds[0]);
- iter_addresses(pri->dev, open_addr, pri->dev_name);
- }
-
- return pri->fd;
-}
-
-static void tuntap_close(int fd, void *data)
-{
- struct tuntap_data *pri = data;
-
- if (!pri->fixed_config)
- iter_addresses(pri->dev, close_addr, pri->dev_name);
- close(fd);
- pri->fd = -1;
-}
-
-const struct net_user_info tuntap_user_info = {
- .init = tuntap_user_init,
- .open = tuntap_open,
- .close = tuntap_close,
- .remove = NULL,
- .add_address = tuntap_add_addr,
- .delete_address = tuntap_del_addr,
- .mtu = ETH_MAX_PACKET,
- .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER,
-};
diff --git a/arch/um/os-Linux/elf_aux.c b/arch/um/os-Linux/elf_aux.c
index 1a365ddc4d02..72f416edf252 100644
--- a/arch/um/os-Linux/elf_aux.c
+++ b/arch/um/os-Linux/elf_aux.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* arch/um/kernel/elf_aux.c
*
- * Scan the Elf auxiliary vector provided by the host to extract
+ * Scan the ELF auxiliary vector provided by the host to extract
* information about vsyscall-page, etc.
*
* Copyright (C) 2004 Fujitsu Siemens Computers GmbH
@@ -12,37 +13,27 @@
#include <init.h>
#include <elf_user.h>
#include <mem_user.h>
+#include "internal.h"
+#include <linux/swab.h>
+#if __BITS_PER_LONG == 64
+typedef Elf64_auxv_t elf_auxv_t;
+#else
typedef Elf32_auxv_t elf_auxv_t;
+#endif
/* These are initialized very early in boot and never changed */
char * elf_aux_platform;
-extern long elf_aux_hwcap;
-unsigned long vsyscall_ehdr;
-unsigned long vsyscall_end;
-unsigned long __kernel_vsyscall;
+long elf_aux_hwcap;
__init void scan_elf_aux( char **envp)
{
- long page_size = 0;
elf_auxv_t * auxv;
while ( *envp++ != NULL) ;
for ( auxv = (elf_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) {
switch ( auxv->a_type ) {
- case AT_SYSINFO:
- __kernel_vsyscall = auxv->a_un.a_val;
- /* See if the page is under TASK_SIZE */
- if (__kernel_vsyscall < (unsigned long) envp)
- __kernel_vsyscall = 0;
- break;
- case AT_SYSINFO_EHDR:
- vsyscall_ehdr = auxv->a_un.a_val;
- /* See if the page is under TASK_SIZE */
- if (vsyscall_ehdr < (unsigned long) envp)
- vsyscall_ehdr = 0;
- break;
case AT_HWCAP:
elf_aux_hwcap = auxv->a_un.a_val;
break;
@@ -54,20 +45,6 @@ __init void scan_elf_aux( char **envp)
elf_aux_platform =
(char *) (long) auxv->a_un.a_val;
break;
- case AT_PAGESZ:
- page_size = auxv->a_un.a_val;
- break;
}
}
- if ( ! __kernel_vsyscall || ! vsyscall_ehdr ||
- ! elf_aux_hwcap || ! elf_aux_platform ||
- ! page_size || (vsyscall_ehdr % page_size) ) {
- __kernel_vsyscall = 0;
- vsyscall_ehdr = 0;
- elf_aux_hwcap = 0;
- elf_aux_platform = "i586";
- }
- else {
- vsyscall_end = vsyscall_ehdr + page_size;
- }
}
diff --git a/arch/um/os-Linux/execvp.c b/arch/um/os-Linux/execvp.c
index 8fb25ca07c46..c09a5fd5e225 100644
--- a/arch/um/os-Linux/execvp.c
+++ b/arch/um/os-Linux/execvp.c
@@ -93,6 +93,7 @@ int execvp_noalloc(char *buf, const char *file, char *const argv[])
up finding no executable we can use, we want to diagnose
that we did find one but were denied access. */
got_eacces = 1;
+ break;
case ENOENT:
case ESTALE:
case ENOTDIR:
@@ -136,7 +137,7 @@ int main(int argc, char**argv)
int ret;
argc--;
if (!argc) {
- fprintf(stderr, "Not enough arguments\n");
+ os_warn("Not enough arguments\n");
return 1;
}
argv++;
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index c17bd6f7d674..21f0e50fb1df 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -1,18 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdio.h>
#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
+#include <linux/falloc.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/stat.h>
+#include <sys/sysmacros.h>
#include <sys/un.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/eventfd.h>
+#include <poll.h>
#include <os.h>
static void copy_stat(struct uml_stat *dst, const struct stat64 *src)
@@ -98,21 +106,6 @@ int os_get_ifname(int fd, char* namebuf)
return 0;
}
-int os_set_slip(int fd)
-{
- int disc, sencap;
-
- disc = N_SLIP;
- if (ioctl(fd, TIOCSETD, &disc) < 0)
- return -errno;
-
- sencap = 0;
- if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0)
- return -errno;
-
- return 0;
-}
-
int os_mode_fd(int fd, int mode)
{
int err;
@@ -233,6 +226,16 @@ out:
return err;
}
+int os_dup_file(int fd)
+{
+ int new_fd = dup(fd);
+
+ if (new_fd < 0)
+ return -errno;
+
+ return new_fd;
+}
+
void os_close_file(int fd)
{
close(fd);
@@ -257,6 +260,15 @@ int os_read_file(int fd, void *buf, int len)
return n;
}
+int os_pread_file(int fd, void *buf, int len, unsigned long long offset)
+{
+ int n = pread(fd, buf, len, offset);
+
+ if (n < 0)
+ return -errno;
+ return n;
+}
+
int os_write_file(int fd, const void *buf, int len)
{
int n = write(fd, (void *) buf, len);
@@ -266,6 +278,25 @@ int os_write_file(int fd, const void *buf, int len)
return n;
}
+int os_sync_file(int fd)
+{
+ int n = fdatasync(fd);
+
+ if (n < 0)
+ return -errno;
+ return n;
+}
+
+int os_pwrite_file(int fd, const void *buf, int len, unsigned long long offset)
+{
+ int n = pwrite(fd, (void *) buf, len, offset);
+
+ if (n < 0)
+ return -errno;
+ return n;
+}
+
+
int os_file_size(const char *file, unsigned long long *size_out)
{
struct uml_stat buf;
@@ -304,7 +335,7 @@ int os_file_size(const char *file, unsigned long long *size_out)
return 0;
}
-int os_file_modtime(const char *file, unsigned long *modtime)
+int os_file_modtime(const char *file, long long *modtime)
{
struct uml_stat buf;
int err;
@@ -461,44 +492,51 @@ int os_shutdown_socket(int fd, int r, int w)
return 0;
}
-int os_rcv_fd(int fd, int *helper_pid_out)
+/**
+ * os_rcv_fd_msg - receive message with (optional) FDs
+ * @fd: the FD to receive from
+ * @fds: the array for FDs to write to
+ * @n_fds: number of FDs to receive (@fds array size)
+ * @data: the message buffer
+ * @data_len: the size of the message to receive
+ *
+ * Receive a message with FDs.
+ *
+ * Returns: the size of the received message, or an error code
+ */
+ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds,
+ void *data, size_t data_len)
{
- int new, n;
- char buf[CMSG_SPACE(sizeof(new))];
- struct msghdr msg;
+#define MAX_RCV_FDS 2
+ char buf[CMSG_SPACE(sizeof(*fds) * MAX_RCV_FDS)];
struct cmsghdr *cmsg;
- struct iovec iov;
-
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- iov = ((struct iovec) { .iov_base = helper_pid_out,
- .iov_len = sizeof(*helper_pid_out) });
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- msg.msg_control = buf;
- msg.msg_controllen = sizeof(buf);
- msg.msg_flags = 0;
+ struct iovec iov = {
+ .iov_base = data,
+ .iov_len = data_len,
+ };
+ struct msghdr msg = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = buf,
+ .msg_controllen = CMSG_SPACE(sizeof(*fds) * n_fds),
+ };
+ int n;
+
+ if (n_fds > MAX_RCV_FDS)
+ return -EINVAL;
n = recvmsg(fd, &msg, 0);
if (n < 0)
return -errno;
- else if (n != iov.iov_len)
- *helper_pid_out = -1;
cmsg = CMSG_FIRSTHDR(&msg);
- if (cmsg == NULL) {
- printk(UM_KERN_ERR "rcv_fd didn't receive anything, "
- "error = %d\n", errno);
- return -1;
- }
- if ((cmsg->cmsg_level != SOL_SOCKET) ||
- (cmsg->cmsg_type != SCM_RIGHTS)) {
- printk(UM_KERN_ERR "rcv_fd didn't receive a descriptor\n");
- return -1;
- }
+ if (!cmsg ||
+ cmsg->cmsg_level != SOL_SOCKET ||
+ cmsg->cmsg_type != SCM_RIGHTS)
+ return n;
- new = ((int *) CMSG_DATA(cmsg))[0];
- return new;
+ memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0));
+ return n;
}
int os_create_unix_socket(const char *file, int len, int close_on_exec)
@@ -574,3 +612,115 @@ unsigned long long os_makedev(unsigned major, unsigned minor)
{
return makedev(major, minor);
}
+
+int os_falloc_punch(int fd, unsigned long long offset, int len)
+{
+ int n = fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, offset, len);
+
+ if (n < 0)
+ return -errno;
+ return n;
+}
+
+int os_falloc_zeroes(int fd, unsigned long long offset, int len)
+{
+ int n = fallocate(fd, FALLOC_FL_ZERO_RANGE|FALLOC_FL_KEEP_SIZE, offset, len);
+
+ if (n < 0)
+ return -errno;
+ return n;
+}
+
+int os_eventfd(unsigned int initval, int flags)
+{
+ int fd = eventfd(initval, flags);
+
+ if (fd < 0)
+ return -errno;
+ return fd;
+}
+
+int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds,
+ unsigned int fds_num)
+{
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = len,
+ };
+ union {
+ char control[CMSG_SPACE(sizeof(*fds) * OS_SENDMSG_MAX_FDS)];
+ struct cmsghdr align;
+ } u;
+ unsigned int fds_size = sizeof(*fds) * fds_num;
+ struct msghdr msg = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = u.control,
+ .msg_controllen = CMSG_SPACE(fds_size),
+ };
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ int err;
+
+ if (fds_num > OS_SENDMSG_MAX_FDS)
+ return -EINVAL;
+ memset(u.control, 0, sizeof(u.control));
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(fds_size);
+ memcpy(CMSG_DATA(cmsg), fds, fds_size);
+ err = sendmsg(fd, &msg, 0);
+
+ if (err < 0)
+ return -errno;
+ return err;
+}
+
+int os_poll(unsigned int n, const int *fds)
+{
+ /* currently need 2 FDs at most so avoid dynamic allocation */
+ struct pollfd pollfds[2] = {};
+ unsigned int i;
+ int ret;
+
+ if (n > ARRAY_SIZE(pollfds))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ pollfds[i].fd = fds[i];
+ pollfds[i].events = POLLIN;
+ }
+
+ ret = poll(pollfds, n, -1);
+ if (ret < 0)
+ return -errno;
+
+ /* Return the index of the available FD */
+ for (i = 0; i < n; i++) {
+ if (pollfds[i].revents)
+ return i;
+ }
+
+ return -EIO;
+}
+
+void *os_mmap_rw_shared(int fd, size_t size)
+{
+ void *res = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+ if (res == MAP_FAILED)
+ return NULL;
+
+ return res;
+}
+
+void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size)
+{
+ void *res;
+
+ res = mremap(old_addr, old_size, new_size, MREMAP_MAYMOVE, NULL);
+
+ if (res == MAP_FAILED)
+ return NULL;
+
+ return res;
+}
diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c
index e3ee4a51ef63..89c2ad2a4e3a 100644
--- a/arch/um/os-Linux/helper.c
+++ b/arch/um/os-Linux/helper.c
@@ -1,12 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdlib.h>
+#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sched.h>
+#include <pthread.h>
#include <linux/limits.h>
#include <sys/socket.h>
#include <sys/wait.h>
@@ -45,7 +47,7 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv)
unsigned long stack, sp;
int pid, fds[2], ret, n;
- stack = alloc_stack(0, __cant_sleep());
+ stack = alloc_stack(0, __uml_cant_sleep());
if (stack == 0)
return -ENOMEM;
@@ -64,12 +66,12 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv)
goto out_close;
}
- sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *);
+ sp = stack + UM_KERN_PAGE_SIZE;
data.pre_exec = pre_exec;
data.pre_data = pre_data;
data.argv = argv;
data.fd = fds[1];
- data.buf = __cant_sleep() ? uml_kmalloc(PATH_MAX, UM_GFP_ATOMIC) :
+ data.buf = __uml_cant_sleep() ? uml_kmalloc(PATH_MAX, UM_GFP_ATOMIC) :
uml_kmalloc(PATH_MAX, UM_GFP_KERNEL);
pid = clone(helper_child, (void *) sp, CLONE_VM, &data);
if (pid < 0) {
@@ -96,9 +98,13 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv)
"ret = %d\n", -n);
ret = n;
}
- CATCH_EINTR(waitpid(pid, NULL, __WCLONE));
+ CATCH_EINTR(waitpid(pid, NULL, __WALL));
}
+ if (ret < 0)
+ printk(UM_KERN_ERR "run_helper : failed to exec %s on host: %s\n",
+ argv[0], strerror(-ret));
+
out_free2:
kfree(data.buf);
out_close:
@@ -116,11 +122,15 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags,
unsigned long stack, sp;
int pid, status, err;
- stack = alloc_stack(0, __cant_sleep());
+ /* To share memory space, use os_run_helper_thread() instead. */
+ if (flags & CLONE_VM)
+ return -EINVAL;
+
+ stack = alloc_stack(0, __uml_cant_sleep());
if (stack == 0)
return -ENOMEM;
- sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *);
+ sp = stack + UM_KERN_PAGE_SIZE;
pid = clone(proc, (void *) sp, flags, arg);
if (pid < 0) {
err = -errno;
@@ -129,7 +139,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags,
return err;
}
if (stack_out == NULL) {
- CATCH_EINTR(pid = waitpid(pid, &status, __WCLONE));
+ CATCH_EINTR(pid = waitpid(pid, &status, __WALL));
if (pid < 0) {
err = -errno;
printk(UM_KERN_ERR "run_helper_thread - wait failed, "
@@ -148,7 +158,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags,
int helper_wait(int pid)
{
int ret, status;
- int wflags = __WCLONE;
+ int wflags = __WALL;
CATCH_EINTR(ret = waitpid(pid, &status, wflags));
if (ret < 0) {
@@ -162,3 +172,65 @@ int helper_wait(int pid)
} else
return 0;
}
+
+struct os_helper_thread {
+ pthread_t handle;
+};
+
+int os_run_helper_thread(struct os_helper_thread **td_out,
+ void *(*routine)(void *), void *arg)
+{
+ struct os_helper_thread *td;
+ sigset_t sigset, oset;
+ int err, flags;
+
+ flags = __uml_cant_sleep() ? UM_GFP_ATOMIC : UM_GFP_KERNEL;
+ td = uml_kmalloc(sizeof(*td), flags);
+ if (!td)
+ return -ENOMEM;
+
+ sigfillset(&sigset);
+ if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) {
+ err = -errno;
+ kfree(td);
+ return err;
+ }
+
+ err = pthread_create(&td->handle, NULL, routine, arg);
+
+ if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0)
+ panic("Failed to restore the signal mask: %d", errno);
+
+ if (err != 0)
+ kfree(td);
+ else
+ *td_out = td;
+
+ return -err;
+}
+
+void os_kill_helper_thread(struct os_helper_thread *td)
+{
+ pthread_cancel(td->handle);
+ pthread_join(td->handle, NULL);
+ kfree(td);
+}
+
+void os_fix_helper_thread_signals(void)
+{
+ sigset_t sigset;
+
+ sigemptyset(&sigset);
+
+ sigaddset(&sigset, SIGWINCH);
+ sigaddset(&sigset, SIGPIPE);
+ sigaddset(&sigset, SIGPROF);
+ sigaddset(&sigset, SIGINT);
+ sigaddset(&sigset, SIGTERM);
+ sigaddset(&sigset, SIGCHLD);
+ sigaddset(&sigset, SIGALRM);
+ sigaddset(&sigset, SIGIO);
+ sigaddset(&sigset, SIGUSR1);
+
+ pthread_sigmask(SIG_SETMASK, &sigset, NULL);
+}
diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h
index 0dc2c9f135f6..bac9fcc8c14c 100644
--- a/arch/um/os-Linux/internal.h
+++ b/arch/um/os-Linux/internal.h
@@ -1 +1,36 @@
-void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc);
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_OS_LINUX_INTERNAL_H
+#define __UM_OS_LINUX_INTERNAL_H
+
+#include <mm_id.h>
+#include <stub-data.h>
+#include <signal.h>
+
+/*
+ * elf_aux.c
+ */
+void scan_elf_aux(char **envp);
+
+/*
+ * mem.c
+ */
+void check_tmpexec(void);
+
+/*
+ * signal.c
+ */
+extern __thread int signals_enabled;
+int timer_alarm_pending(void);
+
+/*
+ * skas/process.c
+ */
+void wait_stub_done(int pid);
+void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys);
+
+/*
+ * smp.c
+ */
+#define IPI_SIGNAL SIGRTMIN
+
+#endif /* __UM_OS_LINUX_INTERNAL_H */
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index b9afb74b79ad..cf7e49c08b21 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -1,135 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2017 - Cambridge Greys Ltd
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdlib.h>
#include <errno.h>
-#include <poll.h>
+#include <sys/epoll.h>
#include <signal.h>
#include <string.h>
#include <irq_user.h>
#include <os.h>
#include <um_malloc.h>
+/* Epoll support */
+
+static int epollfd = -1;
+
+#define MAX_EPOLL_EVENTS 64
+
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+
+/* Helper to return an Epoll data pointer from an epoll event structure.
+ * We need to keep this one on the userspace side to keep includes separate
+ */
+
+void *os_epoll_get_data_pointer(int index)
+{
+ return epoll_events[index].data.ptr;
+}
+
+/* Helper to compare events versus the events in the epoll structure.
+ * Same as above - needs to be on the userspace side
+ */
+
+
+int os_epoll_triggered(int index, int events)
+{
+ return epoll_events[index].events & events;
+}
+/* Helper to set the event mask.
+ * The event mask is opaque to the kernel side, because it does not have
+ * access to the right includes/defines for EPOLL constants.
+ */
+
+int os_event_mask(enum um_irq_type irq_type)
+{
+ if (irq_type == IRQ_READ)
+ return EPOLLIN | EPOLLPRI | EPOLLERR | EPOLLHUP | EPOLLRDHUP;
+ if (irq_type == IRQ_WRITE)
+ return EPOLLOUT;
+ return 0;
+}
+
/*
- * Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd
- * and os_free_irq_by_cb, which are called under irq_lock.
+ * Initial Epoll Setup
*/
-static struct pollfd *pollfds = NULL;
-static int pollfds_num = 0;
-static int pollfds_size = 0;
+int os_setup_epoll(void)
+{
+ epollfd = epoll_create(MAX_EPOLL_EVENTS);
+ return epollfd;
+}
-int os_waiting_for_events(struct irq_fd *active_fds)
+/*
+ * Helper to run the actual epoll_wait
+ */
+int os_waiting_for_events_epoll(void)
{
- struct irq_fd *irq_fd;
- int i, n, err;
+ int n, err;
- n = poll(pollfds, pollfds_num, 0);
+ n = epoll_wait(epollfd,
+ (struct epoll_event *) &epoll_events, MAX_EPOLL_EVENTS, 0);
if (n < 0) {
err = -errno;
if (errno != EINTR)
- printk(UM_KERN_ERR "os_waiting_for_events:"
- " poll returned %d, errno = %d\n", n, errno);
+ printk(
+ UM_KERN_ERR "os_waiting_for_events:"
+ " epoll returned %d, error = %s\n", n,
+ strerror(errno)
+ );
return err;
}
-
- if (n == 0)
- return 0;
-
- irq_fd = active_fds;
-
- for (i = 0; i < pollfds_num; i++) {
- if (pollfds[i].revents != 0) {
- irq_fd->current_events = pollfds[i].revents;
- pollfds[i].fd = -1;
- }
- irq_fd = irq_fd->next;
- }
return n;
}
-int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds)
-{
- if (pollfds_num == pollfds_size) {
- if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) {
- /* return min size needed for new pollfds area */
- return (pollfds_size + 1) * sizeof(pollfds[0]);
- }
-
- if (pollfds != NULL) {
- memcpy(tmp_pfd, pollfds,
- sizeof(pollfds[0]) * pollfds_size);
- /* remove old pollfds */
- kfree(pollfds);
- }
- pollfds = tmp_pfd;
- pollfds_size++;
- } else
- kfree(tmp_pfd); /* remove not used tmp_pfd */
-
- pollfds[pollfds_num] = ((struct pollfd) { .fd = fd,
- .events = events,
- .revents = 0 });
- pollfds_num++;
-
- return 0;
-}
-void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
- struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
+/*
+ * Helper to add a fd to epoll
+ */
+int os_add_epoll_fd(int events, int fd, void *data)
{
- struct irq_fd **prev;
- int i = 0;
-
- prev = &active_fds;
- while (*prev != NULL) {
- if ((*test)(*prev, arg)) {
- struct irq_fd *old_fd = *prev;
- if ((pollfds[i].fd != -1) &&
- (pollfds[i].fd != (*prev)->fd)) {
- printk(UM_KERN_ERR "os_free_irq_by_cb - "
- "mismatch between active_fds and "
- "pollfds, fd %d vs %d\n",
- (*prev)->fd, pollfds[i].fd);
- goto out;
- }
-
- pollfds_num--;
-
- /*
- * This moves the *whole* array after pollfds[i]
- * (though it doesn't spot as such)!
- */
- memmove(&pollfds[i], &pollfds[i + 1],
- (pollfds_num - i) * sizeof(pollfds[0]));
- if (*last_irq_ptr2 == &old_fd->next)
- *last_irq_ptr2 = prev;
-
- *prev = (*prev)->next;
- if (old_fd->type == IRQ_WRITE)
- ignore_sigio_fd(old_fd->fd);
- kfree(old_fd);
- continue;
- }
- prev = &(*prev)->next;
- i++;
- }
- out:
- return;
+ struct epoll_event event;
+ int result;
+
+ event.data.ptr = data;
+ event.events = events | EPOLLET;
+ result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
+ if ((result) && (errno == EEXIST))
+ result = os_mod_epoll_fd(events, fd, data);
+ if (result)
+ printk("epollctl add err fd %d, %s\n", fd, strerror(errno));
+ return result;
}
-int os_get_pollfd(int i)
+/*
+ * Helper to mod the fd event mask and/or data backreference
+ */
+int os_mod_epoll_fd(int events, int fd, void *data)
{
- return pollfds[i].fd;
+ struct epoll_event event;
+ int result;
+
+ event.data.ptr = data;
+ event.events = events;
+ result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event);
+ if (result)
+ printk(UM_KERN_ERR
+ "epollctl mod err fd %d, %s\n", fd, strerror(errno));
+ return result;
}
-void os_set_pollfd(int i, int fd)
+/*
+ * Helper to remove an fd from the epoll set
+ */
+int os_del_epoll_fd(int fd)
{
- pollfds[i].fd = fd;
+ struct epoll_event event;
+ /* This is quiet as we use this as IO ON/OFF - so it is often
+ * invoked on a non-existent fd
+ */
+ return epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
}
void os_set_ioignore(void)
{
signal(SIGIO, SIG_IGN);
}
+
+void os_close_epoll_fd(void)
+{
+ /* Needed so we do not leak an fd when rebooting */
+ os_close_file(epollfd);
+}
diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c
index 749c96da7b99..7e114862a723 100644
--- a/arch/um/os-Linux/main.c
+++ b/arch/um/os-Linux/main.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdio.h>
@@ -10,19 +11,17 @@
#include <signal.h>
#include <string.h>
#include <sys/resource.h>
+#include <sys/personality.h>
#include <as-layout.h>
#include <init.h>
#include <kern_util.h>
#include <os.h>
#include <um_malloc.h>
+#include "internal.h"
-#define PGD_BOUND (4 * 1024 * 1024)
#define STACKSIZE (8 * 1024 * 1024)
-#define THREAD_NAME_LEN (256)
-long elf_aux_hwcap;
-
-static void set_stklim(void)
+static void __init set_stklim(void)
{
struct rlimit lim;
@@ -39,24 +38,13 @@ static void set_stklim(void)
}
}
-static __init void do_uml_initcalls(void)
-{
- initcall_t *call;
-
- call = &__uml_initcall_start;
- while (call < &__uml_initcall_end) {
- (*call)();
- call++;
- }
-}
-
static void last_ditch_exit(int sig)
{
uml_cleanup();
exit(1);
}
-static void install_fatal_handler(int sig)
+static void __init install_fatal_handler(int sig)
{
struct sigaction action;
@@ -73,15 +61,15 @@ static void install_fatal_handler(int sig)
action.sa_restorer = NULL;
action.sa_handler = last_ditch_exit;
if (sigaction(sig, &action, NULL) < 0) {
- printf("failed to install handler for signal %d - errno = %d\n",
- sig, errno);
+ os_warn("failed to install handler for signal %d "
+ "- errno = %d\n", sig, errno);
exit(1);
}
}
#define UML_LIB_PATH ":" OS_LIB_PATH "/uml"
-static void setup_env_path(void)
+static void __init setup_env_path(void)
{
char *new_path = NULL;
char *old_path = NULL;
@@ -112,17 +100,32 @@ static void setup_env_path(void)
}
}
-extern void scan_elf_aux( char **envp);
-
int __init main(int argc, char **argv, char **envp)
{
char **new_argv;
int ret, i, err;
+ /* Disable randomization and re-exec if it was changed successfully */
+ ret = personality(PER_LINUX | ADDR_NO_RANDOMIZE);
+ if (ret >= 0 && (ret & (PER_LINUX | ADDR_NO_RANDOMIZE)) !=
+ (PER_LINUX | ADDR_NO_RANDOMIZE)) {
+ char buf[4096] = {};
+ ssize_t ret;
+
+ ret = readlink("/proc/self/exe", buf, sizeof(buf));
+ if (ret < 0 || ret >= sizeof(buf)) {
+ perror("readlink failure");
+ exit(1);
+ }
+ execve(buf, argv, envp);
+ }
+
set_stklim();
setup_env_path();
+ setsid();
+
new_argv = malloc((argc + 1) * sizeof(char *));
if (new_argv == NULL) {
perror("Mallocing argv");
@@ -144,12 +147,10 @@ int __init main(int argc, char **argv, char **envp)
install_fatal_handler(SIGINT);
install_fatal_handler(SIGTERM);
-#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA
scan_elf_aux(envp);
-#endif
- do_uml_initcalls();
- ret = linux_main(argc, argv);
+ change_sig(SIGPIPE, 0);
+ ret = linux_main(argc, argv, envp);
/*
* Disable SIGPROF - I have no idea why libc doesn't do this or turn
@@ -160,18 +161,18 @@ int __init main(int argc, char **argv, char **envp)
/*
* This signal stuff used to be in the reboot case. However,
- * sometimes a SIGVTALRM can come in when we're halting (reproducably
+ * sometimes a timer signal can come in when we're halting (reproducibly
* when writing out gcov information, presumably because that takes
* some time) and cause a segfault.
*/
- /* stop timers and set SIGVTALRM to be ignored */
- disable_timer();
+ /* stop timers and set timer signal to be ignored */
+ os_timer_disable(0);
/* disable SIGIO for the fds and set SIGIO to be ignored */
err = deactivate_all_fds();
if (err)
- printf("deactivate_all_fds failed, errno = %d\n", -err);
+ os_warn("deactivate_all_fds failed, errno = %d\n", -err);
/*
* Let any pending signals fire now. This ensures
@@ -180,18 +181,23 @@ int __init main(int argc, char **argv, char **envp)
*/
unblock_signals();
+ os_info("\n");
/* Reboot */
if (ret) {
- printf("\n");
execvp(new_argv[0], new_argv);
perror("Failed to exec kernel");
ret = 1;
}
- printf("\n");
return uml_exitcode;
}
extern void *__real_malloc(int);
+extern void __real_free(void *);
+
+/* workaround for -Wmissing-prototypes warnings */
+void *__wrap_malloc(int size);
+void *__wrap_calloc(int n, int size);
+void __wrap_free(void *ptr);
void *__wrap_malloc(int size)
{
@@ -224,10 +230,6 @@ void *__wrap_calloc(int n, int size)
return ptr;
}
-extern void __real_free(void *);
-
-extern unsigned long high_physmem;
-
void __wrap_free(void *ptr)
{
unsigned long addr = (unsigned long) ptr;
diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c
index ba4398056fe9..72f302f4d197 100644
--- a/arch/um/os-Linux/mem.c
+++ b/arch/um/os-Linux/mem.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdio.h>
@@ -12,189 +12,164 @@
#include <string.h>
#include <sys/stat.h>
#include <sys/mman.h>
-#include <sys/param.h>
+#include <sys/vfs.h>
+#include <linux/magic.h>
#include <init.h>
+#include <kern_util.h>
#include <os.h>
-
-/* Modified by which_tmpdir, which is called during early boot */
-static char *default_tmpdir = "/tmp";
+#include "internal.h"
/*
- * Modified when creating the physical memory file and when checking
- * the tmp filesystem for usability, both happening during early boot.
+ * kasan_map_memory - maps memory from @start with a size of @len.
+ * The allocated memory is filled with zeroes upon success.
+ * @start: the start address of the memory to be mapped
+ * @len: the length of the memory to be mapped
+ *
+ * This function is used to map shadow memory for KASAN in uml
*/
-static char *tempdir = NULL;
-
-static void __init find_tempdir(void)
+void kasan_map_memory(void *start, size_t len)
{
- const char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL };
- int i;
- char *dir = NULL;
+ if (mmap(start,
+ len,
+ PROT_READ|PROT_WRITE,
+ MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE|MAP_NORESERVE,
+ -1,
+ 0) == MAP_FAILED) {
+ os_info("Couldn't allocate shadow memory: %s\n.",
+ strerror(errno));
+ exit(1);
+ }
- if (tempdir != NULL)
- /* We've already been called */
- return;
- for (i = 0; dirs[i]; i++) {
- dir = getenv(dirs[i]);
- if ((dir != NULL) && (*dir != '\0'))
- break;
+ if (madvise(start, len, MADV_DONTDUMP)) {
+ os_info("Couldn't set MAD_DONTDUMP on shadow memory: %s\n.",
+ strerror(errno));
+ exit(1);
}
- if ((dir == NULL) || (*dir == '\0'))
- dir = default_tmpdir;
- tempdir = malloc(strlen(dir) + 2);
- if (tempdir == NULL) {
- fprintf(stderr, "Failed to malloc tempdir, "
- "errno = %d\n", errno);
- return;
+ if (madvise(start, len, MADV_DONTFORK)) {
+ os_info("Couldn't set MADV_DONTFORK on shadow memory: %s\n.",
+ strerror(errno));
+ exit(1);
}
- strcpy(tempdir, dir);
- strcat(tempdir, "/");
}
-/*
- * This will return 1, with the first character in buf being the
- * character following the next instance of c in the file. This will
- * read the file as needed. If there's an error, -errno is returned;
- * if the end of the file is reached, 0 is returned.
- */
-static int next(int fd, char *buf, size_t size, char c)
-{
- ssize_t n;
- size_t len;
- char *ptr;
+/* Set by make_tempfile() during early boot. */
+char *tempdir = NULL;
- while ((ptr = strchr(buf, c)) == NULL) {
- n = read(fd, buf, size - 1);
- if (n == 0)
- return 0;
- else if (n < 0)
- return -errno;
-
- buf[n] = '\0';
+/* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error. */
+static int __init check_tmpfs(const char *dir)
+{
+ struct statfs st;
+
+ os_info("Checking if %s is on tmpfs...", dir);
+ if (statfs(dir, &st) < 0) {
+ os_info("%s\n", strerror(errno));
+ } else if (st.f_type != TMPFS_MAGIC) {
+ os_info("no\n");
+ } else {
+ os_info("OK\n");
+ return 0;
}
-
- ptr++;
- len = strlen(ptr);
- memmove(buf, ptr, len + 1);
-
- /*
- * Refill the buffer so that if there's a partial string that we care
- * about, it will be completed, and we can recognize it.
- */
- n = read(fd, &buf[len], size - len - 1);
- if (n < 0)
- return -errno;
-
- buf[len + n] = '\0';
- return 1;
+ return -1;
}
-/* which_tmpdir is called only during early boot */
-static int checked_tmpdir = 0;
-
/*
- * Look for a tmpfs mounted at /dev/shm. I couldn't find a cleaner
- * way to do this than to parse /proc/mounts. statfs will return the
- * same filesystem magic number and fs id for both /dev and /dev/shm
- * when they are both tmpfs, so you can't tell if they are different
- * filesystems. Also, there seems to be no other way of finding the
- * mount point of a filesystem from within it.
- *
- * If a /dev/shm tmpfs entry is found, then we switch to using it.
- * Otherwise, we stay with the default /tmp.
+ * Choose the tempdir to use. We want something on tmpfs so that our memory is
+ * not subject to the host's vm.dirty_ratio. If a tempdir is specified in the
+ * environment, we use that even if it's not on tmpfs, but we warn the user.
+ * Otherwise, we try common tmpfs locations, and if no tmpfs directory is found
+ * then we fall back to /tmp.
*/
-static void which_tmpdir(void)
+static char * __init choose_tempdir(void)
{
- int fd, found;
- char buf[128] = { '\0' };
-
- if (checked_tmpdir)
- return;
-
- checked_tmpdir = 1;
-
- printf("Checking for tmpfs mount on /dev/shm...");
-
- fd = open("/proc/mounts", O_RDONLY);
- if (fd < 0) {
- printf("failed to open /proc/mounts, errno = %d\n", errno);
- return;
+ static const char * const vars[] = {
+ "TMPDIR",
+ "TMP",
+ "TEMP",
+ NULL
+ };
+ static const char fallback_dir[] = "/tmp";
+ static const char * const tmpfs_dirs[] = {
+ "/dev/shm",
+ fallback_dir,
+ NULL
+ };
+ int i;
+ const char *dir;
+
+ os_info("Checking environment variables for a tempdir...");
+ for (i = 0; vars[i]; i++) {
+ dir = getenv(vars[i]);
+ if ((dir != NULL) && (*dir != '\0')) {
+ os_info("%s\n", dir);
+ if (check_tmpfs(dir) >= 0)
+ goto done;
+ else
+ goto warn;
+ }
}
+ os_info("none found\n");
- while (1) {
- found = next(fd, buf, ARRAY_SIZE(buf), ' ');
- if (found != 1)
- break;
-
- if (!strncmp(buf, "/dev/shm", strlen("/dev/shm")))
- goto found;
-
- found = next(fd, buf, ARRAY_SIZE(buf), '\n');
- if (found != 1)
- break;
+ for (i = 0; tmpfs_dirs[i]; i++) {
+ dir = tmpfs_dirs[i];
+ if (check_tmpfs(dir) >= 0)
+ goto done;
}
-err:
- if (found == 0)
- printf("nothing mounted on /dev/shm\n");
- else if (found < 0)
- printf("read returned errno %d\n", -found);
-
-out:
- close(fd);
-
- return;
-
-found:
- found = next(fd, buf, ARRAY_SIZE(buf), ' ');
- if (found != 1)
- goto err;
-
- if (strncmp(buf, "tmpfs", strlen("tmpfs"))) {
- printf("not tmpfs\n");
- goto out;
- }
-
- printf("OK\n");
- default_tmpdir = "/dev/shm";
- goto out;
+ dir = fallback_dir;
+warn:
+ os_warn("Warning: tempdir %s is not on tmpfs\n", dir);
+done:
+ /* Make a copy since getenv results may not remain valid forever. */
+ return strdup(dir);
}
-static int __init make_tempfile(const char *template, char **out_tempname,
- int do_unlink)
+/*
+ * Create an unlinked tempfile in a suitable tempdir. template must be the
+ * basename part of the template with a leading '/'.
+ */
+static int __init make_tempfile(const char *template)
{
char *tempname;
int fd;
- which_tmpdir();
- tempname = malloc(MAXPATHLEN);
+ if (tempdir == NULL) {
+ tempdir = choose_tempdir();
+ if (tempdir == NULL) {
+ os_warn("Failed to choose tempdir: %s\n",
+ strerror(errno));
+ return -1;
+ }
+ }
+
+#ifdef O_TMPFILE
+ fd = open(tempdir, O_CLOEXEC | O_RDWR | O_EXCL | O_TMPFILE, 0700);
+ /*
+ * If the running system does not support the O_TMPFILE flag then retry
+ * without it.
+ */
+ if (fd != -1 || (errno != EINVAL && errno != EISDIR &&
+ errno != EOPNOTSUPP))
+ return fd;
+#endif
+
+ tempname = malloc(strlen(tempdir) + strlen(template) + 1);
if (tempname == NULL)
return -1;
- find_tempdir();
- if ((tempdir == NULL) || (strlen(tempdir) >= MAXPATHLEN))
- goto out;
-
- if (template[0] != '/')
- strcpy(tempname, tempdir);
- else
- tempname[0] = '\0';
- strncat(tempname, template, MAXPATHLEN-1-strlen(tempname));
+ strcpy(tempname, tempdir);
+ strcat(tempname, template);
fd = mkstemp(tempname);
if (fd < 0) {
- fprintf(stderr, "open - cannot create %s: %s\n", tempname,
+ os_warn("open - cannot create %s: %s\n", tempname,
strerror(errno));
goto out;
}
- if (do_unlink && (unlink(tempname) < 0)) {
+ if (unlink(tempname) < 0) {
perror("unlink");
goto close;
}
- if (out_tempname) {
- *out_tempname = tempname;
- } else
- free(tempname);
+ free(tempname);
return fd;
close:
close(fd);
@@ -203,23 +178,17 @@ out:
return -1;
}
-#define TEMPNAME_TEMPLATE "vm_file-XXXXXX"
+#define TEMPNAME_TEMPLATE "/vm_file-XXXXXX"
static int __init create_tmp_file(unsigned long long len)
{
int fd, err;
char zero;
- fd = make_tempfile(TEMPNAME_TEMPLATE, NULL, 1);
+ fd = make_tempfile(TEMPNAME_TEMPLATE);
if (fd < 0)
exit(1);
- err = fchmod(fd, 0777);
- if (err < 0) {
- perror("fchmod");
- exit(1);
- }
-
/*
* Seek to len - 1 because writing a character there will
* increase the file size by one byte, to the desired length.
@@ -254,7 +223,6 @@ int __init create_mem_file(unsigned long long len)
return fd;
}
-
void __init check_tmpexec(void)
{
void *addr;
@@ -262,17 +230,16 @@ void __init check_tmpexec(void)
addr = mmap(NULL, UM_KERN_PAGE_SIZE,
PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, fd, 0);
- printf("Checking PROT_EXEC mmap in %s...",tempdir);
- fflush(stdout);
+ os_info("Checking PROT_EXEC mmap in %s...", tempdir);
if (addr == MAP_FAILED) {
err = errno;
- perror("failed");
+ os_warn("%s\n", strerror(err));
close(fd);
if (err == EPERM)
- printf("%s must be not mounted noexec\n",tempdir);
+ os_warn("%s must be not mounted noexec\n", tempdir);
exit(1);
}
- printf("OK\n");
+ os_info("OK\n");
munmap(addr, UM_KERN_PAGE_SIZE);
close(fd);
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index b8f34c9e53ae..3a2a84ab9325 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -1,119 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdio.h>
+#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <fcntl.h>
+#include <limits.h>
+#include <linux/futex.h>
#include <sys/mman.h>
#include <sys/ptrace.h>
+#include <sys/prctl.h>
#include <sys/wait.h>
#include <asm/unistd.h>
#include <init.h>
#include <longjmp.h>
#include <os.h>
-#include <skas_ptrace.h>
+#include <skas/skas.h>
-#define ARBITRARY_ADDR -1
-#define FAILURE_PID -1
-
-#define STAT_PATH_LEN sizeof("/proc/#######/stat\0")
-#define COMM_SCANF "%*[^)])"
-
-unsigned long os_process_pc(int pid)
+void os_alarm_process(int pid)
{
- char proc_stat[STAT_PATH_LEN], buf[256];
- unsigned long pc = ARBITRARY_ADDR;
- int fd, err;
+ if (pid <= 0)
+ return;
- sprintf(proc_stat, "/proc/%d/stat", pid);
- fd = open(proc_stat, O_RDONLY, 0);
- if (fd < 0) {
- printk(UM_KERN_ERR "os_process_pc - couldn't open '%s', "
- "errno = %d\n", proc_stat, errno);
- goto out;
- }
- CATCH_EINTR(err = read(fd, buf, sizeof(buf)));
- if (err < 0) {
- printk(UM_KERN_ERR "os_process_pc - couldn't read '%s', "
- "err = %d\n", proc_stat, errno);
- goto out_close;
- }
- os_close_file(fd);
- pc = ARBITRARY_ADDR;
- if (sscanf(buf, "%*d " COMM_SCANF " %*c %*d %*d %*d %*d %*d %*d %*d "
- "%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d "
- "%*d %*d %*d %*d %*d %lu", &pc) != 1)
- printk(UM_KERN_ERR "os_process_pc - couldn't find pc in '%s'\n",
- buf);
- out_close:
- close(fd);
- out:
- return pc;
+ kill(pid, SIGALRM);
}
-int os_process_parent(int pid)
+void os_kill_process(int pid, int reap_child)
{
- char stat[STAT_PATH_LEN];
- char data[256];
- int parent = FAILURE_PID, n, fd;
-
- if (pid == -1)
- return parent;
-
- snprintf(stat, sizeof(stat), "/proc/%d/stat", pid);
- fd = open(stat, O_RDONLY, 0);
- if (fd < 0) {
- printk(UM_KERN_ERR "Couldn't open '%s', errno = %d\n", stat,
- errno);
- return parent;
- }
+ if (pid <= 0)
+ return;
- CATCH_EINTR(n = read(fd, data, sizeof(data)));
- close(fd);
-
- if (n < 0) {
- printk(UM_KERN_ERR "Couldn't read '%s', errno = %d\n", stat,
- errno);
- return parent;
- }
-
- parent = FAILURE_PID;
- n = sscanf(data, "%*d " COMM_SCANF " %*c %d", &parent);
- if (n != 1)
- printk(UM_KERN_ERR "Failed to scan '%s'\n", data);
+ /* Block signals until child is reaped */
+ block_signals();
- return parent;
-}
-
-void os_stop_process(int pid)
-{
- kill(pid, SIGSTOP);
-}
-
-void os_kill_process(int pid, int reap_child)
-{
kill(pid, SIGKILL);
if (reap_child)
CATCH_EINTR(waitpid(pid, NULL, __WALL));
-}
-
-/* This is here uniquely to have access to the userspace errno, i.e. the one
- * used by ptrace in case of error.
- */
-
-long os_ptrace_ldt(long pid, long addr, long data)
-{
- int ret;
-
- ret = ptrace(PTRACE_LDT, pid, addr, data);
- if (ret < 0)
- return -errno;
- return ret;
+ unblock_signals();
}
/* Kill off a ptraced child by all means available. kill it normally first,
@@ -123,11 +52,27 @@ long os_ptrace_ldt(long pid, long addr, long data)
void os_kill_ptraced_process(int pid, int reap_child)
{
+ if (pid <= 0)
+ return;
+
+ /* Block signals until child is reaped */
+ block_signals();
+
kill(pid, SIGKILL);
ptrace(PTRACE_KILL, pid);
ptrace(PTRACE_CONT, pid);
if (reap_child)
CATCH_EINTR(waitpid(pid, NULL, __WALL));
+
+ unblock_signals();
+}
+
+pid_t os_reap_child(void)
+{
+ int status;
+
+ /* Try to reap a child */
+ return waitpid(-1, &status, WNOHANG);
}
/* Don't use the glibc version, which caches the result in TLS. It misses some
@@ -139,11 +84,6 @@ int os_getpid(void)
return syscall(__NR_getpid);
}
-int os_getpgrp(void)
-{
- return getpgrp();
-}
-
int os_map_memory(void *virt, int fd, unsigned long long off, unsigned long len,
int r, int w, int x)
{
@@ -241,6 +181,31 @@ void init_new_thread_signals(void)
set_handler(SIGBUS);
signal(SIGHUP, SIG_IGN);
set_handler(SIGIO);
+ /* We (currently) only use the child reaper IRQ in seccomp mode */
+ if (using_seccomp)
+ set_handler(SIGCHLD);
signal(SIGWINCH, SIG_IGN);
- signal(SIGTERM, SIG_DFL);
+}
+
+void os_set_pdeathsig(void)
+{
+ prctl(PR_SET_PDEATHSIG, SIGKILL);
+}
+
+int os_futex_wait(void *uaddr, unsigned int val)
+{
+ int r;
+
+ CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAIT, val,
+ NULL, NULL, 0));
+ return r < 0 ? -errno : r;
+}
+
+int os_futex_wake(void *uaddr)
+{
+ int r;
+
+ CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAKE, INT_MAX,
+ NULL, NULL, 0));
+ return r < 0 ? -errno : r;
}
diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c
index 2ff8d4fe83c4..bfba2cbc9478 100644
--- a/arch/um/os-Linux/registers.c
+++ b/arch/um/os-Linux/registers.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2004 PathScale, Inc
* Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <errno.h>
@@ -10,33 +10,14 @@
#include <sysdep/ptrace.h>
#include <sysdep/ptrace_user.h>
#include <registers.h>
-
-int save_registers(int pid, struct uml_pt_regs *regs)
-{
- int err;
-
- err = ptrace(PTRACE_GETREGS, pid, 0, regs->gp);
- if (err < 0)
- return -errno;
- return 0;
-}
-
-int restore_registers(int pid, struct uml_pt_regs *regs)
-{
- int err;
-
- err = ptrace(PTRACE_SETREGS, pid, 0, regs->gp);
- if (err < 0)
- return -errno;
- return 0;
-}
+#include <stdlib.h>
/* This is set once at boot time and not changed thereafter */
-static unsigned long exec_regs[MAX_REG_NR];
-static unsigned long exec_fp_regs[FP_SIZE];
+unsigned long exec_regs[MAX_REG_NR];
+unsigned long *exec_fp_regs;
-int init_registers(int pid)
+int init_pid_registers(int pid)
{
int err;
@@ -44,7 +25,11 @@ int init_registers(int pid)
if (err < 0)
return -errno;
- arch_init_registers(pid);
+ err = arch_init_registers(pid);
+ if (err < 0)
+ return err;
+
+ exec_fp_regs = malloc(host_fp_size);
get_fp_registers(pid, exec_fp_regs);
return 0;
}
@@ -54,5 +39,5 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs)
memcpy(regs, exec_regs, sizeof(exec_regs));
if (fp_regs)
- memcpy(fp_regs, exec_fp_regs, sizeof(exec_fp_regs));
+ memcpy(fp_regs, exec_fp_regs, host_fp_size);
}
diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index 8b61cc0e82c8..6de145f8fe3d 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <unistd.h>
@@ -11,6 +11,8 @@
#include <sched.h>
#include <signal.h>
#include <string.h>
+#include <sys/epoll.h>
+#include <asm/unistd.h>
#include <kern_util.h>
#include <init.h>
#include <os.h>
@@ -21,366 +23,136 @@
* Protected by sigio_lock(), also used by sigio_cleanup, which is an
* exitcall.
*/
-static int write_sigio_pid = -1;
-static unsigned long write_sigio_stack;
+static struct os_helper_thread *write_sigio_td;
-/*
- * These arrays are initialized before the sigio thread is started, and
- * the descriptors closed after it is killed. So, it can't see them change.
- * On the UML side, they are changed under the sigio_lock.
- */
-#define SIGIO_FDS_INIT {-1, -1}
-
-static int write_sigio_fds[2] = SIGIO_FDS_INIT;
-static int sigio_private[2] = SIGIO_FDS_INIT;
+static int epollfd = -1;
-struct pollfds {
- struct pollfd *poll;
- int size;
- int used;
-};
+#define MAX_EPOLL_EVENTS 64
-/*
- * Protected by sigio_lock(). Used by the sigio thread, but the UML thread
- * synchronizes with it.
- */
-static struct pollfds current_poll;
-static struct pollfds next_poll;
-static struct pollfds all_sigio_fds;
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
-static int write_sigio_thread(void *unused)
+static void *write_sigio_thread(void *unused)
{
- struct pollfds *fds, tmp;
- struct pollfd *p;
- int i, n, respond_fd;
- char c;
+ int pid = getpid();
+ int r;
+
+ os_fix_helper_thread_signals();
- signal(SIGWINCH, SIG_IGN);
- fds = &current_poll;
while (1) {
- n = poll(fds->poll, fds->used, -1);
- if (n < 0) {
+ r = epoll_wait(epollfd, epoll_events, MAX_EPOLL_EVENTS, -1);
+ if (r < 0) {
if (errno == EINTR)
continue;
- printk(UM_KERN_ERR "write_sigio_thread : poll returned "
- "%d, errno = %d\n", n, errno);
- }
- for (i = 0; i < fds->used; i++) {
- p = &fds->poll[i];
- if (p->revents == 0)
- continue;
- if (p->fd == sigio_private[1]) {
- CATCH_EINTR(n = read(sigio_private[1], &c,
- sizeof(c)));
- if (n != sizeof(c))
- printk(UM_KERN_ERR
- "write_sigio_thread : "
- "read on socket failed, "
- "err = %d\n", errno);
- tmp = current_poll;
- current_poll = next_poll;
- next_poll = tmp;
- respond_fd = sigio_private[1];
- }
- else {
- respond_fd = write_sigio_fds[1];
- fds->used--;
- memmove(&fds->poll[i], &fds->poll[i + 1],
- (fds->used - i) * sizeof(*fds->poll));
- }
-
- CATCH_EINTR(n = write(respond_fd, &c, sizeof(c)));
- if (n != sizeof(c))
- printk(UM_KERN_ERR "write_sigio_thread : "
- "write on socket failed, err = %d\n",
- errno);
+ printk(UM_KERN_ERR "%s: epoll_wait failed, errno = %d\n",
+ __func__, errno);
}
- }
-
- return 0;
-}
-
-static int need_poll(struct pollfds *polls, int n)
-{
- struct pollfd *new;
- if (n <= polls->size)
- return 0;
-
- new = uml_kmalloc(n * sizeof(struct pollfd), UM_GFP_ATOMIC);
- if (new == NULL) {
- printk(UM_KERN_ERR "need_poll : failed to allocate new "
- "pollfds\n");
- return -ENOMEM;
+ CATCH_EINTR(r = syscall(__NR_tgkill, pid, pid, SIGIO));
+ if (r < 0)
+ printk(UM_KERN_ERR "%s: tgkill failed, errno = %d\n",
+ __func__, errno);
}
- memcpy(new, polls->poll, polls->used * sizeof(struct pollfd));
- kfree(polls->poll);
-
- polls->poll = new;
- polls->size = n;
- return 0;
+ return NULL;
}
-/*
- * Must be called with sigio_lock held, because it's needed by the marked
- * critical section.
- */
-static void update_thread(void)
+int __add_sigio_fd(int fd)
{
- unsigned long flags;
- int n;
- char c;
-
- flags = set_signals(0);
- CATCH_EINTR(n = write(sigio_private[0], &c, sizeof(c)));
- if (n != sizeof(c)) {
- printk(UM_KERN_ERR "update_thread : write failed, err = %d\n",
- errno);
- goto fail;
- }
-
- CATCH_EINTR(n = read(sigio_private[0], &c, sizeof(c)));
- if (n != sizeof(c)) {
- printk(UM_KERN_ERR "update_thread : read failed, err = %d\n",
- errno);
- goto fail;
- }
-
- set_signals(flags);
- return;
- fail:
- /* Critical section start */
- if (write_sigio_pid != -1) {
- os_kill_process(write_sigio_pid, 1);
- free_stack(write_sigio_stack, 0);
- }
- write_sigio_pid = -1;
- close(sigio_private[0]);
- close(sigio_private[1]);
- close(write_sigio_fds[0]);
- close(write_sigio_fds[1]);
- /* Critical section end */
- set_signals(flags);
+ struct epoll_event event = {
+ .data.fd = fd,
+ .events = EPOLLIN | EPOLLET,
+ };
+ int r;
+
+ CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event));
+ return r < 0 ? -errno : 0;
}
int add_sigio_fd(int fd)
{
- struct pollfd *p;
- int err = 0, i, n;
+ int err;
sigio_lock();
- for (i = 0; i < all_sigio_fds.used; i++) {
- if (all_sigio_fds.poll[i].fd == fd)
- break;
- }
- if (i == all_sigio_fds.used)
- goto out;
-
- p = &all_sigio_fds.poll[i];
+ err = __add_sigio_fd(fd);
+ sigio_unlock();
- for (i = 0; i < current_poll.used; i++) {
- if (current_poll.poll[i].fd == fd)
- goto out;
- }
+ return err;
+}
- n = current_poll.used;
- err = need_poll(&next_poll, n + 1);
- if (err)
- goto out;
+int __ignore_sigio_fd(int fd)
+{
+ struct epoll_event event;
+ int r;
- memcpy(next_poll.poll, current_poll.poll,
- current_poll.used * sizeof(struct pollfd));
- next_poll.poll[n] = *p;
- next_poll.used = n + 1;
- update_thread();
- out:
- sigio_unlock();
- return err;
+ CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event));
+ return r < 0 ? -errno : 0;
}
int ignore_sigio_fd(int fd)
{
- struct pollfd *p;
- int err = 0, i, n = 0;
-
- /*
- * This is called from exitcalls elsewhere in UML - if
- * sigio_cleanup has already run, then update_thread will hang
- * or fail because the thread is no longer running.
- */
- if (write_sigio_pid == -1)
- return -EIO;
+ int err;
sigio_lock();
- for (i = 0; i < current_poll.used; i++) {
- if (current_poll.poll[i].fd == fd)
- break;
- }
- if (i == current_poll.used)
- goto out;
-
- err = need_poll(&next_poll, current_poll.used - 1);
- if (err)
- goto out;
-
- for (i = 0; i < current_poll.used; i++) {
- p = &current_poll.poll[i];
- if (p->fd != fd)
- next_poll.poll[n++] = *p;
- }
- next_poll.used = current_poll.used - 1;
-
- update_thread();
- out:
+ err = __ignore_sigio_fd(fd);
sigio_unlock();
- return err;
-}
-
-static struct pollfd *setup_initial_poll(int fd)
-{
- struct pollfd *p;
- p = uml_kmalloc(sizeof(struct pollfd), UM_GFP_KERNEL);
- if (p == NULL) {
- printk(UM_KERN_ERR "setup_initial_poll : failed to allocate "
- "poll\n");
- return NULL;
- }
- *p = ((struct pollfd) { .fd = fd,
- .events = POLLIN,
- .revents = 0 });
- return p;
+ return err;
}
static void write_sigio_workaround(void)
{
- struct pollfd *p;
int err;
- int l_write_sigio_fds[2];
- int l_sigio_private[2];
- int l_write_sigio_pid;
- /* We call this *tons* of times - and most ones we must just fail. */
sigio_lock();
- l_write_sigio_pid = write_sigio_pid;
- sigio_unlock();
-
- if (l_write_sigio_pid != -1)
- return;
+ if (write_sigio_td)
+ goto out;
- err = os_pipe(l_write_sigio_fds, 1, 1);
- if (err < 0) {
- printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 1 failed, "
- "err = %d\n", -err);
- return;
+ epollfd = epoll_create(MAX_EPOLL_EVENTS);
+ if (epollfd < 0) {
+ printk(UM_KERN_ERR "%s: epoll_create failed, errno = %d\n",
+ __func__, errno);
+ goto out;
}
- err = os_pipe(l_sigio_private, 1, 1);
+
+ err = os_run_helper_thread(&write_sigio_td, write_sigio_thread, NULL);
if (err < 0) {
- printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 2 failed, "
- "err = %d\n", -err);
- goto out_close1;
+ printk(UM_KERN_ERR "%s: os_run_helper_thread failed, errno = %d\n",
+ __func__, -err);
+ close(epollfd);
+ epollfd = -1;
+ goto out;
}
- p = setup_initial_poll(l_sigio_private[1]);
- if (!p)
- goto out_close2;
-
- sigio_lock();
-
- /*
- * Did we race? Don't try to optimize this, please, it's not so likely
- * to happen, and no more than once at the boot.
- */
- if (write_sigio_pid != -1)
- goto out_free;
-
- current_poll = ((struct pollfds) { .poll = p,
- .used = 1,
- .size = 1 });
-
- if (write_sigio_irq(l_write_sigio_fds[0]))
- goto out_clear_poll;
-
- memcpy(write_sigio_fds, l_write_sigio_fds, sizeof(l_write_sigio_fds));
- memcpy(sigio_private, l_sigio_private, sizeof(l_sigio_private));
-
- write_sigio_pid = run_helper_thread(write_sigio_thread, NULL,
- CLONE_FILES | CLONE_VM,
- &write_sigio_stack);
-
- if (write_sigio_pid < 0)
- goto out_clear;
-
- sigio_unlock();
- return;
-
-out_clear:
- write_sigio_pid = -1;
- write_sigio_fds[0] = -1;
- write_sigio_fds[1] = -1;
- sigio_private[0] = -1;
- sigio_private[1] = -1;
-out_clear_poll:
- current_poll = ((struct pollfds) { .poll = NULL,
- .size = 0,
- .used = 0 });
-out_free:
+out:
sigio_unlock();
- kfree(p);
-out_close2:
- close(l_sigio_private[0]);
- close(l_sigio_private[1]);
-out_close1:
- close(l_write_sigio_fds[0]);
- close(l_write_sigio_fds[1]);
}
-void sigio_broken(int fd, int read)
+void sigio_broken(void)
{
- int err;
-
write_sigio_workaround();
-
- sigio_lock();
- err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1);
- if (err) {
- printk(UM_KERN_ERR "maybe_sigio_broken - failed to add pollfd "
- "for descriptor %d\n", fd);
- goto out;
- }
-
- all_sigio_fds.poll[all_sigio_fds.used++] =
- ((struct pollfd) { .fd = fd,
- .events = read ? POLLIN : POLLOUT,
- .revents = 0 });
-out:
- sigio_unlock();
}
/* Changed during early boot */
static int pty_output_sigio;
-static int pty_close_sigio;
-void maybe_sigio_broken(int fd, int read)
+void maybe_sigio_broken(int fd)
{
if (!isatty(fd))
return;
- if ((read || pty_output_sigio) && (!read || pty_close_sigio))
+ if (pty_output_sigio)
return;
- sigio_broken(fd, read);
+ sigio_broken();
}
static void sigio_cleanup(void)
{
- if (write_sigio_pid == -1)
+ if (!write_sigio_td)
return;
- os_kill_process(write_sigio_pid, 1);
- free_stack(write_sigio_stack, 0);
- write_sigio_pid = -1;
+ os_kill_helper_thread(write_sigio_td);
+ write_sigio_td = NULL;
}
__uml_exitcall(sigio_cleanup);
@@ -514,19 +286,6 @@ static void tty_output(int master, int slave)
printk(UM_KERN_CONT "tty_output : read failed, err = %d\n", n);
}
-static void tty_close(int master, int slave)
-{
- printk(UM_KERN_INFO "Checking that host ptys support SIGIO on "
- "close...");
-
- close(slave);
- if (got_sigio) {
- printk(UM_KERN_CONT "Yes\n");
- pty_close_sigio = 1;
- } else
- printk(UM_KERN_CONT "No, enabling workaround\n");
-}
-
static void __init check_sigio(void)
{
if ((access("/dev/ptmx", R_OK) < 0) &&
@@ -536,7 +295,6 @@ static void __init check_sigio(void)
return;
}
check_one_sigio(tty_output);
- check_one_sigio(tty_close);
}
/* Here because it only does the SIGIO testing for now */
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 9d9f1b4bf826..327fb3c52fc7 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -1,31 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk})
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2004 PathScale, Inc
* Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdlib.h>
#include <stdarg.h>
+#include <stdbool.h>
#include <errno.h>
#include <signal.h>
+#include <string.h>
#include <strings.h>
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
#include <sysdep/mcontext.h>
+#include <um_malloc.h>
+#include <sys/ucontext.h>
+#include <timetravel.h>
#include "internal.h"
-void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
+void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = {
[SIGTRAP] = relay_signal,
[SIGFPE] = relay_signal,
[SIGILL] = relay_signal,
[SIGWINCH] = winch,
- [SIGBUS] = bus_handler,
+ [SIGBUS] = relay_signal,
[SIGSEGV] = segv_handler,
[SIGIO] = sigio_handler,
- [SIGVTALRM] = timer_handler };
+ [SIGCHLD] = sigchld_handler,
+};
-static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc)
+static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
{
struct uml_pt_regs r;
int save_errno = errno;
@@ -38,10 +46,10 @@ static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc)
}
/* enable signals if sig isn't IRQ signal */
- if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM))
- unblock_signals();
+ if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGCHLD))
+ unblock_signals_trace();
- (*sig_info[sig])(sig, si, &r);
+ (*sig_info[sig])(sig, si, &r, mc);
errno = save_errno;
}
@@ -55,72 +63,131 @@ static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc)
#define SIGIO_BIT 0
#define SIGIO_MASK (1 << SIGIO_BIT)
-#define SIGVTALRM_BIT 1
-#define SIGVTALRM_MASK (1 << SIGVTALRM_BIT)
+#define SIGALRM_BIT 1
+#define SIGALRM_MASK (1 << SIGALRM_BIT)
-static int signals_enabled;
-static unsigned int signals_pending;
+#define SIGCHLD_BIT 2
+#define SIGCHLD_MASK (1 << SIGCHLD_BIT)
-void sig_handler(int sig, siginfo_t *si, mcontext_t *mc)
+__thread int signals_enabled;
+#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT)
+static int signals_blocked, signals_blocked_pending;
+#endif
+static __thread unsigned int signals_pending;
+static __thread unsigned int signals_active;
+
+static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc)
{
- int enabled;
+ int enabled = signals_enabled;
+
+#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT)
+ if ((signals_blocked ||
+ __atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) &&
+ (sig == SIGIO)) {
+ /* increment so unblock will do another round */
+ __atomic_add_fetch(&signals_blocked_pending, 1,
+ __ATOMIC_SEQ_CST);
+ return;
+ }
+#endif
- enabled = signals_enabled;
if (!enabled && (sig == SIGIO)) {
- signals_pending |= SIGIO_MASK;
+ /*
+ * In TT_MODE_EXTERNAL, need to still call time-travel
+ * handlers. This will mark signals_pending by itself
+ * (only if necessary.)
+ * Note we won't get here if signals are hard-blocked
+ * (which is handled above), in that case the hard-
+ * unblock will handle things.
+ */
+ if (time_travel_mode == TT_MODE_EXTERNAL)
+ sigio_run_timetravel_handlers();
+ else
+ signals_pending |= SIGIO_MASK;
return;
}
- block_signals();
+ if (!enabled && (sig == SIGCHLD)) {
+ signals_pending |= SIGCHLD_MASK;
+ return;
+ }
+
+ block_signals_trace();
sig_handler_common(sig, si, mc);
- set_signals(enabled);
+ um_set_signals_trace(enabled);
}
-static void real_alarm_handler(mcontext_t *mc)
+static void timer_real_alarm_handler(mcontext_t *mc)
{
struct uml_pt_regs regs;
if (mc != NULL)
get_regs_from_mc(&regs, mc);
- regs.is_user = 0;
- unblock_signals();
- timer_handler(SIGVTALRM, NULL, &regs);
+ else
+ memset(&regs, 0, sizeof(regs));
+ timer_handler(SIGALRM, NULL, &regs);
}
-void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
+static void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
{
int enabled;
enabled = signals_enabled;
if (!signals_enabled) {
- signals_pending |= SIGVTALRM_MASK;
+ signals_pending |= SIGALRM_MASK;
return;
}
- block_signals();
+ block_signals_trace();
+
+ signals_active |= SIGALRM_MASK;
+
+ timer_real_alarm_handler(mc);
- real_alarm_handler(mc);
- set_signals(enabled);
+ signals_active &= ~SIGALRM_MASK;
+
+ um_set_signals_trace(enabled);
+}
+
+void deliver_alarm(void) {
+ timer_alarm_handler(SIGALRM, NULL, NULL);
}
-void timer_init(void)
+void timer_set_signal_handler(void)
{
- set_handler(SIGVTALRM);
+ set_handler(SIGALRM);
+}
+
+int timer_alarm_pending(void)
+{
+ return !!(signals_pending & SIGALRM_MASK);
}
void set_sigstack(void *sig_stack, int size)
{
- stack_t stack = ((stack_t) { .ss_flags = 0,
- .ss_sp = (__ptr_t) sig_stack,
- .ss_size = size - sizeof(void *) });
+ stack_t stack = {
+ .ss_flags = 0,
+ .ss_sp = sig_stack,
+ .ss_size = size
+ };
if (sigaltstack(&stack, NULL) != 0)
panic("enabling signal stack failed, errno = %d\n", errno);
}
-static void (*handlers[_NSIG])(int sig, siginfo_t *si, mcontext_t *mc) = {
+static void sigusr1_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
+{
+ uml_pm_wake();
+}
+
+void register_pm_wake_signal(void)
+{
+ set_handler(SIGUSR1);
+}
+
+static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = {
[SIGSEGV] = sig_handler,
[SIGBUS] = sig_handler,
[SIGILL] = sig_handler,
@@ -129,51 +196,19 @@ static void (*handlers[_NSIG])(int sig, siginfo_t *si, mcontext_t *mc) = {
[SIGIO] = sig_handler,
[SIGWINCH] = sig_handler,
- [SIGVTALRM] = alarm_handler
-};
+ /* SIGCHLD is only actually registered in seccomp mode. */
+ [SIGCHLD] = sig_handler,
+ [SIGALRM] = timer_alarm_handler,
+ [SIGUSR1] = sigusr1_handler,
+};
static void hard_handler(int sig, siginfo_t *si, void *p)
{
- struct ucontext *uc = p;
+ ucontext_t *uc = p;
mcontext_t *mc = &uc->uc_mcontext;
- unsigned long pending = 1UL << sig;
- do {
- int nested, bail;
-
- /*
- * pending comes back with one bit set for each
- * interrupt that arrived while setting up the stack,
- * plus a bit for this interrupt, plus the zero bit is
- * set if this is a nested interrupt.
- * If bail is true, then we interrupted another
- * handler setting up the stack. In this case, we
- * have to return, and the upper handler will deal
- * with this interrupt.
- */
- bail = to_irq_stack(&pending);
- if (bail)
- return;
-
- nested = pending & 1;
- pending &= ~1;
-
- while ((sig = ffs(pending)) != 0){
- sig--;
- pending &= ~(1 << sig);
- (*handlers[sig])(sig, si, mc);
- }
-
- /*
- * Again, pending comes back with a mask of signals
- * that arrived while tearing down the stack. If this
- * is non-zero, we just go back, set up the stack
- * again, and handle the new interrupts.
- */
- if (!nested)
- pending = from_irq_stack(nested);
- } while (pending);
+ (*handlers[sig])(sig, (struct siginfo *)si, mc);
}
void set_handler(int sig)
@@ -186,9 +221,9 @@ void set_handler(int sig)
/* block irq ones */
sigemptyset(&action.sa_mask);
- sigaddset(&action.sa_mask, SIGVTALRM);
sigaddset(&action.sa_mask, SIGIO);
sigaddset(&action.sa_mask, SIGWINCH);
+ sigaddset(&action.sa_mask, SIGALRM);
if (sig == SIGSEGV)
flags |= SA_NODEFER;
@@ -207,6 +242,11 @@ void set_handler(int sig)
panic("sigprocmask failed - errno = %d\n", errno);
}
+void send_sigio_to_self(void)
+{
+ kill(os_getpid(), SIGIO);
+}
+
int change_sig(int signal, int on)
{
sigset_t sigset;
@@ -219,9 +259,29 @@ int change_sig(int signal, int on)
return 0;
}
-void block_signals(void)
+static inline void __block_signals(void)
{
+ if (!signals_enabled)
+ return;
+
+ os_local_ipi_disable();
+ barrier();
signals_enabled = 0;
+}
+
+static inline void __unblock_signals(void)
+{
+ if (signals_enabled)
+ return;
+
+ signals_enabled = 1;
+ barrier();
+ os_local_ipi_enable();
+}
+
+void block_signals(void)
+{
+ __block_signals();
/*
* This must return with signals disabled, so this barrier
* ensures that writes are flushed out before the return.
@@ -238,6 +298,12 @@ void unblock_signals(void)
if (signals_enabled == 1)
return;
+ __unblock_signals();
+
+#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT)
+ deliver_time_travel_irqs();
+#endif
+
/*
* We loop because the IRQ handler returns with interrupts off. So,
* interrupts may have arrived and we need to re-enable them and
@@ -247,12 +313,9 @@ void unblock_signals(void)
/*
* Save and reset save_pending after enabling signals. This
* way, signals_pending won't be changed while we're reading it.
- */
- signals_enabled = 1;
-
- /*
+ *
* Setting signals_enabled and reading signals_pending must
- * happen in this order.
+ * happen in this order, so have the barrier here.
*/
barrier();
@@ -265,10 +328,13 @@ void unblock_signals(void)
/*
* We have pending interrupts, so disable signals, as the
* handlers expect them off when they are called. They will
- * be enabled again above.
+ * be enabled again above. We need to trace this, as we're
+ * expected to be enabling interrupts already, but any more
+ * tracing that happens inside the handlers we call for the
+ * pending signals will mess up the tracing state.
*/
-
- signals_enabled = 0;
+ __block_signals();
+ um_trace_signals_off();
/*
* Deal with SIGIO first because the alarm handler might
@@ -281,17 +347,34 @@ void unblock_signals(void)
if (save_pending & SIGIO_MASK)
sig_handler_common(SIGIO, NULL, NULL);
- if (save_pending & SIGVTALRM_MASK)
- real_alarm_handler(NULL);
+ if (save_pending & SIGCHLD_MASK) {
+ struct uml_pt_regs regs = {};
+
+ sigchld_handler(SIGCHLD, NULL, &regs, NULL);
+ }
+
+ /* Do not reenter the handler */
+
+ if ((save_pending & SIGALRM_MASK) && (!(signals_active & SIGALRM_MASK)))
+ timer_real_alarm_handler(NULL);
+
+ /* Rerun the loop only if there is still pending SIGIO and not in TIMER handler */
+
+ if (!(signals_pending & SIGIO_MASK) && (signals_active & SIGALRM_MASK))
+ return;
+
+ /* Re-enable signals and trace that we're doing so. */
+ um_trace_signals_on();
+ __unblock_signals();
}
}
-int get_signals(void)
+int um_get_signals(void)
{
return signals_enabled;
}
-int set_signals(int enable)
+int um_set_signals(int enable)
{
int ret;
if (signals_enabled == enable)
@@ -304,3 +387,117 @@ int set_signals(int enable)
return ret;
}
+
+int um_set_signals_trace(int enable)
+{
+ int ret;
+ if (signals_enabled == enable)
+ return enable;
+
+ ret = signals_enabled;
+ if (enable)
+ unblock_signals_trace();
+ else
+ block_signals_trace();
+
+ return ret;
+}
+
+#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT)
+void mark_sigio_pending(void)
+{
+ /*
+ * It would seem that this should be atomic so
+ * it isn't a read-modify-write with a signal
+ * that could happen in the middle, losing the
+ * value set by the signal.
+ *
+ * However, this function is only called when in
+ * time-travel=ext simulation mode, in which case
+ * the only signal ever pending is SIGIO, which
+ * is blocked while this can be called, and the
+ * timer signal (SIGALRM) cannot happen.
+ */
+ signals_pending |= SIGIO_MASK;
+}
+
+void block_signals_hard(void)
+{
+ signals_blocked++;
+ barrier();
+}
+
+void unblock_signals_hard(void)
+{
+ static bool unblocking;
+
+ if (!signals_blocked)
+ panic("unblocking signals while not blocked");
+
+ if (--signals_blocked)
+ return;
+ /*
+ * Must be set to 0 before we check pending so the
+ * SIGIO handler will run as normal unless we're still
+ * going to process signals_blocked_pending.
+ */
+ barrier();
+
+ /*
+ * Note that block_signals_hard()/unblock_signals_hard() can be called
+ * within the unblock_signals()/sigio_run_timetravel_handlers() below.
+ * This would still be prone to race conditions since it's actually a
+ * call _within_ e.g. vu_req_read_message(), where we observed this
+ * issue, which loops. Thus, if the inner call handles the recorded
+ * pending signals, we can get out of the inner call with the real
+ * signal handler no longer blocked, and still have a race. Thus don't
+ * handle unblocking in the inner call, if it happens, but only in
+ * the outermost call - 'unblocking' serves as an ownership for the
+ * signals_blocked_pending decrement.
+ */
+ if (unblocking)
+ return;
+ unblocking = true;
+
+ while (__atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) {
+ if (signals_enabled) {
+ /* signals are enabled so we can touch this */
+ signals_pending |= SIGIO_MASK;
+ /*
+ * this is a bit inefficient, but that's
+ * not really important
+ */
+ block_signals();
+ unblock_signals();
+ } else {
+ /*
+ * we need to run time-travel handlers even
+ * if not enabled
+ */
+ sigio_run_timetravel_handlers();
+ }
+
+ /*
+ * The decrement of signals_blocked_pending must be atomic so
+ * that the signal handler will either happen before or after
+ * the decrement, not during a read-modify-write:
+ * - If it happens before, it can increment it and we'll
+ * decrement it and do another round in the loop.
+ * - If it happens after it'll see 0 for both signals_blocked
+ * and signals_blocked_pending and thus run the handler as
+ * usual (subject to signals_enabled, but that's unrelated.)
+ *
+ * Note that a call to unblock_signals_hard() within the calls
+ * to unblock_signals() or sigio_run_timetravel_handlers() above
+ * will do nothing due to the 'unblocking' state, so this cannot
+ * underflow as the only one decrementing will be the outermost
+ * one.
+ */
+ if (__atomic_sub_fetch(&signals_blocked_pending, 1,
+ __ATOMIC_SEQ_CST) < 0)
+ panic("signals_blocked_pending underflow");
+ }
+
+ unblocking = false;
+}
+#endif
diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile
index d2ea3409e072..75f11989d2e9 100644
--- a/arch/um/os-Linux/skas/Makefile
+++ b/arch/um/os-Linux/skas/Makefile
@@ -1,10 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com)
-# Licensed under the GPL
#
obj-y := mem.o process.o
USER_OBJS := $(obj-y)
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index 689b18db798f..8b9921ac3ef8 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stddef.h>
@@ -12,16 +13,47 @@
#include <as-layout.h>
#include <mm_id.h>
#include <os.h>
-#include <proc_mm.h>
#include <ptrace_user.h>
#include <registers.h>
#include <skas.h>
#include <sysdep/ptrace.h>
#include <sysdep/stub.h>
+#include "../internal.h"
-extern unsigned long batch_syscall_stub, __syscall_stub_start;
+extern char __syscall_stub_start[];
-extern void wait_stub_done(int pid);
+/**
+ * syscall_stub_dump_error() - log details about a failed stub syscall
+ * @mm_idp: address space whose stub data page (mm_idp->stack) is inspected
+ *
+ * Uses the stub-side syscall_data_len as the index of the syscall entry to
+ * report and prints that entry together with proc_data->err. Panics if that
+ * index lies outside the syscall_data array. When using_seccomp is set, the
+ * host-side FD map (mm_idp->syscall_fd_map) is dumped as well.
+ */
+void syscall_stub_dump_error(struct mm_id *mm_idp)
+{
+	struct stub_data *proc_data = (void *)mm_idp->stack;
+	struct stub_syscall *sc;
+
+	/* The stub-side length is used as an array index below; validate it. */
+	if (proc_data->syscall_data_len < 0 ||
+	    proc_data->syscall_data_len >= ARRAY_SIZE(proc_data->syscall_data))
+		panic("Syscall data was corrupted by stub (len is: %d, expected maximum: %d)!",
+			proc_data->syscall_data_len,
+			mm_idp->syscall_data_len);
+
+	sc = &proc_data->syscall_data[proc_data->syscall_data_len];
+
+	/* Terminate every message with a newline, like the next printk does. */
+	printk(UM_KERN_ERR "%s : length = %d, last offset = %d\n",
+		__func__, mm_idp->syscall_data_len,
+		proc_data->syscall_data_len);
+	printk(UM_KERN_ERR "%s : stub syscall type %d failed, return value = 0x%lx\n",
+		__func__, sc->syscall, proc_data->err);
+
+	/* Raw contents of the reported syscall entry, 16 bytes per row. */
+	print_hex_dump(UM_KERN_ERR, " syscall data: ", 0,
+		       16, 4, sc, sizeof(*sc), 0);
+
+	if (using_seccomp) {
+		printk(UM_KERN_ERR "%s: FD map num: %d\n", __func__,
+		       mm_idp->syscall_fd_num);
+		/* Dumps the whole map array, not only the first syscall_fd_num slots. */
+		print_hex_dump(UM_KERN_ERR,
+			       " FD map: ", 0, 16,
+			       sizeof(mm_idp->syscall_fd_map[0]),
+			       mm_idp->syscall_fd_map,
+			       sizeof(mm_idp->syscall_fd_map), 0);
+	}
+}
static inline unsigned long *check_init_stack(struct mm_id * mm_idp,
unsigned long *stack)
@@ -38,246 +70,215 @@ static unsigned long syscall_regs[MAX_REG_NR];
static int __init init_syscall_regs(void)
{
get_safe_registers(syscall_regs, NULL);
+
syscall_regs[REGS_IP_INDEX] = STUB_CODE +
- ((unsigned long) &batch_syscall_stub -
- (unsigned long) &__syscall_stub_start);
+ ((unsigned long) stub_syscall_handler -
+ (unsigned long) __syscall_stub_start);
+ syscall_regs[REGS_SP_INDEX] = STUB_DATA +
+ offsetof(struct stub_data, sigstack) +
+ sizeof(((struct stub_data *) 0)->sigstack) -
+ sizeof(void *);
+
return 0;
}
__initcall(init_syscall_regs);
-extern int proc_mm;
-
-static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr)
+static inline long do_syscall_stub(struct mm_id *mm_idp)
{
+ struct stub_data *proc_data = (void *)mm_idp->stack;
int n, i;
- long ret, offset;
- unsigned long * data;
- unsigned long * syscall;
- int err, pid = mm_idp->u.pid;
-
- if (proc_mm)
- /* FIXME: Need to look up userspace_pid by cpu */
- pid = userspace_pid[0];
-
- n = ptrace_setregs(pid, syscall_regs);
- if (n < 0) {
- printk(UM_KERN_ERR "Registers - \n");
- for (i = 0; i < MAX_REG_NR; i++)
- printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]);
- panic("do_syscall_stub : PTRACE_SETREGS failed, errno = %d\n",
- -n);
- }
+ int err, pid = mm_idp->pid;
+
+ /* Inform process how much we have filled in. */
+ proc_data->syscall_data_len = mm_idp->syscall_data_len;
+
+ if (using_seccomp) {
+ proc_data->restart_wait = 1;
+ wait_stub_done_seccomp(mm_idp, 0, 1);
+ } else {
+ n = ptrace_setregs(pid, syscall_regs);
+ if (n < 0) {
+ printk(UM_KERN_ERR "Registers -\n");
+ for (i = 0; i < MAX_REG_NR; i++)
+ printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]);
+ panic("%s : PTRACE_SETREGS failed, errno = %d\n",
+ __func__, -n);
+ }
- err = ptrace(PTRACE_CONT, pid, 0, 0);
- if (err)
- panic("Failed to continue stub, pid = %d, errno = %d\n", pid,
- errno);
+ err = ptrace(PTRACE_CONT, pid, 0, 0);
+ if (err)
+ panic("Failed to continue stub, pid = %d, errno = %d\n",
+ pid, errno);
- wait_stub_done(pid);
+ wait_stub_done(pid);
+ }
/*
- * When the stub stops, we find the following values on the
- * beginning of the stack:
- * (long )return_value
- * (long )offset to failed sycall-data (0, if no error)
+ * proc_data->err will be negative if there was an (unexpected) error.
+ * In that case, syscall_data_len points to the last executed syscall,
+ * otherwise it will be zero (but we do not need to rely on that).
*/
- ret = *((unsigned long *) mm_idp->stack);
- offset = *((unsigned long *) mm_idp->stack + 1);
- if (offset) {
- data = (unsigned long *)(mm_idp->stack + offset - STUB_DATA);
- printk(UM_KERN_ERR "do_syscall_stub : ret = %ld, offset = %ld, "
- "data = %p\n", ret, offset, data);
- syscall = (unsigned long *)((unsigned long)data + data[0]);
- printk(UM_KERN_ERR "do_syscall_stub: syscall %ld failed, "
- "return value = 0x%lx, expected return value = 0x%lx\n",
- syscall[0], ret, syscall[7]);
- printk(UM_KERN_ERR " syscall parameters: "
- "0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
- syscall[1], syscall[2], syscall[3],
- syscall[4], syscall[5], syscall[6]);
- for (n = 1; n < data[0]/sizeof(long); n++) {
- if (n == 1)
- printk(UM_KERN_ERR " additional syscall "
- "data:");
- if (n % 4 == 1)
- printk("\n" UM_KERN_ERR " ");
- printk(" 0x%lx", data[n]);
- }
- if (n > 1)
- printk("\n");
+ if (proc_data->err < 0) {
+ syscall_stub_dump_error(mm_idp);
+
+ /* Store error code in case someone tries to add more syscalls */
+ mm_idp->syscall_data_len = proc_data->err;
+ } else {
+ mm_idp->syscall_data_len = 0;
}
- else ret = 0;
- *addr = check_init_stack(mm_idp, NULL);
+ if (using_seccomp)
+ mm_idp->syscall_fd_num = 0;
- return ret;
+ return mm_idp->syscall_data_len;
}
-long run_syscall_stub(struct mm_id * mm_idp, int syscall,
- unsigned long *args, long expected, void **addr,
- int done)
+int syscall_stub_flush(struct mm_id *mm_idp)
{
- unsigned long *stack = check_init_stack(mm_idp, *addr);
-
- *stack += sizeof(long);
- stack += *stack / sizeof(long);
-
- *stack++ = syscall;
- *stack++ = args[0];
- *stack++ = args[1];
- *stack++ = args[2];
- *stack++ = args[3];
- *stack++ = args[4];
- *stack++ = args[5];
- *stack++ = expected;
- *stack = 0;
-
- if (!done && ((((unsigned long) stack) & ~UM_KERN_PAGE_MASK) <
- UM_KERN_PAGE_SIZE - 10 * sizeof(long))) {
- *addr = stack;
+ int res;
+
+ if (mm_idp->syscall_data_len == 0)
return 0;
+
+ /* If an error happened already, report it and reset the state. */
+ if (mm_idp->syscall_data_len < 0) {
+ res = mm_idp->syscall_data_len;
+ mm_idp->syscall_data_len = 0;
+ return res;
}
- return do_syscall_stub(mm_idp, addr);
+ res = do_syscall_stub(mm_idp);
+ mm_idp->syscall_data_len = 0;
+
+ return res;
}
-long syscall_stub_data(struct mm_id * mm_idp,
- unsigned long *data, int data_count,
- void **addr, void **stub_addr)
+struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp)
{
- unsigned long *stack;
- int ret = 0;
-
- /*
- * If *addr still is uninitialized, it *must* contain NULL.
- * Thus in this case do_syscall_stub correctly won't be called.
- */
- if ((((unsigned long) *addr) & ~UM_KERN_PAGE_MASK) >=
- UM_KERN_PAGE_SIZE - (10 + data_count) * sizeof(long)) {
- ret = do_syscall_stub(mm_idp, addr);
- /* in case of error, don't overwrite data on stack */
- if (ret)
- return ret;
+ struct stub_syscall *sc;
+ struct stub_data *proc_data = (struct stub_data *) mm_idp->stack;
+
+ if (mm_idp->syscall_data_len > 0 &&
+ mm_idp->syscall_data_len == ARRAY_SIZE(proc_data->syscall_data))
+ do_syscall_stub(mm_idp);
+
+ if (mm_idp->syscall_data_len < 0) {
+ /* Return dummy to retain error state. */
+ sc = &proc_data->syscall_data[0];
+ } else {
+ sc = &proc_data->syscall_data[mm_idp->syscall_data_len];
+ mm_idp->syscall_data_len += 1;
}
+ memset(sc, 0, sizeof(*sc));
- stack = check_init_stack(mm_idp, *addr);
- *addr = stack;
+ return sc;
+}
- *stack = data_count * sizeof(long);
+static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp,
+ int syscall_type,
+ unsigned long virt)
+{
+ if (mm_idp->syscall_data_len > 0) {
+ struct stub_data *proc_data = (void *) mm_idp->stack;
+ struct stub_syscall *sc;
- memcpy(stack + 1, data, data_count * sizeof(long));
+ sc = &proc_data->syscall_data[mm_idp->syscall_data_len - 1];
- *stub_addr = (void *)(((unsigned long)(stack + 1) &
- ~UM_KERN_PAGE_MASK) + STUB_DATA);
+ if (sc->syscall == syscall_type &&
+ sc->mem.addr + sc->mem.length == virt)
+ return sc;
+ }
- return 0;
+ return NULL;
}
-int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, int prot,
- int phys_fd, unsigned long long offset, int done, void **data)
+static int get_stub_fd(struct mm_id *mm_idp, int fd)
{
- int ret;
-
- if (proc_mm) {
- struct proc_mm_op map;
- int fd = mm_idp->u.mm_fd;
-
- map = ((struct proc_mm_op) { .op = MM_MMAP,
- .u =
- { .mmap =
- { .addr = virt,
- .len = len,
- .prot = prot,
- .flags = MAP_SHARED |
- MAP_FIXED,
- .fd = phys_fd,
- .offset= offset
- } } } );
- CATCH_EINTR(ret = write(fd, &map, sizeof(map)));
- if (ret != sizeof(map)) {
- ret = -errno;
- printk(UM_KERN_ERR "map : /proc/mm map failed, "
- "err = %d\n", -ret);
+ int i;
+
+ /* Find an FD slot (or flush and use first) */
+ if (!using_seccomp)
+ return fd;
+
+ /* Already crashed, value does not matter */
+ if (mm_idp->syscall_data_len < 0)
+ return 0;
+
+ /* Find existing FD in map if we can allocate another syscall */
+ if (mm_idp->syscall_data_len <
+ ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) {
+ for (i = 0; i < mm_idp->syscall_fd_num; i++) {
+ if (mm_idp->syscall_fd_map[i] == fd)
+ return i;
}
- else ret = 0;
- }
- else {
- unsigned long args[] = { virt, len, prot,
- MAP_SHARED | MAP_FIXED, phys_fd,
- MMAP_OFFSET(offset) };
- ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt,
- data, done);
+ if (mm_idp->syscall_fd_num < STUB_MAX_FDS) {
+ i = mm_idp->syscall_fd_num;
+ mm_idp->syscall_fd_map[i] = fd;
+
+ mm_idp->syscall_fd_num++;
+
+ return i;
+ }
}
- return ret;
+ /* FD map full or no syscall space available, continue after flush */
+ do_syscall_stub(mm_idp);
+ mm_idp->syscall_fd_map[0] = fd;
+ mm_idp->syscall_fd_num = 1;
+
+ return 0;
}
-int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len,
- int done, void **data)
+int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot,
+ int phys_fd, unsigned long long offset)
{
- int ret;
-
- if (proc_mm) {
- struct proc_mm_op unmap;
- int fd = mm_idp->u.mm_fd;
-
- unmap = ((struct proc_mm_op) { .op = MM_MUNMAP,
- .u =
- { .munmap =
- { .addr =
- (unsigned long) addr,
- .len = len } } } );
- CATCH_EINTR(ret = write(fd, &unmap, sizeof(unmap)));
- if (ret != sizeof(unmap)) {
- ret = -errno;
- printk(UM_KERN_ERR "unmap - proc_mm write returned "
- "%d\n", ret);
+ struct stub_syscall *sc;
+
+ /* Compress with previous syscall if that is possible */
+ sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt);
+ if (sc && sc->mem.prot == prot &&
+ sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) {
+ int prev_fd = sc->mem.fd;
+
+ if (using_seccomp)
+ prev_fd = mm_idp->syscall_fd_map[sc->mem.fd];
+
+ if (phys_fd == prev_fd) {
+ sc->mem.length += len;
+ return 0;
}
- else ret = 0;
}
- else {
- unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0,
- 0 };
- ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0,
- data, done);
- }
+ phys_fd = get_stub_fd(mm_idp, phys_fd);
- return ret;
+ sc = syscall_stub_alloc(mm_idp);
+ sc->syscall = STUB_SYSCALL_MMAP;
+ sc->mem.addr = virt;
+ sc->mem.length = len;
+ sc->mem.prot = prot;
+ sc->mem.fd = phys_fd;
+ sc->mem.offset = MMAP_OFFSET(offset);
+
+ return 0;
}
-int protect(struct mm_id * mm_idp, unsigned long addr, unsigned long len,
- unsigned int prot, int done, void **data)
+int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len)
{
- struct proc_mm_op protect;
- int ret;
-
- if (proc_mm) {
- int fd = mm_idp->u.mm_fd;
-
- protect = ((struct proc_mm_op) { .op = MM_MPROTECT,
- .u =
- { .mprotect =
- { .addr =
- (unsigned long) addr,
- .len = len,
- .prot = prot } } } );
-
- CATCH_EINTR(ret = write(fd, &protect, sizeof(protect)));
- if (ret != sizeof(protect)) {
- ret = -errno;
- printk(UM_KERN_ERR "protect failed, err = %d", -ret);
- }
- else ret = 0;
- }
- else {
- unsigned long args[] = { addr, len, prot, 0, 0, 0 };
+ struct stub_syscall *sc;
- ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0,
- data, done);
+ /* Compress with previous syscall if that is possible */
+ sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MUNMAP, addr);
+ if (sc) {
+ sc->mem.length += len;
+ return 0;
}
- return ret;
+ sc = syscall_stub_alloc(mm_idp);
+ sc->syscall = STUB_SYSCALL_MUNMAP;
+ sc->mem.addr = addr;
+ sc->mem.length = len;
+
+ return 0;
}
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 4625949bf1e4..d6c22f8aa06d 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -1,33 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdlib.h>
+#include <stdbool.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>
#include <string.h>
+#include <fcntl.h>
+#include <mem_user.h>
#include <sys/mman.h>
#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
#include <asm/unistd.h>
#include <as-layout.h>
#include <init.h>
#include <kern_util.h>
#include <mem.h>
#include <os.h>
-#include <proc_mm.h>
#include <ptrace_user.h>
#include <registers.h>
#include <skas.h>
-#include <skas_ptrace.h>
#include <sysdep/stub.h>
+#include <sysdep/mcontext.h>
+#include <linux/futex.h>
+#include <linux/threads.h>
+#include <timetravel.h>
+#include <asm-generic/rwonce.h>
+#include "../internal.h"
int is_skas_winch(int pid, int fd, void *data)
{
return pid == getpgrp();
}
+/*
+ * Translate a register index (one of the HOST_* constants) into a printable
+ * name for register dumps.
+ *
+ * Returns the name without the HOST_ prefix, or an empty string for an index
+ * that has no case below (including every index when building for something
+ * other than x86_64 or i386).
+ */
+static const char *ptrace_reg_name(int idx)
+{
+/* R(AX) expands to: case HOST_AX: return "AX" */
+#define R(n) case HOST_##n: return #n
+
+	switch (idx) {
+#ifdef __x86_64__
+	R(BX);
+	R(CX);
+	R(DI);
+	R(SI);
+	R(DX);
+	R(BP);
+	R(AX);
+	R(R8);
+	R(R9);
+	R(R10);
+	R(R11);
+	R(R12);
+	R(R13);
+	R(R14);
+	R(R15);
+	R(ORIG_AX);
+	R(CS);
+	R(SS);
+	R(EFLAGS);
+#elif defined(__i386__)
+	R(IP);
+	R(SP);
+	R(EFLAGS);
+	R(AX);
+	R(BX);
+	R(CX);
+	R(DX);
+	R(SI);
+	R(DI);
+	R(BP);
+	R(CS);
+	R(SS);
+	R(DS);
+	R(FS);
+	R(ES);
+	R(GS);
+	R(ORIG_AX);
+#endif
+	}
+
+/* Keep the one-letter helper macro from leaking into the rest of the file. */
+#undef R
+
+	return "";
+}
+
static int ptrace_dump_regs(int pid)
{
unsigned long regs[MAX_REG_NR];
@@ -37,8 +96,11 @@ static int ptrace_dump_regs(int pid)
return -errno;
printk(UM_KERN_ERR "Stub registers -\n");
- for (i = 0; i < ARRAY_SIZE(regs); i++)
- printk(UM_KERN_ERR "\t%d - %lx\n", i, regs[i]);
+ for (i = 0; i < ARRAY_SIZE(regs); i++) {
+ const char *regname = ptrace_reg_name(i);
+
+ printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]);
+ }
return 0;
}
@@ -47,7 +109,7 @@ static int ptrace_dump_regs(int pid)
* Signals that are OK to receive in the stub - we'll just continue it.
* SIGWINCH will happen when UML is inside a detached screen.
*/
-#define STUB_SIG_MASK ((1 << SIGVTALRM) | (1 << SIGWINCH))
+#define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH))
/* Signals that the stub will finish with - anything else is an error */
#define STUB_DONE_MASK (1 << SIGTRAP)
@@ -66,8 +128,8 @@ void wait_stub_done(int pid)
err = ptrace(PTRACE_CONT, pid, 0, 0);
if (err) {
- printk(UM_KERN_ERR "wait_stub_done : continue failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : continue failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
}
@@ -78,384 +140,647 @@ void wait_stub_done(int pid)
bad_wait:
err = ptrace_dump_regs(pid);
if (err)
- printk(UM_KERN_ERR "Failed to get registers from stub, "
- "errno = %d\n", -err);
- printk(UM_KERN_ERR "wait_stub_done : failed to wait for SIGTRAP, "
- "pid = %d, n = %d, errno = %d, status = 0x%x\n", pid, n, errno,
- status);
+ printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n",
+ -err);
+ printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n",
+ __func__, pid, n, errno, status);
fatal_sigsegv();
}
-extern unsigned long current_stub_stack(void);
-
-static void get_skas_faultinfo(int pid, struct faultinfo *fi)
+void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
{
- int err;
+ struct stub_data *data = (void *)mm_idp->stack;
+ int ret;
- if (ptrace_faultinfo) {
- err = ptrace(PTRACE_FAULTINFO, pid, 0, fi);
- if (err) {
- printk(UM_KERN_ERR "get_skas_faultinfo - "
- "PTRACE_FAULTINFO failed, errno = %d\n", errno);
- fatal_sigsegv();
+ do {
+ const char byte = 0;
+ struct iovec iov = {
+ .iov_base = (void *)&byte,
+ .iov_len = sizeof(byte),
+ };
+ union {
+ char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))];
+ struct cmsghdr align;
+ } ctrl;
+ struct msghdr msgh = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ };
+
+ if (!running) {
+ if (mm_idp->syscall_fd_num) {
+ unsigned int fds_size =
+ sizeof(int) * mm_idp->syscall_fd_num;
+ struct cmsghdr *cmsg;
+
+ msgh.msg_control = ctrl.data;
+ msgh.msg_controllen = CMSG_SPACE(fds_size);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(fds_size);
+ memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map,
+ fds_size);
+
+ CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock,
+ &msgh, 0));
+ }
+
+ data->signal = 0;
+ data->futex = FUTEX_IN_CHILD;
+ CATCH_EINTR(syscall(__NR_futex, &data->futex,
+ FUTEX_WAKE, 1, NULL, NULL, 0));
}
- /* Special handling for i386, which has different structs */
- if (sizeof(struct ptrace_faultinfo) < sizeof(struct faultinfo))
- memset((char *)fi + sizeof(struct ptrace_faultinfo), 0,
- sizeof(struct faultinfo) -
- sizeof(struct ptrace_faultinfo));
- }
- else {
- unsigned long fpregs[FP_SIZE];
+ do {
+ /*
+ * We need to check whether the child is still alive
+ * before and after the FUTEX_WAIT call. Before, in
+ * case it just died but we still updated data->futex
+ * to FUTEX_IN_CHILD. And after, in case it died while
+ * we were waiting (and SIGCHLD woke us up, see the
+ * IRQ handler in mmu.c).
+ *
+ * Either way, if PID is negative, then we have no
+ * choice but to kill the task.
+ */
+ if (__READ_ONCE(mm_idp->pid) < 0)
+ goto out_kill;
+
+ ret = syscall(__NR_futex, &data->futex,
+ FUTEX_WAIT, FUTEX_IN_CHILD,
+ NULL, NULL, 0);
+ if (ret < 0 && errno != EINTR && errno != EAGAIN) {
+ printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n",
+ __func__, errno);
+ goto out_kill;
+ }
+ } while (data->futex == FUTEX_IN_CHILD);
- err = get_fp_registers(pid, fpregs);
- if (err < 0) {
- printk(UM_KERN_ERR "save_fp_registers returned %d\n",
- err);
- fatal_sigsegv();
- }
- err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
- if (err) {
- printk(UM_KERN_ERR "Failed to continue stub, pid = %d, "
- "errno = %d\n", pid, errno);
- fatal_sigsegv();
- }
- wait_stub_done(pid);
+ if (__READ_ONCE(mm_idp->pid) < 0)
+ goto out_kill;
- /*
- * faultinfo is prepared by the stub-segv-handler at start of
- * the stub stack page. We just have to copy it.
- */
- memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
+ running = 0;
- err = put_fp_registers(pid, fpregs);
- if (err < 0) {
- printk(UM_KERN_ERR "put_fp_registers returned %d\n",
- err);
- fatal_sigsegv();
- }
+ /* We may receive a SIGALRM before SIGSYS, iterate again. */
+ } while (wait_sigsys && data->signal == SIGALRM);
+
+ if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) {
+ printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__);
+ goto out_kill;
}
+
+ if (wait_sigsys && data->signal != SIGSYS) {
+ printk(UM_KERN_ERR "%s : expected SIGSYS but got %d",
+ __func__, data->signal);
+ goto out_kill;
+ }
+
+ return;
+
+out_kill:
+ printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n",
+ __func__, mm_idp->pid, errno);
+ /* This is not true inside start_userspace */
+ if (current_mm_id() == mm_idp)
+ fatal_sigsegv();
}
-static void handle_segv(int pid, struct uml_pt_regs * regs)
+extern unsigned long current_stub_stack(void);
+
+static void get_skas_faultinfo(int pid, struct faultinfo *fi)
{
- get_skas_faultinfo(pid, &regs->faultinfo);
- segv(regs->faultinfo, 0, 1, NULL);
+ int err;
+
+ err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
+ if (err) {
+ printk(UM_KERN_ERR "Failed to continue stub, pid = %d, "
+ "errno = %d\n", pid, errno);
+ fatal_sigsegv();
+ }
+ wait_stub_done(pid);
+
+ /*
+ * faultinfo is prepared by the stub_segv_handler at start of
+ * the stub stack page. We just have to copy it.
+ */
+ memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
}
-/*
- * To use the same value of using_sysemu as the caller, ask it that value
- * (in local_using_sysemu
- */
-static void handle_trap(int pid, struct uml_pt_regs *regs,
- int local_using_sysemu)
+static void handle_trap(struct uml_pt_regs *regs)
{
- int err, status;
-
if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END))
fatal_sigsegv();
- /* Mark this as a syscall */
- UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp);
+ handle_syscall(regs);
+}
- if (!local_using_sysemu)
- {
- err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
- __NR_getpid);
- if (err < 0) {
- printk(UM_KERN_ERR "handle_trap - nullifying syscall "
- "failed, errno = %d\n", errno);
- fatal_sigsegv();
- }
+extern char __syscall_stub_start[];
- err = ptrace(PTRACE_SYSCALL, pid, 0, 0);
- if (err < 0) {
- printk(UM_KERN_ERR "handle_trap - continuing to end of "
- "syscall failed, errno = %d\n", errno);
- fatal_sigsegv();
- }
+static int stub_exe_fd;
- CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
- if ((err < 0) || !WIFSTOPPED(status) ||
- (WSTOPSIG(status) != SIGTRAP + 0x80)) {
- err = ptrace_dump_regs(pid);
- if (err)
- printk(UM_KERN_ERR "Failed to get registers "
- "from process, errno = %d\n", -err);
- printk(UM_KERN_ERR "handle_trap - failed to wait at "
- "end of syscall, errno = %d, status = %d\n",
- errno, status);
- fatal_sigsegv();
- }
+struct tramp_data {
+ struct stub_data *stub_data;
+ /* 0 is inherited, 1 is the kernel side */
+ int sockpair[2];
+};
+
+#ifndef CLOSE_RANGE_CLOEXEC
+#define CLOSE_RANGE_CLOEXEC (1U << 2)
+#endif
+
+static int userspace_tramp(void *data)
+{
+ struct tramp_data *tramp_data = data;
+ char *const argv[] = { "uml-userspace", NULL };
+ unsigned long long offset;
+ struct stub_init_data init_data = {
+ .seccomp = using_seccomp,
+ .stub_start = STUB_START,
+ };
+ int ret;
+
+ if (using_seccomp) {
+ init_data.signal_handler = STUB_CODE +
+ (unsigned long) stub_signal_interrupt -
+ (unsigned long) __syscall_stub_start;
+ init_data.signal_restorer = STUB_CODE +
+ (unsigned long) stub_signal_restorer -
+ (unsigned long) __syscall_stub_start;
+ } else {
+ init_data.signal_handler = STUB_CODE +
+ (unsigned long) stub_segv_handler -
+ (unsigned long) __syscall_stub_start;
+ init_data.signal_restorer = 0;
}
- handle_syscall(regs);
+ init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
+ &offset);
+ init_data.stub_code_offset = MMAP_OFFSET(offset);
+
+ init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data),
+ &offset);
+ init_data.stub_data_offset = MMAP_OFFSET(offset);
+
+ /*
+ * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs
+ * and then unsetting it on all memory related FDs.
+ * This is not strictly necessary from a safety perspective.
+ */
+ syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC);
+
+ fcntl(init_data.stub_data_fd, F_SETFD, 0);
+
+ /* dup2 signaling FD/socket to STDIN */
+ if (dup2(tramp_data->sockpair[0], 0) < 0)
+ exit(3);
+ close(tramp_data->sockpair[0]);
+
+ /* Write init_data and close write side */
+ ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data));
+ close(tramp_data->sockpair[1]);
+
+ if (ret != sizeof(init_data))
+ exit(4);
+
+ /* Raw execveat for compatibility with older libc versions */
+ syscall(__NR_execveat, stub_exe_fd, (unsigned long)"",
+ (unsigned long)argv, NULL, AT_EMPTY_PATH);
+
+ exit(5);
}
-extern int __syscall_stub_start;
+extern char stub_exe_start[];
+extern char stub_exe_end[];
+
+extern char *tempdir;
-static int userspace_tramp(void *stack)
+#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX"
+
+#ifndef MFD_EXEC
+#define MFD_EXEC 0x0010U
+#endif
+
+static int __init init_stub_exe_fd(void)
{
- void *addr;
- int err;
+ size_t written = 0;
+ char *tmpfile = NULL;
- ptrace(PTRACE_TRACEME, 0, 0, 0);
+ stub_exe_fd = memfd_create("uml-userspace",
+ MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
- signal(SIGTERM, SIG_DFL);
- signal(SIGWINCH, SIG_IGN);
- err = set_interval();
- if (err) {
- printk(UM_KERN_ERR "userspace_tramp - setting timer failed, "
- "errno = %d\n", err);
- exit(1);
+ if (stub_exe_fd < 0) {
+ printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!");
+
+ tmpfile = malloc(strlen(tempdir) +
+ strlen(STUB_EXE_NAME_TEMPLATE) + 1);
+ if (tmpfile == NULL)
+ panic("Failed to allocate memory for stub binary name");
+
+ strcpy(tmpfile, tempdir);
+ strcat(tmpfile, STUB_EXE_NAME_TEMPLATE);
+
+ stub_exe_fd = mkstemp(tmpfile);
+ if (stub_exe_fd < 0)
+ panic("Could not create temporary file for stub binary: %d",
+ -errno);
}
- if (!proc_mm) {
- /*
- * This has a pte, but it can't be mapped in with the usual
- * tlb_flush mechanism because this is part of that mechanism
- */
- int fd;
- unsigned long long offset;
- fd = phys_mapping(to_phys(&__syscall_stub_start), &offset);
- addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
- PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset);
- if (addr == MAP_FAILED) {
- printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, "
- "errno = %d\n", STUB_CODE, errno);
- exit(1);
- }
+ while (written < stub_exe_end - stub_exe_start) {
+ ssize_t res = write(stub_exe_fd, stub_exe_start + written,
+ stub_exe_end - stub_exe_start - written);
+ if (res < 0) {
+ if (errno == EINTR)
+ continue;
- if (stack != NULL) {
- fd = phys_mapping(to_phys(stack), &offset);
- addr = mmap((void *) STUB_DATA,
- UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE,
- MAP_FIXED | MAP_SHARED, fd, offset);
- if (addr == MAP_FAILED) {
- printk(UM_KERN_ERR "mapping segfault stack "
- "at 0x%lx failed, errno = %d\n",
- STUB_DATA, errno);
- exit(1);
- }
+ if (tmpfile)
+ unlink(tmpfile);
+ panic("Failed write stub binary: %d", -errno);
}
+
+ written += res;
}
- if (!ptrace_faultinfo && (stack != NULL)) {
- struct sigaction sa;
-
- unsigned long v = STUB_CODE +
- (unsigned long) stub_segv_handler -
- (unsigned long) &__syscall_stub_start;
-
- set_sigstack((void *) STUB_DATA, UM_KERN_PAGE_SIZE);
- sigemptyset(&sa.sa_mask);
- sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
- sa.sa_sigaction = (void *) v;
- sa.sa_restorer = NULL;
- if (sigaction(SIGSEGV, &sa, NULL) < 0) {
- printk(UM_KERN_ERR "userspace_tramp - setting SIGSEGV "
- "handler failed - errno = %d\n", errno);
- exit(1);
+
+ if (!tmpfile) {
+ fcntl(stub_exe_fd, F_ADD_SEALS,
+ F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL);
+ } else {
+ if (fchmod(stub_exe_fd, 00500) < 0) {
+ unlink(tmpfile);
+ panic("Could not make stub binary executable: %d",
+ -errno);
+ }
+
+ close(stub_exe_fd);
+ stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
+ if (stub_exe_fd < 0) {
+ unlink(tmpfile);
+ panic("Could not reopen stub binary: %d", -errno);
}
+
+ unlink(tmpfile);
+ free(tmpfile);
}
- kill(os_getpid(), SIGSTOP);
return 0;
}
-
-/* Each element set once, and only accessed by a single processor anyway */
-#undef NR_CPUS
-#define NR_CPUS 1
-int userspace_pid[NR_CPUS];
-
-int start_userspace(unsigned long stub_stack)
+__initcall(init_stub_exe_fd);
+
+int using_seccomp;
+
+/**
+ * start_userspace() - prepare a new userspace process
+ * @mm_id: The corresponding struct mm_id
+ *
+ * Sets up a new temporary stack page that is used while userspace_tramp() runs.
+ * Clones the kernel process into a new userspace process, with FDs only.
+ *
+ * Return: 0 on success, in which case the process id of the new userspace
+ * process has been stored in @mm_id->pid; a negative error number on failure.
+ * FIXME: can PIDs become negative?!
+ */
+int start_userspace(struct mm_id *mm_id)
{
+ struct stub_data *proc_data = (void *)mm_id->stack;
+ struct tramp_data tramp_data = {
+ .stub_data = proc_data,
+ };
void *stack;
unsigned long sp;
- int pid, status, n, flags, err;
+ int status, n, err;
+ /* setup a temporary stack page */
stack = mmap(NULL, UM_KERN_PAGE_SIZE,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (stack == MAP_FAILED) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : mmap failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n",
+ __func__, errno);
return err;
}
- sp = (unsigned long) stack + UM_KERN_PAGE_SIZE - sizeof(void *);
+ /* set stack pointer to the end of the stack page, so it can grow downwards */
+ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
- flags = CLONE_FILES;
- if (proc_mm)
- flags |= CLONE_VM;
- else
- flags |= SIGCHLD;
-
- pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack);
- if (pid < 0) {
+ /* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : clone failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n",
+ __func__, errno);
return err;
}
- do {
- CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
- if (n < 0) {
- err = -errno;
- printk(UM_KERN_ERR "start_userspace : wait failed, "
- "errno = %d\n", errno);
- goto out_kill;
- }
- } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM));
+ if (using_seccomp)
+ proc_data->futex = FUTEX_IN_CHILD;
- if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
- err = -EINVAL;
- printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got "
- "status = %d\n", status);
- goto out_kill;
+ mm_id->pid = clone(userspace_tramp, (void *) sp,
+ CLONE_VFORK | CLONE_VM | SIGCHLD,
+ (void *)&tramp_data);
+ if (mm_id->pid < 0) {
+ err = -errno;
+ printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
+ __func__, errno);
+ goto out_close;
}
- if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
- (void *) PTRACE_O_TRACESYSGOOD) < 0) {
- err = -errno;
- printk(UM_KERN_ERR "start_userspace : PTRACE_OLDSETOPTIONS "
- "failed, errno = %d\n", errno);
- goto out_kill;
+ if (using_seccomp) {
+ wait_stub_done_seccomp(mm_id, 1, 1);
+ } else {
+ do {
+ CATCH_EINTR(n = waitpid(mm_id->pid, &status,
+ WUNTRACED | __WALL));
+ if (n < 0) {
+ err = -errno;
+ printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
+ __func__, errno);
+ goto out_kill;
+ }
+ } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
+
+ if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
+ err = -EINVAL;
+ printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
+ __func__, status);
+ goto out_kill;
+ }
+
+ if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL,
+ (void *) PTRACE_O_TRACESYSGOOD) < 0) {
+ err = -errno;
+ printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
+ __func__, errno);
+ goto out_kill;
+ }
}
if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : munmap failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n",
+ __func__, errno);
goto out_kill;
}
- return pid;
+ close(tramp_data.sockpair[0]);
+ if (using_seccomp)
+ mm_id->sock = tramp_data.sockpair[1];
+ else
+ close(tramp_data.sockpair[1]);
+
+ return 0;
+
+out_kill:
+ os_kill_ptraced_process(mm_id->pid, 1);
+out_close:
+ close(tramp_data.sockpair[0]);
+ close(tramp_data.sockpair[1]);
+
+ mm_id->pid = -1;
- out_kill:
- os_kill_ptraced_process(pid, 1);
return err;
}
+static int unscheduled_userspace_iterations;
+extern unsigned long tt_extra_sched_jiffies;
+
void userspace(struct uml_pt_regs *regs)
{
- struct itimerval timer;
- unsigned long long nsecs, now;
- int err, status, op, pid = userspace_pid[0];
- /* To prevent races if using_sysemu changes under us.*/
- int local_using_sysemu;
- siginfo_t si;
+ int err, status, op;
+ siginfo_t si_local;
+ siginfo_t *si;
+ int sig;
/* Handle any immediate reschedules or signals */
interrupt_end();
- if (getitimer(ITIMER_VIRTUAL, &timer))
- printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
- nsecs = timer.it_value.tv_sec * UM_NSEC_PER_SEC +
- timer.it_value.tv_usec * UM_NSEC_PER_USEC;
- nsecs += os_nsecs();
-
while (1) {
+ struct mm_id *mm_id = current_mm_id();
+
/*
- * This can legitimately fail if the process loads a
- * bogus value into a segment register. It will
- * segfault and PTRACE_GETREGS will read that value
- * out of the process. However, PTRACE_SETREGS will
- * fail. In this case, there is nothing to do but
- * just kill the process.
+ * At any given time, only one CPU thread can enter the
+ * turnstile to operate on the same stub process, including
+ * executing stub system calls (mmap and munmap).
*/
- if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp))
- fatal_sigsegv();
+ enter_turnstile(mm_id);
- if (put_fp_registers(pid, regs->fp))
- fatal_sigsegv();
+ /*
+ * When we are in time-travel mode, userspace can theoretically
+ * do a *lot* of work without being scheduled. The problem with
+ * this is that it will prevent kernel bookkeeping (primarily
+ * the RCU) from running and this can for example cause OOM
+ * situations.
+ *
+ * This code accounts a jiffie against the scheduling clock
+ * after the defined userspace iterations in the same thread.
+ * By doing so the situation is effectively prevented.
+ */
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL) {
+#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS
+ if (CONFIG_UML_MAX_USERSPACE_ITERATIONS &&
+ unscheduled_userspace_iterations++ >
+ CONFIG_UML_MAX_USERSPACE_ITERATIONS) {
+ tt_extra_sched_jiffies += 1;
+ unscheduled_userspace_iterations = 0;
+ }
+#endif
+ }
- /* Now we set local_using_sysemu to be used for one loop */
- local_using_sysemu = get_using_sysemu();
+ time_travel_print_bc_msg();
- op = SELECT_PTRACE_OPERATION(local_using_sysemu,
- singlestepping(NULL));
+ current_mm_sync();
- if (ptrace(op, pid, 0, 0)) {
- printk(UM_KERN_ERR "userspace - ptrace continue "
- "failed, op = %d, errno = %d\n", op, errno);
- fatal_sigsegv();
- }
+ if (using_seccomp) {
+ struct stub_data *proc_data = (void *) mm_id->stack;
- CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
- if (err < 0) {
- printk(UM_KERN_ERR "userspace - wait failed, "
- "errno = %d\n", errno);
- fatal_sigsegv();
- }
+ err = set_stub_state(regs, proc_data, singlestepping());
+ if (err) {
+ printk(UM_KERN_ERR "%s - failed to set regs: %d",
+ __func__, err);
+ fatal_sigsegv();
+ }
- regs->is_user = 1;
- if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
- printk(UM_KERN_ERR "userspace - PTRACE_GETREGS failed, "
- "errno = %d\n", errno);
- fatal_sigsegv();
- }
+ /* Must have been reset by the syscall caller */
+ if (proc_data->restart_wait != 0)
+ panic("Programming error: Flag to only run syscalls in child was not cleared!");
- if (get_fp_registers(pid, regs->fp)) {
- printk(UM_KERN_ERR "userspace - get_fp_registers failed, "
- "errno = %d\n", errno);
- fatal_sigsegv();
- }
+ /* Mark pending syscalls for flushing */
+ proc_data->syscall_data_len = mm_id->syscall_data_len;
- UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+ wait_stub_done_seccomp(mm_id, 0, 0);
- if (WIFSTOPPED(status)) {
- int sig = WSTOPSIG(status);
+ sig = proc_data->signal;
- ptrace(PTRACE_GETSIGINFO, pid, 0, &si);
+ if (sig == SIGTRAP && proc_data->err != 0) {
+ printk(UM_KERN_ERR "%s - Error flushing stub syscalls",
+ __func__);
+ syscall_stub_dump_error(mm_id);
+ mm_id->syscall_data_len = proc_data->err;
+ fatal_sigsegv();
+ }
- switch (sig) {
- case SIGSEGV:
- if (PTRACE_FULL_FAULTINFO ||
- !ptrace_faultinfo) {
+ mm_id->syscall_data_len = 0;
+ mm_id->syscall_fd_num = 0;
+
+ err = get_stub_state(regs, proc_data, NULL);
+ if (err) {
+ printk(UM_KERN_ERR "%s - failed to get regs: %d",
+ __func__, err);
+ fatal_sigsegv();
+ }
+
+ if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si))
+ panic("%s - Invalid siginfo offset from child", __func__);
+
+ si = &si_local;
+ memcpy(si, &proc_data->sigstack[proc_data->si_offset], sizeof(*si));
+
+ regs->is_user = 1;
+
+ /* Fill in ORIG_RAX and extract fault information */
+ PT_SYSCALL_NR(regs->gp) = si->si_syscall;
+ if (sig == SIGSEGV) {
+ mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset];
+
+ GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext);
+ }
+ } else {
+ int pid = mm_id->pid;
+
+ /* Flush out any pending syscalls */
+ err = syscall_stub_flush(mm_id);
+ if (err) {
+ if (err == -ENOMEM)
+ report_enomem();
+
+ printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d",
+ __func__, -err);
+ fatal_sigsegv();
+ }
+
+ /*
+ * This can legitimately fail if the process loads a
+ * bogus value into a segment register. It will
+ * segfault and PTRACE_GETREGS will read that value
+ * out of the process. However, PTRACE_SETREGS will
+ * fail. In this case, there is nothing to do but
+ * just kill the process.
+ */
+ if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
+ printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n",
+ __func__, errno);
+ fatal_sigsegv();
+ }
+
+ if (put_fp_registers(pid, regs->fp)) {
+ printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n",
+ __func__, errno);
+ fatal_sigsegv();
+ }
+
+ if (singlestepping())
+ op = PTRACE_SYSEMU_SINGLESTEP;
+ else
+ op = PTRACE_SYSEMU;
+
+ if (ptrace(op, pid, 0, 0)) {
+ printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n",
+ __func__, op, errno);
+ fatal_sigsegv();
+ }
+
+ CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
+ if (err < 0) {
+ printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
+ __func__, errno);
+ fatal_sigsegv();
+ }
+
+ regs->is_user = 1;
+ if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
+ printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n",
+ __func__, errno);
+ fatal_sigsegv();
+ }
+
+ if (get_fp_registers(pid, regs->fp)) {
+ printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n",
+ __func__, errno);
+ fatal_sigsegv();
+ }
+
+ if (WIFSTOPPED(status)) {
+ sig = WSTOPSIG(status);
+
+ /*
+ * These signal handlers need the si argument
+ * and SIGSEGV needs the faultinfo.
+ * The SIGIO and SIGALARM handlers which constitute
+ * the majority of invocations, do not use it.
+ */
+ switch (sig) {
+ case SIGSEGV:
get_skas_faultinfo(pid,
&regs->faultinfo);
- (*sig_info[SIGSEGV])(SIGSEGV, &si,
- regs);
+ fallthrough;
+ case SIGTRAP:
+ case SIGILL:
+ case SIGBUS:
+ case SIGFPE:
+ case SIGWINCH:
+ ptrace(PTRACE_GETSIGINFO, pid, 0,
+ (struct siginfo *)&si_local);
+ si = &si_local;
+ break;
+ default:
+ si = NULL;
+ break;
}
- else handle_segv(pid, regs);
+ } else {
+ sig = 0;
+ }
+ }
+
+ exit_turnstile(mm_id);
+
+ UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+
+ if (sig) {
+ switch (sig) {
+ case SIGSEGV:
+ if (using_seccomp || PTRACE_FULL_FAULTINFO)
+ (*sig_info[SIGSEGV])(SIGSEGV,
+ (struct siginfo *)si,
+ regs, NULL);
+ else
+ segv(regs->faultinfo, 0, 1, NULL, NULL);
+
+ break;
+ case SIGSYS:
+ handle_syscall(regs);
break;
case SIGTRAP + 0x80:
- handle_trap(pid, regs, local_using_sysemu);
+ handle_trap(regs);
break;
case SIGTRAP:
- relay_signal(SIGTRAP, &si, regs);
+ relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL);
break;
- case SIGVTALRM:
- now = os_nsecs();
- if (now < nsecs)
- break;
- block_signals();
- (*sig_info[sig])(sig, &si, regs);
- unblock_signals();
- nsecs = timer.it_value.tv_sec *
- UM_NSEC_PER_SEC +
- timer.it_value.tv_usec *
- UM_NSEC_PER_USEC;
- nsecs += os_nsecs();
+ case SIGALRM:
break;
case SIGIO:
case SIGILL:
case SIGBUS:
case SIGFPE:
case SIGWINCH:
- block_signals();
- (*sig_info[sig])(sig, &si, regs);
- unblock_signals();
+ block_signals_trace();
+ (*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL);
+ unblock_signals_trace();
break;
default:
- printk(UM_KERN_ERR "userspace - child stopped "
- "with signal %d\n", sig);
+ printk(UM_KERN_ERR "%s - child stopped with signal %d\n",
+ __func__, sig);
fatal_sigsegv();
}
- pid = userspace_pid[0];
interrupt_end();
/* Avoid -ERESTARTSYS handling in host */
@@ -465,173 +790,6 @@ void userspace(struct uml_pt_regs *regs)
}
}
-static unsigned long thread_regs[MAX_REG_NR];
-static unsigned long thread_fp_regs[FP_SIZE];
-
-static int __init init_thread_regs(void)
-{
- get_safe_registers(thread_regs, thread_fp_regs);
- /* Set parent's instruction pointer to start of clone-stub */
- thread_regs[REGS_IP_INDEX] = STUB_CODE +
- (unsigned long) stub_clone_handler -
- (unsigned long) &__syscall_stub_start;
- thread_regs[REGS_SP_INDEX] = STUB_DATA + UM_KERN_PAGE_SIZE -
- sizeof(void *);
-#ifdef __SIGNAL_FRAMESIZE
- thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE;
-#endif
- return 0;
-}
-
-__initcall(init_thread_regs);
-
-int copy_context_skas0(unsigned long new_stack, int pid)
-{
- struct timeval tv = { .tv_sec = 0, .tv_usec = UM_USEC_PER_SEC / UM_HZ };
- int err;
- unsigned long current_stack = current_stub_stack();
- struct stub_data *data = (struct stub_data *) current_stack;
- struct stub_data *child_data = (struct stub_data *) new_stack;
- unsigned long long new_offset;
- int new_fd = phys_mapping(to_phys((void *)new_stack), &new_offset);
-
- /*
- * prepare offset and fd of child's stack as argument for parent's
- * and child's mmap2 calls
- */
- *data = ((struct stub_data) { .offset = MMAP_OFFSET(new_offset),
- .fd = new_fd,
- .timer = ((struct itimerval)
- { .it_value = tv,
- .it_interval = tv }) });
-
- err = ptrace_setregs(pid, thread_regs);
- if (err < 0) {
- err = -errno;
- printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_SETREGS "
- "failed, pid = %d, errno = %d\n", pid, -err);
- return err;
- }
-
- err = put_fp_registers(pid, thread_fp_regs);
- if (err < 0) {
- printk(UM_KERN_ERR "copy_context_skas0 : put_fp_registers "
- "failed, pid = %d, err = %d\n", pid, err);
- return err;
- }
-
- /* set a well known return code for detection of child write failure */
- child_data->err = 12345678;
-
- /*
- * Wait, until parent has finished its work: read child's pid from
- * parent's stack, and check, if bad result.
- */
- err = ptrace(PTRACE_CONT, pid, 0, 0);
- if (err) {
- err = -errno;
- printk(UM_KERN_ERR "Failed to continue new process, pid = %d, "
- "errno = %d\n", pid, errno);
- return err;
- }
-
- wait_stub_done(pid);
-
- pid = data->err;
- if (pid < 0) {
- printk(UM_KERN_ERR "copy_context_skas0 - stub-parent reports "
- "error %d\n", -pid);
- return pid;
- }
-
- /*
- * Wait, until child has finished too: read child's result from
- * child's stack and check it.
- */
- wait_stub_done(pid);
- if (child_data->err != STUB_DATA) {
- printk(UM_KERN_ERR "copy_context_skas0 - stub-child reports "
- "error %ld\n", child_data->err);
- err = child_data->err;
- goto out_kill;
- }
-
- if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
- (void *)PTRACE_O_TRACESYSGOOD) < 0) {
- err = -errno;
- printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_OLDSETOPTIONS "
- "failed, errno = %d\n", errno);
- goto out_kill;
- }
-
- return pid;
-
- out_kill:
- os_kill_ptraced_process(pid, 1);
- return err;
-}
-
-/*
- * This is used only, if stub pages are needed, while proc_mm is
- * available. Opening /proc/mm creates a new mm_context, which lacks
- * the stub-pages. Thus, we map them using /proc/mm-fd
- */
-int map_stub_pages(int fd, unsigned long code, unsigned long data,
- unsigned long stack)
-{
- struct proc_mm_op mmop;
- int n;
- unsigned long long code_offset;
- int code_fd = phys_mapping(to_phys((void *) &__syscall_stub_start),
- &code_offset);
-
- mmop = ((struct proc_mm_op) { .op = MM_MMAP,
- .u =
- { .mmap =
- { .addr = code,
- .len = UM_KERN_PAGE_SIZE,
- .prot = PROT_EXEC,
- .flags = MAP_FIXED | MAP_PRIVATE,
- .fd = code_fd,
- .offset = code_offset
- } } });
- CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop)));
- if (n != sizeof(mmop)) {
- n = errno;
- printk(UM_KERN_ERR "mmap args - addr = 0x%lx, fd = %d, "
- "offset = %llx\n", code, code_fd,
- (unsigned long long) code_offset);
- printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for code "
- "failed, err = %d\n", n);
- return -n;
- }
-
- if (stack) {
- unsigned long long map_offset;
- int map_fd = phys_mapping(to_phys((void *)stack), &map_offset);
- mmop = ((struct proc_mm_op)
- { .op = MM_MMAP,
- .u =
- { .mmap =
- { .addr = data,
- .len = UM_KERN_PAGE_SIZE,
- .prot = PROT_READ | PROT_WRITE,
- .flags = MAP_FIXED | MAP_SHARED,
- .fd = map_fd,
- .offset = map_offset
- } } });
- CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop)));
- if (n != sizeof(mmop)) {
- n = errno;
- printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for "
- "data failed, err = %d\n", n);
- return -n;
- }
- }
-
- return 0;
-}
-
void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
{
(*buf)[0].JB_IP = (unsigned long) handler;
@@ -646,16 +804,17 @@ void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
void switch_threads(jmp_buf *me, jmp_buf *you)
{
+ unscheduled_userspace_iterations = 0;
+
if (UML_SETJMP(me) == 0)
UML_LONGJMP(you, 1);
}
static jmp_buf initial_jmpbuf;
-/* XXX Make these percpu */
-static void (*cb_proc)(void *arg);
-static void *cb_arg;
-static jmp_buf *cb_back;
+static __thread void (*cb_proc)(void *arg);
+static __thread void *cb_arg;
+static __thread jmp_buf *cb_back;
int start_idle_thread(void *stack, jmp_buf *switch_buf)
{
@@ -674,7 +833,7 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf)
n = setjmp(initial_jmpbuf);
switch (n) {
case INIT_JMP_NEW_THREAD:
- (*switch_buf)[0].JB_IP = (unsigned long) new_thread_handler;
+ (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup;
(*switch_buf)[0].JB_SP = (unsigned long) stack +
UM_THREAD_SIZE - sizeof(void *);
break;
@@ -689,11 +848,16 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf)
kmalloc_ok = 0;
return 1;
default:
- printk(UM_KERN_ERR "Bad sigsetjmp return in "
- "start_idle_thread - %d\n", n);
+ printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n",
+ __func__, n);
fatal_sigsegv();
}
longjmp(*switch_buf, 1);
+
+ /* unreachable */
+ printk(UM_KERN_ERR "impossible long jump!");
+ fatal_sigsegv();
+ return 0;
}
void initial_thread_cb_skas(void (*proc)(void *), void *arg)
@@ -704,10 +868,10 @@ void initial_thread_cb_skas(void (*proc)(void *), void *arg)
cb_arg = arg;
cb_back = &here;
- block_signals();
+ initial_jmpbuf_lock();
if (UML_SETJMP(&here) == 0)
UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK);
- unblock_signals();
+ initial_jmpbuf_unlock();
cb_proc = NULL;
cb_arg = NULL;
@@ -716,29 +880,29 @@ void initial_thread_cb_skas(void (*proc)(void *), void *arg)
void halt_skas(void)
{
- block_signals();
+ initial_jmpbuf_lock();
UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT);
+ /* unreachable */
}
-void reboot_skas(void)
+static bool noreboot;
+
+static int __init noreboot_cmd_param(char *str, int *add)
{
- block_signals();
- UML_LONGJMP(&initial_jmpbuf, INIT_JMP_REBOOT);
+ *add = 0;
+ noreboot = true;
+ return 0;
}
-void __switch_mm(struct mm_id *mm_idp)
-{
- int err;
+__uml_setup("noreboot", noreboot_cmd_param,
+"noreboot\n"
+" Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n"
+" This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n"
+" crashes in CI\n\n");
- /* FIXME: need cpu pid in __switch_mm */
- if (proc_mm) {
- err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0,
- mm_idp->u.mm_fd);
- if (err) {
- printk(UM_KERN_ERR "__switch_mm - PTRACE_SWITCH_MM "
- "failed, errno = %d\n", errno);
- fatal_sigsegv();
- }
- }
- else userspace_pid[0] = mm_idp->u.pid;
+void reboot_skas(void)
+{
+ initial_jmpbuf_lock();
+ UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT);
+ /* unreachable */
}
diff --git a/arch/um/os-Linux/smp.c b/arch/um/os-Linux/smp.c
new file mode 100644
index 000000000000..18d3858a7cd2
--- /dev/null
+++ b/arch/um/os-Linux/smp.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie <tiwei.btw@antgroup.com>
+ */
+
+#include <errno.h>
+#include <pthread.h>
+#include <signal.h>
+#include <kern_util.h>
+#include <um_malloc.h>
+#include <init.h>
+#include <os.h>
+#include <smp.h>
+#include "internal.h"
+
+struct cpu_thread_data {
+ int cpu;
+ sigset_t sigset;
+};
+
+static __thread int __curr_cpu;
+
+int uml_curr_cpu(void)
+{
+ return __curr_cpu;
+}
+
+static pthread_t cpu_threads[CONFIG_NR_CPUS];
+
+static void *cpu_thread(void *arg)
+{
+ struct cpu_thread_data *data = arg;
+
+ __curr_cpu = data->cpu;
+
+ uml_start_secondary(data);
+
+ return NULL;
+}
+
+int os_start_cpu_thread(int cpu)
+{
+ struct cpu_thread_data *data;
+ sigset_t sigset, oset;
+ int err;
+
+ data = uml_kmalloc(sizeof(*data), UM_GFP_ATOMIC);
+ if (!data)
+ return -ENOMEM;
+
+ sigfillset(&sigset);
+ if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) {
+ err = errno;
+ goto err;
+ }
+
+ data->cpu = cpu;
+ data->sigset = oset;
+
+ err = pthread_create(&cpu_threads[cpu], NULL, cpu_thread, data);
+ if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0)
+ panic("Failed to restore the signal mask, errno = %d", errno);
+ if (err != 0)
+ goto err;
+
+ return 0;
+
+err:
+ kfree(data);
+ return -err;
+}
+
+void os_start_secondary(void *arg, jmp_buf *switch_buf)
+{
+ struct cpu_thread_data *data = arg;
+
+ sigaddset(&data->sigset, IPI_SIGNAL);
+ sigaddset(&data->sigset, SIGIO);
+
+ if (sigprocmask(SIG_SETMASK, &data->sigset, NULL) < 0)
+ panic("Failed to restore the signal mask, errno = %d", errno);
+
+ kfree(data);
+ longjmp(*switch_buf, 1);
+
+ /* unreachable */
+ printk(UM_KERN_ERR "impossible long jump!");
+ fatal_sigsegv();
+}
+
+int os_send_ipi(int cpu, int vector)
+{
+ union sigval value = { .sival_int = vector };
+
+ return pthread_sigqueue(cpu_threads[cpu], IPI_SIGNAL, value);
+}
+
+static void __local_ipi_set(int enable)
+{
+ sigset_t sigset;
+
+ sigemptyset(&sigset);
+ sigaddset(&sigset, IPI_SIGNAL);
+
+ if (sigprocmask(enable ? SIG_UNBLOCK : SIG_BLOCK, &sigset, NULL) < 0)
+ panic("%s: sigprocmask failed, errno = %d", __func__, errno);
+}
+
+void os_local_ipi_enable(void)
+{
+ __local_ipi_set(1);
+}
+
+void os_local_ipi_disable(void)
+{
+ __local_ipi_set(0);
+}
+
+static void ipi_sig_handler(int sig, siginfo_t *si, void *uc)
+{
+ int save_errno = errno;
+
+ signals_enabled = 0;
+ um_trace_signals_off();
+
+ uml_ipi_handler(si->si_value.sival_int);
+
+ um_trace_signals_on();
+ signals_enabled = 1;
+
+ errno = save_errno;
+}
+
+void __init os_init_smp(void)
+{
+ struct sigaction action = {
+ .sa_sigaction = ipi_sig_handler,
+ .sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART,
+ };
+
+ sigfillset(&action.sa_mask);
+
+ if (sigaction(IPI_SIGNAL, &action, NULL) < 0)
+ panic("%s: sigaction failed, errno = %d", __func__, errno);
+
+ cpu_threads[0] = pthread_self();
+}
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 337518c5042a..054ac03bbf5e 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdio.h>
@@ -17,14 +18,24 @@
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/resource.h>
+#include <asm/ldt.h>
#include <asm/unistd.h>
#include <init.h>
#include <os.h>
+#include <smp.h>
+#include <kern_util.h>
#include <mem_user.h>
#include <ptrace_user.h>
+#include <stdbool.h>
+#include <stub-data.h>
+#include <sys/prctl.h>
+#include <linux/seccomp.h>
+#include <linux/filter.h>
+#include <sysdep/mcontext.h>
+#include <sysdep/stub.h>
#include <registers.h>
#include <skas.h>
-#include <skas_ptrace.h>
+#include "internal.h"
static void ptrace_child(void)
{
@@ -95,6 +106,8 @@ static int start_ptraced_child(void)
{
int pid, n, status;
+ fflush(stdout);
+
pid = fork();
if (pid == 0)
ptrace_child();
@@ -111,140 +124,32 @@ static int start_ptraced_child(void)
return pid;
}
-/* When testing for SYSEMU support, if it is one of the broken versions, we
- * must just avoid using sysemu, not panic, but only if SYSEMU features are
- * broken.
- * So only for SYSEMU features we test mustpanic, while normal host features
- * must work anyway!
- */
-static int stop_ptraced_child(int pid, int exitcode, int mustexit)
+static void stop_ptraced_child(int pid, int exitcode)
{
- int status, n, ret = 0;
+ int status, n;
+
+ if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
+ fatal_perror("stop_ptraced_child : ptrace failed");
- if (ptrace(PTRACE_CONT, pid, 0, 0) < 0) {
- perror("stop_ptraced_child : ptrace failed");
- return -1;
- }
CATCH_EINTR(n = waitpid(pid, &status, 0));
if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
int exit_with = WEXITSTATUS(status);
- if (exit_with == 2)
- non_fatal("check_ptrace : child exited with status 2. "
- "\nDisabling SYSEMU support.\n");
- non_fatal("check_ptrace : child exited with exitcode %d, while "
- "expecting %d; status 0x%x\n", exit_with,
- exitcode, status);
- if (mustexit)
- exit(1);
- ret = -1;
+ fatal("stop_ptraced_child : child exited with exitcode %d, "
+ "while expecting %d; status 0x%x\n", exit_with,
+ exitcode, status);
}
-
- return ret;
-}
-
-/* Changed only during early boot */
-int ptrace_faultinfo;
-static int disable_ptrace_faultinfo;
-
-int ptrace_ldt;
-static int disable_ptrace_ldt;
-
-int proc_mm;
-static int disable_proc_mm;
-
-int have_switch_mm;
-static int disable_switch_mm;
-
-int skas_needs_stub;
-
-static int __init skas0_cmd_param(char *str, int* add)
-{
- disable_ptrace_faultinfo = 1;
- disable_ptrace_ldt = 1;
- disable_proc_mm = 1;
- disable_switch_mm = 1;
-
- return 0;
}
-/* The two __uml_setup would conflict, without this stupid alias. */
-
-static int __init mode_skas0_cmd_param(char *str, int* add)
- __attribute__((alias("skas0_cmd_param")));
-
-__uml_setup("skas0", skas0_cmd_param,
-"skas0\n"
-" Disables SKAS3 and SKAS4 usage, so that SKAS0 is used\n\n");
-
-__uml_setup("mode=skas0", mode_skas0_cmd_param,
-"mode=skas0\n"
-" Disables SKAS3 and SKAS4 usage, so that SKAS0 is used.\n\n");
-
-/* Changed only during early boot */
-static int force_sysemu_disabled = 0;
-
-static int __init nosysemu_cmd_param(char *str, int* add)
-{
- force_sysemu_disabled = 1;
- return 0;
-}
-
-__uml_setup("nosysemu", nosysemu_cmd_param,
-"nosysemu\n"
-" Turns off syscall emulation patch for ptrace (SYSEMU) on.\n"
-" SYSEMU is a performance-patch introduced by Laurent Vivier. It changes\n"
-" behaviour of ptrace() and helps reducing host context switch rate.\n"
-" To make it working, you need a kernel patch for your host, too.\n"
-" See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n"
-" information.\n\n");
-
static void __init check_sysemu(void)
{
- unsigned long regs[MAX_REG_NR];
int pid, n, status, count=0;
- non_fatal("Checking syscall emulation patch for ptrace...");
- sysemu_supported = 0;
+ os_info("Checking syscall emulation for ptrace...");
pid = start_ptraced_child();
- if (ptrace(PTRACE_SYSEMU, pid, 0, 0) < 0)
- goto fail;
-
- CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
- if (n < 0)
- fatal_perror("check_sysemu : wait failed");
- if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP))
- fatal("check_sysemu : expected SIGTRAP, got status = %d\n",
- status);
-
- if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0)
- fatal_perror("check_sysemu : PTRACE_GETREGS failed");
- if (PT_SYSCALL_NR(regs) != __NR_getpid) {
- non_fatal("check_sysemu got system call number %d, "
- "expected %d...", PT_SYSCALL_NR(regs), __NR_getpid);
- goto fail;
- }
-
- n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, os_getpid());
- if (n < 0) {
- non_fatal("check_sysemu : failed to modify system call "
- "return");
- goto fail;
- }
-
- if (stop_ptraced_child(pid, 0, 0) < 0)
- goto fail_stopped;
-
- sysemu_supported = 1;
- non_fatal("OK\n");
- set_using_sysemu(!force_sysemu_disabled);
-
- non_fatal("Checking advanced syscall emulation patch for ptrace...");
- pid = start_ptraced_child();
-
- if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0,
+ if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
(void *) PTRACE_O_TRACESYSGOOD) < 0))
- fatal_perror("check_sysemu: PTRACE_OLDSETOPTIONS failed");
+ fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed");
while (1) {
count++;
@@ -277,32 +182,27 @@ static void __init check_sysemu(void)
goto fail;
}
}
- if (stop_ptraced_child(pid, 0, 0) < 0)
- goto fail_stopped;
+ stop_ptraced_child(pid, 0);
- sysemu_supported = 2;
- non_fatal("OK\n");
+ os_info("OK\n");
- if (!force_sysemu_disabled)
- set_using_sysemu(sysemu_supported);
return;
fail:
- stop_ptraced_child(pid, 1, 0);
-fail_stopped:
- non_fatal("missing\n");
+ stop_ptraced_child(pid, 1);
+ fatal("missing\n");
}
static void __init check_ptrace(void)
{
int pid, syscall, n, status;
- non_fatal("Checking that ptrace can change system call numbers...");
+ os_info("Checking that ptrace can change system call numbers...");
pid = start_ptraced_child();
- if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0,
+ if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
(void *) PTRACE_O_TRACESYSGOOD) < 0))
- fatal_perror("check_ptrace: PTRACE_OLDSETOPTIONS failed");
+ fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed");
while (1) {
if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
@@ -328,215 +228,268 @@ static void __init check_ptrace(void)
break;
}
}
- stop_ptraced_child(pid, 0, 1);
- non_fatal("OK\n");
+ stop_ptraced_child(pid, 0);
+ os_info("OK\n");
check_sysemu();
}
-extern void check_tmpexec(void);
+extern unsigned long host_fp_size;
+extern unsigned long exec_regs[MAX_REG_NR];
+extern unsigned long *exec_fp_regs;
-static void __init check_coredump_limit(void)
+__initdata static struct stub_data *seccomp_test_stub_data;
+
+static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
{
- struct rlimit lim;
- int err = getrlimit(RLIMIT_CORE, &lim);
+ ucontext_t *uc = p;
- if (err) {
- perror("Getting core dump limit");
- return;
- }
+ /* Stow away the location of the mcontext in the stack */
+ seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext -
+ (unsigned long)&seccomp_test_stub_data->sigstack[0];
- printf("Core dump limits :\n\tsoft - ");
- if (lim.rlim_cur == RLIM_INFINITY)
- printf("NONE\n");
- else printf("%lu\n", lim.rlim_cur);
+ /* Prevent libc from clearing memory (mctx_offset in particular) */
+ syscall(__NR_exit, 0);
+}
- printf("\thard - ");
- if (lim.rlim_max == RLIM_INFINITY)
- printf("NONE\n");
- else printf("%lu\n", lim.rlim_max);
+static int __init seccomp_helper(void *data)
+{
+ static struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
+ };
+ static struct sock_fprog prog = {
+ .len = ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ struct sigaction sa;
+
+ /* close_range is needed for the stub */
+ if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
+ exit(1);
+
+ set_sigstack(seccomp_test_stub_data->sigstack,
+ sizeof(seccomp_test_stub_data->sigstack));
+
+ sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
+ sa.sa_sigaction = (void *) sigsys_handler;
+ sa.sa_restorer = NULL;
+ if (sigaction(SIGSYS, &sa, NULL) < 0)
+ exit(2);
+
+ prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
+ SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
+ exit(3);
+
+ sleep(0);
+
+ /* Never reached. */
+ _exit(4);
}
-void __init os_early_checks(void)
+static bool __init init_seccomp(void)
{
int pid;
+ int status;
+ int n;
+ unsigned long sp;
- /* Print out the core dump limits early */
- check_coredump_limit();
+ /*
+ * We check that we can install a seccomp filter and then exit(0)
+ * from a trapped syscall.
+ *
+ * Note that we cannot verify that no seccomp filter already exists
+ * for a syscall that results in the process/thread to be killed.
+ */
- check_ptrace();
+ os_info("Checking that seccomp filters can be installed...");
- /* Need to check this early because mmapping happens before the
- * kernel is running.
- */
- check_tmpexec();
+ seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANON, 0, 0);
- pid = start_ptraced_child();
- if (init_registers(pid))
- fatal("Failed to initialize default registers");
- stop_ptraced_child(pid, 1, 1);
-}
+ /* Use the syscall data area as stack, we just need something */
+ sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
+ sizeof(seccomp_test_stub_data->syscall_data) -
+ sizeof(void *);
+ pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);
-static int __init noprocmm_cmd_param(char *str, int* add)
-{
- disable_proc_mm = 1;
- return 0;
-}
+ if (pid < 0)
+ fatal_perror("check_seccomp : clone failed");
-__uml_setup("noprocmm", noprocmm_cmd_param,
-"noprocmm\n"
-" Turns off usage of /proc/mm, even if host supports it.\n"
-" To support /proc/mm, the host needs to be patched using\n"
-" the current skas3 patch.\n\n");
+ CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
+ if (n < 0)
+ fatal_perror("check_seccomp : waitpid failed");
-static int __init noptracefaultinfo_cmd_param(char *str, int* add)
-{
- disable_ptrace_faultinfo = 1;
- return 0;
-}
+ if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+ struct uml_pt_regs *regs;
+ unsigned long fp_size;
+ int r;
-__uml_setup("noptracefaultinfo", noptracefaultinfo_cmd_param,
-"noptracefaultinfo\n"
-" Turns off usage of PTRACE_FAULTINFO, even if host supports\n"
-" it. To support PTRACE_FAULTINFO, the host needs to be patched\n"
-" using the current skas3 patch.\n\n");
+ /* Fill in the host_fp_size from the mcontext. */
+ regs = calloc(1, sizeof(struct uml_pt_regs));
+ get_stub_state(regs, seccomp_test_stub_data, &fp_size);
+ host_fp_size = fp_size;
+ free(regs);
-static int __init noptraceldt_cmd_param(char *str, int* add)
-{
- disable_ptrace_ldt = 1;
- return 0;
-}
+ /* Repeat with the correct size */
+ regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
+ r = get_stub_state(regs, seccomp_test_stub_data, NULL);
-__uml_setup("noptraceldt", noptraceldt_cmd_param,
-"noptraceldt\n"
-" Turns off usage of PTRACE_LDT, even if host supports it.\n"
-" To support PTRACE_LDT, the host needs to be patched using\n"
-" the current skas3 patch.\n\n");
+ /* Store as the default startup registers */
+ exec_fp_regs = malloc(host_fp_size);
+ memcpy(exec_regs, regs->gp, sizeof(exec_regs));
+ memcpy(exec_fp_regs, regs->fp, host_fp_size);
-static inline void check_skas3_ptrace_faultinfo(void)
-{
- struct ptrace_faultinfo fi;
- int pid, n;
+ munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
- non_fatal(" - PTRACE_FAULTINFO...");
- pid = start_ptraced_child();
+ free(regs);
+
+ if (r) {
+ os_info("failed to fetch registers: %d\n", r);
+ return false;
+ }
- n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi);
- if (n < 0) {
- if (errno == EIO)
- non_fatal("not found\n");
- else
- perror("not found");
- } else if (disable_ptrace_faultinfo)
- non_fatal("found but disabled on command line\n");
- else {
- ptrace_faultinfo = 1;
- non_fatal("found\n");
+ os_info("OK\n");
+ return true;
}
- stop_ptraced_child(pid, 1, 1);
+ if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
+ os_info("missing\n");
+ else
+ os_info("error\n");
+
+ munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
+ return false;
}
-static inline void check_skas3_ptrace_ldt(void)
+
+static void __init check_coredump_limit(void)
{
-#ifdef PTRACE_LDT
- int pid, n;
- unsigned char ldtbuf[40];
- struct ptrace_ldt ldt_op = (struct ptrace_ldt) {
- .func = 2, /* read default ldt */
- .ptr = ldtbuf,
- .bytecount = sizeof(ldtbuf)};
-
- non_fatal(" - PTRACE_LDT...");
- pid = start_ptraced_child();
+ struct rlimit lim;
+ int err = getrlimit(RLIMIT_CORE, &lim);
- n = ptrace(PTRACE_LDT, pid, 0, (unsigned long) &ldt_op);
- if (n < 0) {
- if (errno == EIO)
- non_fatal("not found\n");
- else
- perror("not found");
- } else if (disable_ptrace_ldt)
- non_fatal("found, but use is disabled\n");
- else {
- ptrace_ldt = 1;
- non_fatal("found\n");
+ if (err) {
+ perror("Getting core dump limit");
+ return;
}
- stop_ptraced_child(pid, 1, 1);
-#endif
+ os_info("Core dump limits :\n\tsoft - ");
+ if (lim.rlim_cur == RLIM_INFINITY)
+ os_info("NONE\n");
+ else
+ os_info("%llu\n", (unsigned long long)lim.rlim_cur);
+
+ os_info("\thard - ");
+ if (lim.rlim_max == RLIM_INFINITY)
+ os_info("NONE\n");
+ else
+ os_info("%llu\n", (unsigned long long)lim.rlim_max);
}
-static inline void check_skas3_proc_mm(void)
+void __init get_host_cpu_features(
+ void (*flags_helper_func)(char *line),
+ void (*cache_helper_func)(char *line))
{
- non_fatal(" - /proc/mm...");
- if (access("/proc/mm", W_OK) < 0)
- perror("not found");
- else if (disable_proc_mm)
- non_fatal("found but disabled on command line\n");
- else {
- proc_mm = 1;
- non_fatal("found\n");
+ FILE *cpuinfo;
+ char *line = NULL;
+ size_t len = 0;
+ int done_parsing = 0;
+
+ cpuinfo = fopen("/proc/cpuinfo", "r");
+ if (cpuinfo == NULL) {
+ os_info("Failed to get host CPU features\n");
+ } else {
+ while ((getline(&line, &len, cpuinfo)) != -1) {
+ if (strstr(line, "flags")) {
+ flags_helper_func(line);
+ done_parsing++;
+ }
+ if (strstr(line, "cache_alignment")) {
+ cache_helper_func(line);
+ done_parsing++;
+ }
+ free(line);
+ line = NULL;
+ if (done_parsing > 1)
+ break;
+ }
+ fclose(cpuinfo);
}
}
-void can_do_skas(void)
-{
- non_fatal("Checking for the skas3 patch in the host:\n");
+static int seccomp_config __initdata;
- check_skas3_proc_mm();
- check_skas3_ptrace_faultinfo();
- check_skas3_ptrace_ldt();
+static int __init uml_seccomp_config(char *line, int *add)
+{
+ *add = 0;
+
+ if (strcmp(line, "off") == 0)
+ seccomp_config = 0;
+ else if (strcmp(line, "auto") == 0)
+ seccomp_config = 1;
+ else if (strcmp(line, "on") == 0)
+ seccomp_config = 2;
+ else
+ fatal("Invalid seccomp option '%s', expected on/auto/off\n",
+ line);
- if (!proc_mm || !ptrace_faultinfo || !ptrace_ldt)
- skas_needs_stub = 1;
+ return 0;
}
-int __init parse_iomem(char *str, int *add)
+__uml_setup("seccomp=", uml_seccomp_config,
+"seccomp=<on/auto/off>\n"
+" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
+" processes work collaboratively with the kernel instead of being\n"
+" traced using ptrace. All syscalls from the application are caught and\n"
+" redirected using a signal. This signal handler in turn is permitted to\n"
+" do the selected set of syscalls to communicate with the UML kernel and\n"
+" do the required memory management.\n"
+"\n"
+" This method is overall faster than the ptrace based userspace, primarily\n"
+" because it reduces the number of context switches for (minor) page faults.\n"
+"\n"
+" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
+" userspace from reading and writing all physical memory. Userspace\n"
+" processes could also trick the stub into disabling SIGALRM which\n"
+" prevents it from being interrupted for scheduling purposes.\n"
+"\n"
+" This is insecure and should only be used with a trusted userspace\n\n"
+);
+
+void __init os_early_checks(void)
{
- struct iomem_region *new;
- struct stat64 buf;
- char *file, *driver;
- int fd, size;
-
- driver = str;
- file = strchr(str,',');
- if (file == NULL) {
- fprintf(stderr, "parse_iomem : failed to parse iomem\n");
- goto out;
- }
- *file = '\0';
- file++;
- fd = open(file, O_RDWR, 0);
- if (fd < 0) {
- perror("parse_iomem - Couldn't open io file");
- goto out;
- }
+ int pid;
- if (fstat64(fd, &buf) < 0) {
- perror("parse_iomem - cannot stat_fd file");
- goto out_close;
- }
+ /* Print out the core dump limits early */
+ check_coredump_limit();
+
+ /* Need to check this early because mmapping happens before the
+ * kernel is running.
+ */
+ check_tmpexec();
+
+ if (seccomp_config) {
+ if (init_seccomp()) {
+ using_seccomp = 1;
+ return;
+ }
- new = malloc(sizeof(*new));
- if (new == NULL) {
- perror("Couldn't allocate iomem_region struct");
- goto out_close;
+ if (seccomp_config == 2)
+ fatal("SECCOMP userspace requested but not functional!\n");
}
- size = (buf.st_size + UM_KERN_PAGE_SIZE) & ~(UM_KERN_PAGE_SIZE - 1);
+ if (uml_ncpus > 1)
+ fatal("SMP is not supported with PTRACE userspace.\n");
- *new = ((struct iomem_region) { .next = iomem_regions,
- .driver = driver,
- .fd = fd,
- .size = size,
- .phys = 0,
- .virt = 0 });
- iomem_regions = new;
- iomem_size += new->size + UM_KERN_PAGE_SIZE;
+ using_seccomp = 0;
+ check_ptrace();
- return 0;
- out_close:
- close(fd);
- out:
- return 1;
+ pid = start_ptraced_child();
+ if (init_pid_registers(pid))
+ fatal("Failed to initialize default registers");
+ stop_ptraced_child(pid, 1);
}
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
index e9824d5dd7d5..13ebc86918d4 100644
--- a/arch/um/os-Linux/time.c
+++ b/arch/um/os-Linux/time.c
@@ -1,186 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk})
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
+ * Copyright (C) 2012-2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stddef.h>
+#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
+#include <sys/signalfd.h>
#include <sys/time.h>
#include <kern_util.h>
#include <os.h>
+#include <smp.h>
+#include <string.h>
#include "internal.h"
-int set_interval(void)
-{
- int usec = UM_USEC_PER_SEC / UM_HZ;
- struct itimerval interval = ((struct itimerval) { { 0, usec },
- { 0, usec } });
-
- if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
- return -errno;
+static timer_t event_high_res_timer[CONFIG_NR_CPUS] = { 0 };
- return 0;
+static inline long long timespec_to_ns(const struct timespec *ts)
+{
+ return ((long long) ts->tv_sec * UM_NSEC_PER_SEC) + ts->tv_nsec;
}
-int timer_one_shot(int ticks)
+long long os_persistent_clock_emulation(void)
{
- unsigned long usec = ticks * UM_USEC_PER_SEC / UM_HZ;
- unsigned long sec = usec / UM_USEC_PER_SEC;
- struct itimerval interval;
-
- usec %= UM_USEC_PER_SEC;
- interval = ((struct itimerval) { { 0, 0 }, { sec, usec } });
+ struct timespec realtime_tp;
- if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
- return -errno;
-
- return 0;
+ clock_gettime(CLOCK_REALTIME, &realtime_tp);
+ return timespec_to_ns(&realtime_tp);
}
+#ifndef sigev_notify_thread_id
+#define sigev_notify_thread_id _sigev_un._tid
+#endif
+
/**
- * timeval_to_ns - Convert timeval to nanoseconds
- * @ts: pointer to the timeval variable to be converted
- *
- * Returns the scalar nanosecond representation of the timeval
- * parameter.
- *
- * Ripped from linux/time.h because it's a kernel header, and thus
- * unusable from here.
+ * os_timer_create() - create an new posix (interval) timer
*/
-static inline long long timeval_to_ns(const struct timeval *tv)
+int os_timer_create(void)
{
- return ((long long) tv->tv_sec * UM_NSEC_PER_SEC) +
- tv->tv_usec * UM_NSEC_PER_USEC;
+ int cpu = uml_curr_cpu();
+ timer_t *t = &event_high_res_timer[cpu];
+ struct sigevent sev = {
+ .sigev_notify = SIGEV_THREAD_ID,
+ .sigev_signo = SIGALRM,
+ .sigev_value.sival_ptr = t,
+ .sigev_notify_thread_id = gettid(),
+ };
+
+ if (timer_create(CLOCK_MONOTONIC, &sev, t) == -1)
+ return -1;
+
+ return 0;
}
-long long disable_timer(void)
+int os_timer_set_interval(int cpu, unsigned long long nsecs)
{
- struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } });
- long long remain, max = UM_NSEC_PER_SEC / UM_HZ;
+ struct itimerspec its;
+
+ its.it_value.tv_sec = nsecs / UM_NSEC_PER_SEC;
+ its.it_value.tv_nsec = nsecs % UM_NSEC_PER_SEC;
- if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0)
- printk(UM_KERN_ERR "disable_timer - setitimer failed, "
- "errno = %d\n", errno);
+ its.it_interval.tv_sec = nsecs / UM_NSEC_PER_SEC;
+ its.it_interval.tv_nsec = nsecs % UM_NSEC_PER_SEC;
- remain = timeval_to_ns(&time.it_value);
- if (remain > max)
- remain = max;
+ if (timer_settime(event_high_res_timer[cpu], 0, &its, NULL) == -1)
+ return -errno;
- return remain;
+ return 0;
}
-long long os_nsecs(void)
+int os_timer_one_shot(int cpu, unsigned long long nsecs)
{
- struct timeval tv;
+ struct itimerspec its = {
+ .it_value.tv_sec = nsecs / UM_NSEC_PER_SEC,
+ .it_value.tv_nsec = nsecs % UM_NSEC_PER_SEC,
- gettimeofday(&tv, NULL);
- return timeval_to_ns(&tv);
-}
+ .it_interval.tv_sec = 0,
+ .it_interval.tv_nsec = 0, // we cheat here
+ };
-#ifdef UML_CONFIG_NO_HZ_COMMON
-static int after_sleep_interval(struct timespec *ts)
-{
+ timer_settime(event_high_res_timer[cpu], 0, &its, NULL);
return 0;
}
-static void deliver_alarm(void)
+/**
+ * os_timer_disable() - disable the posix (interval) timer
+ * @cpu: the CPU for which the timer is to be disabled
+ */
+void os_timer_disable(int cpu)
{
- alarm_handler(SIGVTALRM, NULL, NULL);
-}
+ struct itimerspec its;
-static unsigned long long sleep_time(unsigned long long nsecs)
-{
- return nsecs;
+ memset(&its, 0, sizeof(struct itimerspec));
+ timer_settime(event_high_res_timer[cpu], 0, &its, NULL);
}
-#else
-unsigned long long last_tick;
-unsigned long long skew;
-
-static void deliver_alarm(void)
+long long os_nsecs(void)
{
- unsigned long long this_tick = os_nsecs();
- int one_tick = UM_NSEC_PER_SEC / UM_HZ;
-
- /* Protection against the host's time going backwards */
- if ((last_tick != 0) && (this_tick < last_tick))
- this_tick = last_tick;
-
- if (last_tick == 0)
- last_tick = this_tick - one_tick;
-
- skew += this_tick - last_tick;
-
- while (skew >= one_tick) {
- alarm_handler(SIGVTALRM, NULL, NULL);
- skew -= one_tick;
- }
+ struct timespec ts;
- last_tick = this_tick;
+ clock_gettime(CLOCK_MONOTONIC,&ts);
+ return timespec_to_ns(&ts);
}
-static unsigned long long sleep_time(unsigned long long nsecs)
-{
- return nsecs > skew ? nsecs - skew : 0;
-}
+static __thread int wake_signals;
-static inline long long timespec_to_us(const struct timespec *ts)
+void os_idle_prepare(void)
{
- return ((long long) ts->tv_sec * UM_USEC_PER_SEC) +
- ts->tv_nsec / UM_NSEC_PER_USEC;
-}
+ sigset_t set;
-static int after_sleep_interval(struct timespec *ts)
-{
- int usec = UM_USEC_PER_SEC / UM_HZ;
- long long start_usecs = timespec_to_us(ts);
- struct timeval tv;
- struct itimerval interval;
+ sigemptyset(&set);
+ sigaddset(&set, SIGALRM);
+ sigaddset(&set, IPI_SIGNAL);
/*
- * It seems that rounding can increase the value returned from
- * setitimer to larger than the one passed in. Over time,
- * this will cause the remaining time to be greater than the
- * tick interval. If this happens, then just reduce the first
- * tick to the interval value.
+ * We need to use signalfd rather than sigsuspend in idle sleep
+ * because the IPI signal is a real-time signal that carries data,
+ * and unlike handling SIGALRM, we cannot simply flag it in
+ * signals_pending.
*/
- if (start_usecs > usec)
- start_usecs = usec;
-
- start_usecs -= skew / UM_NSEC_PER_USEC;
- if (start_usecs < 0)
- start_usecs = 0;
-
- tv = ((struct timeval) { .tv_sec = start_usecs / UM_USEC_PER_SEC,
- .tv_usec = start_usecs % UM_USEC_PER_SEC });
- interval = ((struct itimerval) { { 0, usec }, tv });
-
- if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
- return -errno;
-
- return 0;
+ wake_signals = signalfd(-1, &set, SFD_CLOEXEC);
+ if (wake_signals < 0)
+ panic("Failed to create signal FD, errno = %d", errno);
}
-#endif
-void idle_sleep(unsigned long long nsecs)
+/**
+ * os_idle_sleep() - sleep until interrupted
+ */
+void os_idle_sleep(void)
{
- struct timespec ts;
+ sigset_t set;
/*
- * nsecs can come in as zero, in which case, this starts a
- * busy loop. To prevent this, reset nsecs to the tick
- * interval if it is zero.
+ * Block SIGALRM while performing the need_resched check.
+ * Note that, because IRQs are disabled, the IPI signal is
+ * already blocked.
*/
- if (nsecs == 0)
- nsecs = UM_NSEC_PER_SEC / UM_HZ;
+ sigemptyset(&set);
+ sigaddset(&set, SIGALRM);
+ sigprocmask(SIG_BLOCK, &set, NULL);
- nsecs = sleep_time(nsecs);
- ts = ((struct timespec) { .tv_sec = nsecs / UM_NSEC_PER_SEC,
- .tv_nsec = nsecs % UM_NSEC_PER_SEC });
+ /*
+ * Because disabling IRQs does not block SIGALRM, it is also
+ * necessary to check for any pending timer alarms.
+ */
+ if (!uml_need_resched() && !timer_alarm_pending())
+ os_poll(1, &wake_signals);
- if (nanosleep(&ts, &ts) == 0)
- deliver_alarm();
- after_sleep_interval(&ts);
+ /* Restore the signal mask. */
+ sigprocmask(SIG_UNBLOCK, &set, NULL);
}
diff --git a/arch/um/os-Linux/tty.c b/arch/um/os-Linux/tty.c
index 721d8afa329b..f784db83e026 100644
--- a/arch/um/os-Linux/tty.c
+++ b/arch/um/os-Linux/tty.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdlib.h>
diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c
index c1dc89261f67..eb523ab1e218 100644
--- a/arch/um/os-Linux/umid.c
+++ b/arch/um/os-Linux/umid.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <stdio.h>
@@ -35,11 +35,12 @@ static int __init make_uml_dir(void)
err = -ENOENT;
if (home == NULL) {
- printk(UM_KERN_ERR "make_uml_dir : no value in "
- "environment for $HOME\n");
+ printk(UM_KERN_ERR
+ "%s: no value in environment for $HOME\n",
+ __func__);
goto err;
}
- strlcpy(dir, home, sizeof(dir));
+ strscpy(dir, home);
uml_dir++;
}
strlcat(dir, uml_dir, sizeof(dir));
@@ -50,13 +51,15 @@ static int __init make_uml_dir(void)
err = -ENOMEM;
uml_dir = malloc(strlen(dir) + 1);
if (uml_dir == NULL) {
- printf("make_uml_dir : malloc failed, errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : malloc failed, errno = %d\n",
+ __func__, errno);
goto err;
}
strcpy(uml_dir, dir);
if ((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)) {
- printf("Failed to mkdir '%s': %s\n", uml_dir, strerror(errno));
+ printk(UM_KERN_ERR "Failed to mkdir '%s': %s\n",
+ uml_dir, strerror(errno));
err = -errno;
goto err_free;
}
@@ -94,7 +97,7 @@ static int remove_files_and_dir(char *dir)
while ((ent = readdir(directory)) != NULL) {
if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
continue;
- len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1;
+ len = strlen(dir) + strlen("/") + strlen(ent->d_name) + 1;
if (len > sizeof(file)) {
ret = -E2BIG;
goto out;
@@ -132,18 +135,16 @@ out:
*/
static inline int is_umdir_used(char *dir)
{
- char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
- char pid[sizeof("nnnnn\0")], *end;
- int dead, fd, p, n, err;
-
- n = snprintf(file, sizeof(file), "%s/pid", dir);
- if (n >= sizeof(file)) {
- printk(UM_KERN_ERR "is_umdir_used - pid filename too long\n");
- err = -E2BIG;
- goto out;
- }
+ char pid[sizeof("nnnnnnnnn")], *end, *file;
+ int fd, p, n, err;
+ size_t filelen = strlen(dir) + sizeof("/pid") + 1;
+
+ file = malloc(filelen);
+ if (!file)
+ return -ENOMEM;
+
+ snprintf(file, filelen, "%s/pid", dir);
- dead = 0;
fd = open(file, O_RDONLY);
if (fd < 0) {
fd = -errno;
@@ -182,6 +183,7 @@ static inline int is_umdir_used(char *dir)
out_close:
close(fd);
out:
+ free(file);
return 0;
}
@@ -207,18 +209,22 @@ static int umdir_take_if_dead(char *dir)
static void __init create_pid_file(void)
{
- char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
- char pid[sizeof("nnnnn\0")];
+ char pid[sizeof("nnnnnnnnn")], *file;
int fd, n;
- if (umid_file_name("pid", file, sizeof(file)))
+ n = strlen(uml_dir) + UMID_LEN + sizeof("/pid");
+ file = malloc(n);
+ if (!file)
return;
+ if (umid_file_name("pid", file, n))
+ goto out;
+
fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644);
if (fd < 0) {
printk(UM_KERN_ERR "Open of machine pid file \"%s\" failed: "
"%s\n", file, strerror(errno));
- return;
+ goto out;
}
snprintf(pid, sizeof(pid), "%d\n", getpid());
@@ -228,6 +234,8 @@ static void __init create_pid_file(void)
errno);
close(fd);
+out:
+ free(file);
}
int __init set_umid(char *name)
@@ -235,7 +243,7 @@ int __init set_umid(char *name)
if (strlen(name) > UMID_LEN - 1)
return -E2BIG;
- strlcpy(umid, name, sizeof(umid));
+ strscpy(umid, name);
return 0;
}
@@ -254,7 +262,7 @@ static int __init make_umid(void)
make_uml_dir();
if (*umid == '\0') {
- strlcpy(tmp, uml_dir, sizeof(tmp));
+ strscpy(tmp, uml_dir);
strlcat(tmp, "XXXXXX", sizeof(tmp));
fd = mkstemp(tmp);
if (fd < 0) {
@@ -350,8 +358,10 @@ char *get_umid(void)
static int __init set_uml_dir(char *name, int *add)
{
+ *add = 0;
+
if (*name == '\0') {
- printf("uml_dir can't be an empty string\n");
+ os_warn("uml_dir can't be an empty string\n");
return 0;
}
@@ -362,7 +372,7 @@ static int __init set_uml_dir(char *name, int *add)
uml_dir = malloc(strlen(name) + 2);
if (uml_dir == NULL) {
- printf("Failed to malloc uml_dir - error = %d\n", errno);
+ os_warn("Failed to malloc uml_dir - error = %d\n", errno);
/*
* Return 0 here because do_initcalls doesn't look at
@@ -382,13 +392,19 @@ __uml_setup("uml_dir=", set_uml_dir,
static void remove_umid_dir(void)
{
- char dir[strlen(uml_dir) + UMID_LEN + 1], err;
+ char *dir, err;
+
+ dir = malloc(strlen(uml_dir) + UMID_LEN + 1);
+ if (!dir)
+ return;
sprintf(dir, "%s%s", uml_dir, umid);
err = remove_files_and_dir(dir);
if (err)
- printf("remove_umid_dir - remove_files_and_dir failed with "
- "err = %d\n", err);
+ os_warn("%s - remove_files_and_dir failed with err = %d\n",
+ __func__, err);
+
+ free(dir);
}
__uml_exitcall(remove_umid_dir);
diff --git a/arch/um/os-Linux/user_syms.c b/arch/um/os-Linux/user_syms.c
index db4a034aeee1..67f6112318b6 100644
--- a/arch/um/os-Linux/user_syms.c
+++ b/arch/um/os-Linux/user_syms.c
@@ -1,120 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+#define __NO_FORTIFY
#include <linux/types.h>
#include <linux/module.h>
-/* Some of this are builtin function (some are not but could in the future),
- * so I *must* declare good prototypes for them and then EXPORT them.
- * The kernel code uses the macro defined by include/linux/string.h,
- * so I undef macros; the userspace code does not include that and I
- * add an EXPORT for the glibc one.
+/*
+ * This file exports some critical string functions and compiler
+ * built-in functions (where calls are emitted by the compiler
+ * itself that we cannot avoid even in kernel code) to modules.
+ *
+ * "_user.c" code that previously used exports here such as hostfs
+ * really should be considered part of the 'hypervisor' and define
+ * its own API boundary like hostfs does now; don't add exports to
+ * this file for such cases.
*/
-#undef strlen
-#undef strstr
-#undef memcpy
-#undef memset
-
-extern size_t strlen(const char *);
-extern void *memmove(void *, const void *, size_t);
-extern void *memset(void *, int, size_t);
-extern int printf(const char *, ...);
-
/* If it's not defined, the export is included in lib/string.c.*/
#ifdef __HAVE_ARCH_STRSTR
+#undef strstr
EXPORT_SYMBOL(strstr);
#endif
#ifndef __x86_64__
+#undef memcpy
extern void *memcpy(void *, const void *, size_t);
EXPORT_SYMBOL(memcpy);
-#endif
-
+extern void *memmove(void *, const void *, size_t);
EXPORT_SYMBOL(memmove);
+#undef memset
+extern void *memset(void *, int, size_t);
EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(printf);
-
-/* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms.
- * However, the modules will use the CRC defined *here*, no matter if it is
- * good; so the versions of these symbols will always match
- */
-#define EXPORT_SYMBOL_PROTO(sym) \
- int sym(void); \
- EXPORT_SYMBOL(sym);
-
-extern void readdir64(void) __attribute__((weak));
-EXPORT_SYMBOL(readdir64);
-extern void truncate64(void) __attribute__((weak));
-EXPORT_SYMBOL(truncate64);
-
-#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA
-EXPORT_SYMBOL(vsyscall_ehdr);
-EXPORT_SYMBOL(vsyscall_end);
#endif
-EXPORT_SYMBOL_PROTO(__errno_location);
-
-EXPORT_SYMBOL_PROTO(access);
-EXPORT_SYMBOL_PROTO(open);
-EXPORT_SYMBOL_PROTO(open64);
-EXPORT_SYMBOL_PROTO(close);
-EXPORT_SYMBOL_PROTO(read);
-EXPORT_SYMBOL_PROTO(write);
-EXPORT_SYMBOL_PROTO(dup2);
-EXPORT_SYMBOL_PROTO(__xstat);
-EXPORT_SYMBOL_PROTO(__lxstat);
-EXPORT_SYMBOL_PROTO(__lxstat64);
-EXPORT_SYMBOL_PROTO(__fxstat64);
-EXPORT_SYMBOL_PROTO(lseek);
-EXPORT_SYMBOL_PROTO(lseek64);
-EXPORT_SYMBOL_PROTO(chown);
-EXPORT_SYMBOL_PROTO(fchown);
-EXPORT_SYMBOL_PROTO(truncate);
-EXPORT_SYMBOL_PROTO(ftruncate64);
-EXPORT_SYMBOL_PROTO(utime);
-EXPORT_SYMBOL_PROTO(utimes);
-EXPORT_SYMBOL_PROTO(futimes);
-EXPORT_SYMBOL_PROTO(chmod);
-EXPORT_SYMBOL_PROTO(fchmod);
-EXPORT_SYMBOL_PROTO(rename);
-EXPORT_SYMBOL_PROTO(__xmknod);
-
-EXPORT_SYMBOL_PROTO(symlink);
-EXPORT_SYMBOL_PROTO(link);
-EXPORT_SYMBOL_PROTO(unlink);
-EXPORT_SYMBOL_PROTO(readlink);
-
-EXPORT_SYMBOL_PROTO(mkdir);
-EXPORT_SYMBOL_PROTO(rmdir);
-EXPORT_SYMBOL_PROTO(opendir);
-EXPORT_SYMBOL_PROTO(readdir);
-EXPORT_SYMBOL_PROTO(closedir);
-EXPORT_SYMBOL_PROTO(seekdir);
-EXPORT_SYMBOL_PROTO(telldir);
-
-EXPORT_SYMBOL_PROTO(ioctl);
-
-EXPORT_SYMBOL_PROTO(pread64);
-EXPORT_SYMBOL_PROTO(pwrite64);
-
-EXPORT_SYMBOL_PROTO(statfs);
-EXPORT_SYMBOL_PROTO(statfs64);
-
-EXPORT_SYMBOL_PROTO(getuid);
-
-EXPORT_SYMBOL_PROTO(fsync);
-EXPORT_SYMBOL_PROTO(fdatasync);
-
-EXPORT_SYMBOL_PROTO(lstat64);
-EXPORT_SYMBOL_PROTO(fstat64);
-EXPORT_SYMBOL_PROTO(mknod);
-
-/* Export symbols used by GCC for the stack protector. */
-extern void __stack_smash_handler(void *) __attribute__((weak));
-EXPORT_SYMBOL(__stack_smash_handler);
-
-extern long __guard __attribute__((weak));
-EXPORT_SYMBOL(__guard);
-
#ifdef _FORTIFY_SOURCE
-extern int __sprintf_chk(char *str, int flag, size_t strlen, const char *format);
+extern int __sprintf_chk(char *str, int flag, size_t len, const char *format);
EXPORT_SYMBOL(__sprintf_chk);
#endif
diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c
index 492ef5e6e166..e3ad71a0d13c 100644
--- a/arch/um/os-Linux/util.c
+++ b/arch/um/os-Linux/util.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
+#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -10,15 +11,16 @@
#include <signal.h>
#include <string.h>
#include <termios.h>
-#include <wait.h>
+#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/utsname.h>
+#include <sys/random.h>
+#include <init.h>
#include <os.h>
void stack_protections(unsigned long address)
{
- if (mprotect((void *) address, UM_THREAD_SIZE,
- PROT_READ | PROT_WRITE | PROT_EXEC) < 0)
+ if (mprotect((void *) address, UM_THREAD_SIZE, PROT_READ | PROT_WRITE) < 0)
panic("protecting stack failed, errno = %d", errno);
}
@@ -49,8 +51,8 @@ void setup_machinename(char *machine_out)
struct utsname host;
uname(&host);
-#ifdef UML_CONFIG_UML_X86
-# ifndef UML_CONFIG_64BIT
+#if IS_ENABLED(CONFIG_UML_X86)
+# if !IS_ENABLED(CONFIG_64BIT)
if (!strcmp(host.machine, "x86_64")) {
strcpy(machine_out, "i686");
return;
@@ -94,6 +96,21 @@ static inline void __attribute__ ((noreturn)) uml_abort(void)
exit(127);
}
+ssize_t os_getrandom(void *buf, size_t len, unsigned int flags)
+{
+ return getrandom(buf, len, flags);
+}
+
+/*
+ * UML helper threads must not handle SIGWINCH/INT/TERM
+ */
+void os_fix_helper_signals(void)
+{
+ signal(SIGWINCH, SIG_IGN);
+ signal(SIGINT, SIG_DFL);
+ signal(SIGTERM, SIG_DFL);
+}
+
void os_dump_core(void)
{
int pid;
@@ -142,3 +159,51 @@ void um_early_printk(const char *s, unsigned int n)
{
printf("%.*s", n, s);
}
+
+static int quiet_info;
+
+static int __init quiet_cmd_param(char *str, int *add)
+{
+ quiet_info = 1;
+ return 0;
+}
+
+__uml_setup("quiet", quiet_cmd_param,
+"quiet\n"
+" Turns off information messages during boot.\n\n");
+
+/*
+ * The os_info/os_warn functions will be called by helper threads. These
+ * have a very limited stack size and using the libc formatting functions
+ * may overflow the stack.
+ * So pull in the kernel vscnprintf and use that instead with a fixed
+ * on-stack buffer.
+ */
+int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
+
+void os_info(const char *fmt, ...)
+{
+ char buf[256];
+ va_list list;
+ int len;
+
+ if (quiet_info)
+ return;
+
+ va_start(list, fmt);
+ len = vscnprintf(buf, sizeof(buf), fmt, list);
+ fwrite(buf, len, 1, stderr);
+ va_end(list);
+}
+
+void os_warn(const char *fmt, ...)
+{
+ char buf[256];
+ va_list list;
+ int len;
+
+ va_start(list, fmt);
+ len = vscnprintf(buf, sizeof(buf), fmt, list);
+ fwrite(buf, len, 1, stderr);
+ va_end(list);
+}