summaryrefslogtreecommitdiff
path: root/arch/um/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/um/kernel')
-rw-r--r--arch/um/kernel/Makefile32
-rw-r--r--arch/um/kernel/asm-offsets.c50
-rw-r--r--arch/um/kernel/config.c.in4
-rw-r--r--arch/um/kernel/dtb.c42
-rw-r--r--arch/um/kernel/dyn.lds.S23
-rw-r--r--arch/um/kernel/early_printk.c5
-rw-r--r--arch/um/kernel/exec.c27
-rw-r--r--arch/um/kernel/exitcode.c23
-rw-r--r--arch/um/kernel/gmon_syms.c9
-rw-r--r--arch/um/kernel/gprof_syms.c2
-rw-r--r--arch/um/kernel/initrd.c59
-rw-r--r--arch/um/kernel/irq.c849
-rw-r--r--arch/um/kernel/kmsg_dump.c65
-rw-r--r--arch/um/kernel/ksyms.c12
-rw-r--r--arch/um/kernel/load_file.c59
-rw-r--r--arch/um/kernel/mem.c301
-rw-r--r--arch/um/kernel/physmem.c182
-rw-r--r--arch/um/kernel/process.c288
-rw-r--r--arch/um/kernel/ptrace.c91
-rw-r--r--arch/um/kernel/reboot.c54
-rw-r--r--arch/um/kernel/sigio.c35
-rw-r--r--arch/um/kernel/signal.c81
-rw-r--r--arch/um/kernel/skas/.gitignore2
-rw-r--r--arch/um/kernel/skas/Makefile47
-rw-r--r--arch/um/kernel/skas/clone.c56
-rw-r--r--arch/um/kernel/skas/mmu.c228
-rw-r--r--arch/um/kernel/skas/process.c88
-rw-r--r--arch/um/kernel/skas/stub.c181
-rw-r--r--arch/um/kernel/skas/stub_exe.c230
-rw-r--r--arch/um/kernel/skas/stub_exe_embed.S11
-rw-r--r--arch/um/kernel/skas/syscall.c72
-rw-r--r--arch/um/kernel/skas/uaccess.c239
-rw-r--r--arch/um/kernel/smp.c340
-rw-r--r--arch/um/kernel/stacktrace.c75
-rw-r--r--arch/um/kernel/syscall.c28
-rw-r--r--arch/um/kernel/sysrq.c70
-rw-r--r--arch/um/kernel/time.c1067
-rw-r--r--arch/um/kernel/tlb.c554
-rw-r--r--arch/um/kernel/trap.c358
-rw-r--r--arch/um/kernel/um_arch.c377
-rw-r--r--arch/um/kernel/um_arch.h16
-rw-r--r--arch/um/kernel/umid.c8
-rw-r--r--arch/um/kernel/uml.lds.S18
-rw-r--r--arch/um/kernel/vmlinux.lds.S2
44 files changed, 3957 insertions, 2403 deletions
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index babe21826e3e..be60bc451b3f 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -1,30 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux,intel}.com)
-# Licensed under the GPL
#
+# Don't instrument UML-specific code; without this, we may crash when
+# accessing the instrumentation buffer for the first time from the
+# kernel.
+KCOV_INSTRUMENT := n
+
CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \
-DELF_ARCH=$(LDS_ELF_ARCH) \
-DELF_FORMAT=$(LDS_ELF_FORMAT) \
$(LDS_EXTRA)
-extra-y := vmlinux.lds
-clean-files :=
+always-$(KBUILD_BUILTIN) := vmlinux.lds
obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \
physmem.o process.o ptrace.o reboot.o sigio.o \
- signal.o smp.o syscall.o sysrq.o time.o tlb.o trap.o \
- um_arch.o umid.o skas/
+ signal.o sysrq.o time.o tlb.o trap.o \
+ um_arch.o umid.o kmsg_dump.o capflags.o skas/
+obj-y += load_file.o
obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o
obj-$(CONFIG_GPROF) += gprof_syms.o
-obj-$(CONFIG_GCOV) += gmon_syms.o
+obj-$(CONFIG_OF) += dtb.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-$(CONFIG_SMP) += smp.o
USER_OBJS := config.o
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
-targets := config.c config.tmp
+targets := config.c config.tmp capflags.c
# Be careful with the below Sed code - sed is pitfall-rich!
# We use sed to lower build requirements, for "embedded" builders for instance.
@@ -39,6 +46,15 @@ quiet_cmd_quote1 = QUOTE $@
$(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE
$(call if_changed,quote2)
+quiet_cmd_mkcapflags = MKCAP $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^
+
+cpufeature = $(src)/../../x86/include/asm/cpufeatures.h
+vmxfeature = $(src)/../../x86/include/asm/vmxfeatures.h
+
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/../../x86/kernel/cpu/mkcapflags.sh FORCE
+ $(call if_changed,mkcapflags)
+
quiet_cmd_quote2 = QUOTE $@
cmd_quote2 = sed -e '/CONFIG/{' \
-e 's/"CONFIG"//' \
diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c
index 1fb12235ab9c..d620b6f6de9b 100644
--- a/arch/um/kernel/asm-offsets.c
+++ b/arch/um/kernel/asm-offsets.c
@@ -1 +1,49 @@
-#include <sysdep/kernel-offsets.h>
+/* SPDX-License-Identifier: GPL-2.0 */
+#define COMPILE_OFFSETS
+#include <linux/stddef.h>
+#include <linux/sched.h>
+#include <linux/elf.h>
+#include <linux/crypto.h>
+#include <linux/kbuild.h>
+#include <linux/audit.h>
+#include <linux/fs.h>
+#include <asm/mman.h>
+#include <asm/seccomp.h>
+#include <asm/extable.h>
+
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
+void foo(void)
+{
+ DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE);
+
+ DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE);
+ DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK);
+ DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT);
+
+ DEFINE(UM_GFP_KERNEL, GFP_KERNEL);
+ DEFINE(UM_GFP_ATOMIC, GFP_ATOMIC);
+
+ DEFINE(UM_THREAD_SIZE, THREAD_SIZE);
+
+ DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
+ DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC);
+
+ DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES);
+
+ DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE);
+
+ DEFINE(HOSTFS_ATTR_MODE, ATTR_MODE);
+ DEFINE(HOSTFS_ATTR_UID, ATTR_UID);
+ DEFINE(HOSTFS_ATTR_GID, ATTR_GID);
+ DEFINE(HOSTFS_ATTR_SIZE, ATTR_SIZE);
+ DEFINE(HOSTFS_ATTR_ATIME, ATTR_ATIME);
+ DEFINE(HOSTFS_ATTR_MTIME, ATTR_MTIME);
+ DEFINE(HOSTFS_ATTR_CTIME, ATTR_CTIME);
+ DEFINE(HOSTFS_ATTR_ATIME_SET, ATTR_ATIME_SET);
+ DEFINE(HOSTFS_ATTR_MTIME_SET, ATTR_MTIME_SET);
+
+ DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr));
+ DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry));
+}
diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in
index 972bf1659564..3ece3c3b31cc 100644
--- a/arch/um/kernel/config.c.in
+++ b/arch/um/kernel/config.c.in
@@ -1,6 +1,6 @@
-/*
+// SPDX-License-Identifier: GPL-2.0
+/*
* Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
- * Licensed under the GPL
*/
#include <stdio.h>
diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c
new file mode 100644
index 000000000000..47cd3d869fb2
--- /dev/null
+++ b/arch/um/kernel/dtb.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/init.h>
+#include <linux/of_fdt.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <init.h>
+
+#include "um_arch.h"
+
+static char *dtb __initdata;
+
+void uml_dtb_init(void)
+{
+ long long size;
+ void *area;
+
+ area = uml_load_file(dtb, &size);
+ if (area) {
+ if (!early_init_dt_scan(area, __pa(area))) {
+ pr_err("invalid DTB %s\n", dtb);
+ memblock_free(area, size);
+ return;
+ }
+
+ early_init_fdt_scan_reserved_mem();
+ }
+
+ unflatten_device_tree();
+}
+
+static int __init uml_dtb_setup(char *line, int *add)
+{
+ *add = 0;
+ dtb = line;
+ return 0;
+}
+
+__uml_setup("dtb=", uml_dtb_setup,
+"dtb=<file>\n"
+" Boot the kernel with the devicetree blob from the specified file.\n\n"
+);
diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S
index adde088aeeff..a36b7918a011 100644
--- a/arch/um/kernel/dyn.lds.S
+++ b/arch/um/kernel/dyn.lds.S
@@ -1,4 +1,4 @@
-#include <asm-generic/vmlinux.lds.h>
+#include <asm/vmlinux.lds.h>
#include <asm/page.h>
OUTPUT_FORMAT(ELF_FORMAT)
@@ -6,6 +6,12 @@ OUTPUT_ARCH(ELF_ARCH)
ENTRY(_start)
jiffies = jiffies_64;
+VERSION {
+ {
+ local: *;
+ };
+}
+
SECTIONS
{
PROVIDE (__executable_start = START);
@@ -69,6 +75,8 @@ SECTIONS
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
+ IRQENTRY_TEXT
+ SOFTIRQENTRY_TEXT
*(.fixup)
*(.stub .text.* .gnu.linkonce.t.*)
/* .gnu.warning sections are handled specially by elf32.em. */
@@ -100,12 +108,14 @@ SECTIONS
be empty, which isn't pretty. */
. = ALIGN(32 / 8);
.preinit_array : { *(.preinit_array) }
- .init_array : { *(.init_array) }
+ .init_array : {
+ *(.kasan_init)
+ *(.init_array.*)
+ *(.init_array)
+ }
.fini_array : { *(.fini_array) }
.data : {
INIT_TASK_DATA(KERNEL_STACK_SIZE)
- . = ALIGN(KERNEL_STACK_SIZE);
- *(.data..init_irqstack)
DATA_DATA
*(.data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
@@ -161,8 +171,11 @@ SECTIONS
PROVIDE (end = .);
STABS_DEBUG
-
DWARF_DEBUG
+ ELF_DETAILS
DISCARDS
}
+
+ASSERT(__syscall_stub_end - __syscall_stub_start <= PAGE_SIZE,
+ "STUB code must not be larger than one page");
diff --git a/arch/um/kernel/early_printk.c b/arch/um/kernel/early_printk.c
index 4a0800bc37b2..c350c2331bbe 100644
--- a/arch/um/kernel/early_printk.c
+++ b/arch/um/kernel/early_printk.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2011 Richard Weinberger <richrd@nod.at>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 0d7103c9eff3..13812fa97eee 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -1,50 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/stddef.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/ptrace.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <asm/current.h>
#include <asm/processor.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <as-layout.h>
#include <mem_user.h>
+#include <registers.h>
#include <skas.h>
#include <os.h>
void flush_thread(void)
{
- void *data = NULL;
- int ret;
-
arch_flush_thread(&current->thread.arch);
- ret = unmap(&current->mm->context.id, 0, STUB_START, 0, &data);
- ret = ret || unmap(&current->mm->context.id, STUB_END,
- host_task_size - STUB_END, 1, &data);
- if (ret) {
- printk(KERN_ERR "flush_thread - clearing address space failed, "
- "err = %d\n", ret);
- force_sig(SIGKILL, current);
- }
get_safe_registers(current_pt_regs()->regs.gp,
current_pt_regs()->regs.fp);
-
- __switch_mm(&current->mm->context.id);
}
void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
{
PT_REGS_IP(regs) = eip;
PT_REGS_SP(regs) = esp;
- current->ptrace &= ~PT_DTRACE;
-#ifdef SUBARCH_EXECVE1
- SUBARCH_EXECVE1(regs->regs);
-#endif
+ clear_thread_flag(TIF_SINGLESTEP);
}
EXPORT_SYMBOL(start_thread);
diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c
index 829df49dee99..43edc2aa57e4 100644
--- a/arch/um/kernel/exitcode.c
+++ b/arch/um/kernel/exitcode.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/ctype.h>
@@ -10,7 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* If read and write race, the read will still atomically read a valid
@@ -40,9 +40,11 @@ static ssize_t exitcode_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
char *end, buf[sizeof("nnnnn\0")];
+ size_t size;
int tmp;
- if (copy_from_user(buf, buffer, count))
+ size = min(count, sizeof(buf));
+ if (copy_from_user(buf, buffer, size))
return -EFAULT;
tmp = simple_strtol(buf, &end, 0);
@@ -53,20 +55,19 @@ static ssize_t exitcode_proc_write(struct file *file,
return count;
}
-static const struct file_operations exitcode_proc_fops = {
- .owner = THIS_MODULE,
- .open = exitcode_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = exitcode_proc_write,
+static const struct proc_ops exitcode_proc_ops = {
+ .proc_open = exitcode_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = exitcode_proc_write,
};
static int make_proc_exitcode(void)
{
struct proc_dir_entry *ent;
- ent = proc_create("exitcode", 0600, NULL, &exitcode_proc_fops);
+ ent = proc_create("exitcode", 0600, NULL, &exitcode_proc_ops);
if (ent == NULL) {
printk(KERN_WARNING "make_proc_exitcode : Failed to register "
"/proc/exitcode\n");
diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c
deleted file mode 100644
index 1bf61266da8e..000000000000
--- a/arch/um/kernel/gmon_syms.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/module.h>
-
-extern void __bb_init_func(void *) __attribute__((weak));
-EXPORT_SYMBOL(__bb_init_func);
diff --git a/arch/um/kernel/gprof_syms.c b/arch/um/kernel/gprof_syms.c
index 74ddb44288a3..84d536908775 100644
--- a/arch/um/kernel/gprof_syms.c
+++ b/arch/um/kernel/gprof_syms.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/module.h>
diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c
index 55cead809b18..99dba827461c 100644
--- a/arch/um/kernel/initrd.c
+++ b/arch/um/kernel/initrd.c
@@ -1,46 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/initrd.h>
#include <asm/types.h>
#include <init.h>
#include <os.h>
+#include "um_arch.h"
+
/* Changed by uml_initrd_setup, which is a setup */
static char *initrd __initdata = NULL;
-static int load_initrd(char *filename, void *buf, int size);
-static int __init read_initrd(void)
+int __init read_initrd(void)
{
+ unsigned long long size;
void *area;
- long long size;
- int err;
-
- if (initrd == NULL)
- return 0;
-
- err = os_file_size(initrd, &size);
- if (err)
- return 0;
-
- /*
- * This is necessary because alloc_bootmem craps out if you
- * ask for no memory.
- */
- if (size == 0) {
- printk(KERN_ERR "\"%s\" is a zero-size initrd\n", initrd);
- return 0;
- }
- area = alloc_bootmem(size);
- if (area == NULL)
+ if (!initrd)
return 0;
- if (load_initrd(initrd, area, size) == -1)
+ area = uml_load_file(initrd, &size);
+ if (!area)
return 0;
initrd_start = (unsigned long) area;
@@ -48,10 +32,9 @@ static int __init read_initrd(void)
return 0;
}
-__uml_postsetup(read_initrd);
-
static int __init uml_initrd_setup(char *line, int *add)
{
+ *add = 0;
initrd = line;
return 0;
}
@@ -61,25 +44,3 @@ __uml_setup("initrd=", uml_initrd_setup,
" This is used to boot UML from an initrd image. The argument is the\n"
" name of the file containing the image.\n\n"
);
-
-static int load_initrd(char *filename, void *buf, int size)
-{
- int fd, n;
-
- fd = os_open_file(filename, of_read(OPENFLAGS()), 0);
- if (fd < 0) {
- printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename,
- -fd);
- return -1;
- }
- n = os_read_file(fd, buf, size);
- if (n != size) {
- printk(KERN_ERR "Read of %d bytes from '%s' failed, "
- "err = %d\n", size,
- filename, -n);
- return -1;
- }
-
- os_close_file(fd);
- return 0;
-}
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 36e12f0cefd5..f4b13f15a9c1 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -1,6 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2017 - Cambridge Greys Ltd
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
* Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
* Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
*/
@@ -16,245 +18,424 @@
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
+#include <irq_user.h>
+#include <irq_kern.h>
+#include <linux/time-internal.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+
+#define irq_stats(x) (&per_cpu(irq_stat, x))
+
+/* When epoll triggers we do not know why it did so
+ * we can also have different IRQs for read and write.
+ * This is why we keep a small irq_reg array for each fd -
+ * one entry per IRQ type
+ */
+struct irq_reg {
+ void *id;
+ int irq;
+ /* it's cheaper to store this than to query it */
+ int events;
+ bool active;
+ bool pending;
+ bool wakeup;
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ bool pending_event;
+ void (*timetravel_handler)(int, int, void *,
+ struct time_travel_event *);
+ struct time_travel_event event;
+#endif
+};
+
+struct irq_entry {
+ struct list_head list;
+ int fd;
+ struct irq_reg reg[NUM_IRQ_TYPES];
+ bool suspended;
+ bool sigio_workaround;
+};
+
+static DEFINE_RAW_SPINLOCK(irq_lock);
+static LIST_HEAD(active_fds);
+static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ);
+static bool irqs_suspended;
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+static bool irqs_pending;
+#endif
+
+static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs)
+{
/*
- * This list is accessed under irq_lock, except in sigio_handler,
- * where it is safe from being modified. IRQ handlers won't change it -
- * if an IRQ source has vanished, it will be freed by free_irqs just
- * before returning from sigio_handler. That will process a separate
- * list of irqs to free, with its own locking, coming back here to
- * remove list elements, taking the irq_lock to do so.
+ * irq->active guards against reentry
+ * irq->pending accumulates pending requests
+ * if pending is raised the irq_handler is re-run
+ * until pending is cleared
*/
-static struct irq_fd *active_fds = NULL;
-static struct irq_fd **last_irq_ptr = &active_fds;
+ if (irq->active) {
+ irq->active = false;
-extern void free_irqs(void);
+ do {
+ irq->pending = false;
+ do_IRQ(irq->irq, regs);
+ } while (irq->pending);
+
+ irq->active = true;
+ } else {
+ irq->pending = true;
+ }
+}
-void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+static void irq_event_handler(struct time_travel_event *ev)
{
- struct irq_fd *irq_fd;
- int n;
+ struct irq_reg *reg = container_of(ev, struct irq_reg, event);
- if (smp_sigio_handler())
+ /* do nothing if suspended; just cause a wakeup and mark as pending */
+ if (irqs_suspended) {
+ irqs_pending = true;
+ reg->pending_event = true;
return;
+ }
- while (1) {
- n = os_waiting_for_events(active_fds);
- if (n <= 0) {
- if (n == -EINTR)
- continue;
- else break;
- }
+ generic_handle_irq(reg->irq);
+}
+
+static bool irq_do_timetravel_handler(struct irq_entry *entry,
+ enum um_irq_type t)
+{
+ struct irq_reg *reg = &entry->reg[t];
+
+ if (!reg->timetravel_handler)
+ return false;
+
+ /*
+ * Handle all messages - we might get multiple even while
+ * interrupts are already suspended, due to suspend order
+ * etc. Note that time_travel_add_irq_event() will not add
+ * an event twice, if it's pending already "first wins".
+ */
+ reg->timetravel_handler(reg->irq, entry->fd, reg->id, &reg->event);
- for (irq_fd = active_fds; irq_fd != NULL;
- irq_fd = irq_fd->next) {
- if (irq_fd->current_events != 0) {
- irq_fd->current_events = 0;
- do_IRQ(irq_fd->irq, regs);
+ if (!reg->event.pending)
+ return false;
+
+ return true;
+}
+
+static void irq_do_pending_events(bool timetravel_handlers_only)
+{
+ struct irq_entry *entry;
+
+ if (!irqs_pending || timetravel_handlers_only)
+ return;
+
+ irqs_pending = false;
+
+ list_for_each_entry(entry, &active_fds, list) {
+ enum um_irq_type t;
+
+ for (t = 0; t < NUM_IRQ_TYPES; t++) {
+ struct irq_reg *reg = &entry->reg[t];
+
+ /*
+ * Any timetravel_handler was invoked already, just
+ * directly run the IRQ.
+ */
+ if (reg->pending_event) {
+ irq_enter();
+ generic_handle_irq(reg->irq);
+ irq_exit();
+ reg->pending_event = false;
}
}
}
-
- free_irqs();
+}
+#else
+static bool irq_do_timetravel_handler(struct irq_entry *entry,
+ enum um_irq_type t)
+{
+ return false;
}
-static DEFINE_SPINLOCK(irq_lock);
+static void irq_do_pending_events(bool timetravel_handlers_only)
+{
+}
+#endif
-static int activate_fd(int irq, int fd, int type, void *dev_id)
+static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t,
+ struct uml_pt_regs *regs,
+ bool timetravel_handlers_only)
{
- struct pollfd *tmp_pfd;
- struct irq_fd *new_fd, *irq_fd;
- unsigned long flags;
- int events, err, n;
+ struct irq_reg *reg = &entry->reg[t];
- err = os_set_fd_async(fd);
- if (err < 0)
- goto out;
+ if (!reg->events)
+ return;
- err = -ENOMEM;
- new_fd = kmalloc(sizeof(struct irq_fd), GFP_KERNEL);
- if (new_fd == NULL)
- goto out;
+ if (os_epoll_triggered(idx, reg->events) <= 0)
+ return;
- if (type == IRQ_READ)
- events = UM_POLLIN | UM_POLLPRI;
- else events = UM_POLLOUT;
- *new_fd = ((struct irq_fd) { .next = NULL,
- .id = dev_id,
- .fd = fd,
- .type = type,
- .irq = irq,
- .events = events,
- .current_events = 0 } );
-
- err = -EBUSY;
- spin_lock_irqsave(&irq_lock, flags);
- for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
- if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
- printk(KERN_ERR "Registering fd %d twice\n", fd);
- printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq);
- printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id,
- dev_id);
- goto out_unlock;
- }
+ if (irq_do_timetravel_handler(entry, t))
+ return;
+
+ /*
+ * If we're called to only run time-travel handlers then don't
+ * actually proceed but mark sigio as pending (if applicable).
+ * For suspend/resume, timetravel_handlers_only may be true
+ * despite time-travel not being configured and used.
+ */
+ if (timetravel_handlers_only) {
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ reg->pending_event = true;
+ irqs_pending = true;
+ mark_sigio_pending();
+#endif
+ return;
}
- if (type == IRQ_WRITE)
- fd = -1;
+ irq_io_loop(reg, regs);
+}
- tmp_pfd = NULL;
- n = 0;
+static void _sigio_handler(struct uml_pt_regs *regs,
+ bool timetravel_handlers_only)
+{
+ struct irq_entry *irq_entry;
+ int n, i;
- while (1) {
- n = os_create_pollfd(fd, events, tmp_pfd, n);
- if (n == 0)
- break;
-
- /*
- * n > 0
- * It means we couldn't put new pollfd to current pollfds
- * and tmp_fds is NULL or too small for new pollfds array.
- * Needed size is equal to n as minimum.
- *
- * Here we have to drop the lock in order to call
- * kmalloc, which might sleep.
- * If something else came in and changed the pollfds array
- * so we will not be able to put new pollfd struct to pollfds
- * then we free the buffer tmp_fds and try again.
- */
- spin_unlock_irqrestore(&irq_lock, flags);
- kfree(tmp_pfd);
+ if (timetravel_handlers_only && !um_irq_timetravel_handler_used())
+ return;
- tmp_pfd = kmalloc(n, GFP_KERNEL);
- if (tmp_pfd == NULL)
- goto out_kfree;
+ /* Flush out pending events that were ignored due to time-travel. */
+ if (!irqs_suspended)
+ irq_do_pending_events(timetravel_handlers_only);
- spin_lock_irqsave(&irq_lock, flags);
- }
+ while (1) {
+ /* This is now lockless - epoll keeps back-referencesto the irqs
+ * which have trigger it so there is no need to walk the irq
+ * list and lock it every time. We avoid locking by turning off
+ * IO for a specific fd by executing os_del_epoll_fd(fd) before
+ * we do any changes to the actual data structures
+ */
+ n = os_waiting_for_events_epoll();
- *last_irq_ptr = new_fd;
- last_irq_ptr = &new_fd->next;
+ if (n <= 0) {
+ if (n == -EINTR)
+ continue;
+ else
+ break;
+ }
- spin_unlock_irqrestore(&irq_lock, flags);
+ for (i = 0; i < n ; i++) {
+ enum um_irq_type t;
- /*
- * This calls activate_fd, so it has to be outside the critical
- * section.
- */
- maybe_sigio_broken(fd, (type == IRQ_READ));
+ irq_entry = os_epoll_get_data_pointer(i);
- return 0;
+ for (t = 0; t < NUM_IRQ_TYPES; t++)
+ sigio_reg_handler(i, irq_entry, t, regs,
+ timetravel_handlers_only);
+ }
+ }
- out_unlock:
- spin_unlock_irqrestore(&irq_lock, flags);
- out_kfree:
- kfree(new_fd);
- out:
- return err;
+ if (!timetravel_handlers_only)
+ free_irqs();
}
-static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
+void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
+ void *mc)
{
- unsigned long flags;
-
- spin_lock_irqsave(&irq_lock, flags);
- os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr);
- spin_unlock_irqrestore(&irq_lock, flags);
+ preempt_disable();
+ _sigio_handler(regs, irqs_suspended);
+ preempt_enable();
}
-struct irq_and_dev {
- int irq;
- void *dev;
-};
-
-static int same_irq_and_dev(struct irq_fd *irq, void *d)
+static struct irq_entry *get_irq_entry_by_fd(int fd)
{
- struct irq_and_dev *data = d;
+ struct irq_entry *walk;
- return ((irq->irq == data->irq) && (irq->id == data->dev));
+ lockdep_assert_held(&irq_lock);
+
+ list_for_each_entry(walk, &active_fds, list) {
+ if (walk->fd == fd)
+ return walk;
+ }
+
+ return NULL;
}
-static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
+static void remove_irq_entry(struct irq_entry *to_free, bool remove)
{
- struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq,
- .dev = dev });
+ if (!to_free)
+ return;
- free_irq_by_cb(same_irq_and_dev, &data);
+ if (remove)
+ os_del_epoll_fd(to_free->fd);
+ list_del(&to_free->list);
}
-static int same_fd(struct irq_fd *irq, void *fd)
+static bool update_irq_entry(struct irq_entry *entry)
{
- return (irq->fd == *((int *)fd));
+ enum um_irq_type i;
+ int events = 0;
+
+ for (i = 0; i < NUM_IRQ_TYPES; i++)
+ events |= entry->reg[i].events;
+
+ if (events) {
+ /* will modify (instead of add) if needed */
+ os_add_epoll_fd(events, entry->fd, entry);
+ return true;
+ }
+
+ os_del_epoll_fd(entry->fd);
+ return false;
}
-void free_irq_by_fd(int fd)
+static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry)
{
- free_irq_by_cb(same_fd, &fd);
+ if (update_irq_entry(entry))
+ return NULL;
+ remove_irq_entry(entry, false);
+ return entry;
}
-/* Must be called with irq_lock held */
-static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
+static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id,
+ void (*timetravel_handler)(int, int, void *,
+ struct time_travel_event *))
{
- struct irq_fd *irq;
- int i = 0;
- int fdi;
+ struct irq_entry *irq_entry, *to_free = NULL;
+ int err, events = os_event_mask(type);
+ unsigned long flags;
- for (irq = active_fds; irq != NULL; irq = irq->next) {
- if ((irq->fd == fd) && (irq->irq == irqnum))
- break;
- i++;
- }
- if (irq == NULL) {
- printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
- fd);
+ err = os_set_fd_async(fd);
+ if (err < 0)
goto out;
+
+ raw_spin_lock_irqsave(&irq_lock, flags);
+ irq_entry = get_irq_entry_by_fd(fd);
+ if (irq_entry) {
+already:
+ /* cannot register the same FD twice with the same type */
+ if (WARN_ON(irq_entry->reg[type].events)) {
+ err = -EALREADY;
+ goto out_unlock;
+ }
+
+ /* temporarily disable to avoid IRQ-side locking */
+ os_del_epoll_fd(fd);
+ } else {
+ struct irq_entry *new;
+
+ /* don't restore interrupts */
+ raw_spin_unlock(&irq_lock);
+ new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC);
+ if (!new) {
+ local_irq_restore(flags);
+ return -ENOMEM;
+ }
+ raw_spin_lock(&irq_lock);
+ irq_entry = get_irq_entry_by_fd(fd);
+ if (irq_entry) {
+ to_free = new;
+ goto already;
+ }
+ irq_entry = new;
+ irq_entry->fd = fd;
+ list_add_tail(&irq_entry->list, &active_fds);
+ maybe_sigio_broken(fd);
}
- fdi = os_get_pollfd(i);
- if ((fdi != -1) && (fdi != fd)) {
- printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds "
- "and pollfds, fd %d vs %d, need %d\n", irq->fd,
- fdi, fd);
- irq = NULL;
- goto out;
+
+ irq_entry->reg[type].id = dev_id;
+ irq_entry->reg[type].irq = irq;
+ irq_entry->reg[type].active = true;
+ irq_entry->reg[type].events = events;
+
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ if (um_irq_timetravel_handler_used()) {
+ irq_entry->reg[type].timetravel_handler = timetravel_handler;
+ irq_entry->reg[type].event.fn = irq_event_handler;
}
- *index_out = i;
- out:
- return irq;
+#endif
+
+ WARN_ON(!update_irq_entry(irq_entry));
+ err = 0;
+out_unlock:
+ raw_spin_unlock_irqrestore(&irq_lock, flags);
+out:
+ kfree(to_free);
+ return err;
+}
+
+/*
+ * Remove the entry or entries for a specific FD, if you
+ * don't want to remove all the possible entries then use
+ * um_free_irq() or deactivate_fd() instead.
+ */
+void free_irq_by_fd(int fd)
+{
+ struct irq_entry *to_free;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&irq_lock, flags);
+ to_free = get_irq_entry_by_fd(fd);
+ remove_irq_entry(to_free, true);
+ raw_spin_unlock_irqrestore(&irq_lock, flags);
+ kfree(to_free);
}
+EXPORT_SYMBOL(free_irq_by_fd);
-void reactivate_fd(int fd, int irqnum)
+static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
{
- struct irq_fd *irq;
+ struct irq_entry *entry, *to_free = NULL;
unsigned long flags;
- int i;
- spin_lock_irqsave(&irq_lock, flags);
- irq = find_irq_by_fd(fd, irqnum, &i);
- if (irq == NULL) {
- spin_unlock_irqrestore(&irq_lock, flags);
- return;
- }
- os_set_pollfd(i, irq->fd);
- spin_unlock_irqrestore(&irq_lock, flags);
+ raw_spin_lock_irqsave(&irq_lock, flags);
+ list_for_each_entry(entry, &active_fds, list) {
+ enum um_irq_type i;
- add_sigio_fd(fd);
+ for (i = 0; i < NUM_IRQ_TYPES; i++) {
+ struct irq_reg *reg = &entry->reg[i];
+
+ if (!reg->events)
+ continue;
+ if (reg->irq != irq)
+ continue;
+ if (reg->id != dev)
+ continue;
+
+ os_del_epoll_fd(entry->fd);
+ reg->events = 0;
+ to_free = update_or_remove_irq_entry(entry);
+ goto out;
+ }
+ }
+out:
+ raw_spin_unlock_irqrestore(&irq_lock, flags);
+ kfree(to_free);
}
void deactivate_fd(int fd, int irqnum)
{
- struct irq_fd *irq;
+ struct irq_entry *entry;
unsigned long flags;
- int i;
+ enum um_irq_type i;
- spin_lock_irqsave(&irq_lock, flags);
- irq = find_irq_by_fd(fd, irqnum, &i);
- if (irq == NULL) {
- spin_unlock_irqrestore(&irq_lock, flags);
- return;
+ os_del_epoll_fd(fd);
+
+ raw_spin_lock_irqsave(&irq_lock, flags);
+ entry = get_irq_entry_by_fd(fd);
+ if (!entry)
+ goto out;
+
+ for (i = 0; i < NUM_IRQ_TYPES; i++) {
+ if (!entry->reg[i].events)
+ continue;
+ if (entry->reg[i].irq == irqnum)
+ entry->reg[i].events = 0;
}
- os_set_pollfd(i, -1);
- spin_unlock_irqrestore(&irq_lock, flags);
+ entry = update_or_remove_irq_entry(entry);
+out:
+ raw_spin_unlock_irqrestore(&irq_lock, flags);
+ kfree(entry);
ignore_sigio_fd(fd);
}
@@ -268,17 +449,18 @@ EXPORT_SYMBOL(deactivate_fd);
*/
int deactivate_all_fds(void)
{
- struct irq_fd *irq;
- int err;
+ struct irq_entry *entry;
- for (irq = active_fds; irq != NULL; irq = irq->next) {
- err = os_clear_fd_async(irq->fd);
- if (err)
- return err;
- }
- /* If there is a signal already queued, after unblocking ignore it */
+ /* Stop IO. The IRQ loop has no lock so this is our
+ * only way of making sure we are safe to dispose
+ * of all IRQ handlers
+ */
os_set_ioignore();
+ /* we can no longer call kfree() here so just deactivate */
+ list_for_each_entry(entry, &active_fds, list)
+ os_del_epoll_fd(entry->fd);
+ os_close_epoll_fd();
return 0;
}
@@ -297,31 +479,180 @@ unsigned int do_IRQ(int irq, struct uml_pt_regs *regs)
return 1;
}
-void um_free_irq(unsigned int irq, void *dev)
+void um_free_irq(int irq, void *dev)
{
+ if (WARN(irq < 0 || irq > UM_LAST_SIGNAL_IRQ,
+ "freeing invalid irq %d", irq))
+ return;
+
free_irq_by_irq_and_dev(irq, dev);
free_irq(irq, dev);
+ clear_bit(irq, irqs_allocated);
}
EXPORT_SYMBOL(um_free_irq);
-int um_request_irq(unsigned int irq, int fd, int type,
- irq_handler_t handler,
- unsigned long irqflags, const char * devname,
- void *dev_id)
+static int
+_um_request_irq(int irq, int fd, enum um_irq_type type,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id,
+ void (*timetravel_handler)(int, int, void *,
+ struct time_travel_event *))
{
int err;
+ if (irq == UM_IRQ_ALLOC) {
+ int i;
+
+ for (i = UM_FIRST_DYN_IRQ; i < NR_IRQS; i++) {
+ if (!test_and_set_bit(i, irqs_allocated)) {
+ irq = i;
+ break;
+ }
+ }
+ }
+
+ if (irq < 0)
+ return -ENOSPC;
+
if (fd != -1) {
- err = activate_fd(irq, fd, type, dev_id);
+ err = activate_fd(irq, fd, type, dev_id, timetravel_handler);
if (err)
- return err;
+ goto error;
}
- return request_irq(irq, handler, irqflags, devname, dev_id);
+ err = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (err < 0)
+ goto error;
+
+ return irq;
+error:
+ clear_bit(irq, irqs_allocated);
+ return err;
}
+int um_request_irq(int irq, int fd, enum um_irq_type type,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ return _um_request_irq(irq, fd, type, handler, irqflags,
+ devname, dev_id, NULL);
+}
EXPORT_SYMBOL(um_request_irq);
-EXPORT_SYMBOL(reactivate_fd);
+
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+int um_request_irq_tt(int irq, int fd, enum um_irq_type type,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id,
+ void (*timetravel_handler)(int, int, void *,
+ struct time_travel_event *))
+{
+ return _um_request_irq(irq, fd, type, handler, irqflags,
+ devname, dev_id, timetravel_handler);
+}
+EXPORT_SYMBOL(um_request_irq_tt);
+
+void sigio_run_timetravel_handlers(void)
+{
+ _sigio_handler(NULL, true);
+}
+#endif
+
+#ifdef CONFIG_PM_SLEEP
+void um_irqs_suspend(void)
+{
+ struct irq_entry *entry;
+ unsigned long flags;
+
+ irqs_suspended = true;
+
+ raw_spin_lock_irqsave(&irq_lock, flags);
+ list_for_each_entry(entry, &active_fds, list) {
+ enum um_irq_type t;
+ bool clear = true;
+
+ for (t = 0; t < NUM_IRQ_TYPES; t++) {
+ if (!entry->reg[t].events)
+ continue;
+
+ /*
+ * For the SIGIO_WRITE_IRQ, which is used to handle the
+ * SIGIO workaround thread, we need special handling:
+ * enable wake for it itself, but below we tell it about
+ * any FDs that should be suspended.
+ */
+ if (entry->reg[t].wakeup ||
+ entry->reg[t].irq == SIGIO_WRITE_IRQ
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ || entry->reg[t].timetravel_handler
+#endif
+ ) {
+ clear = false;
+ break;
+ }
+ }
+
+ if (clear) {
+ entry->suspended = true;
+ os_clear_fd_async(entry->fd);
+ entry->sigio_workaround =
+ !__ignore_sigio_fd(entry->fd);
+ }
+ }
+ raw_spin_unlock_irqrestore(&irq_lock, flags);
+}
+
+void um_irqs_resume(void)
+{
+ struct irq_entry *entry;
+ unsigned long flags;
+
+
+ raw_spin_lock_irqsave(&irq_lock, flags);
+ list_for_each_entry(entry, &active_fds, list) {
+ if (entry->suspended) {
+ int err = os_set_fd_async(entry->fd);
+
+ WARN(err < 0, "os_set_fd_async returned %d\n", err);
+ entry->suspended = false;
+
+ if (entry->sigio_workaround) {
+ err = __add_sigio_fd(entry->fd);
+ WARN(err < 0, "add_sigio_returned %d\n", err);
+ }
+ }
+ }
+ raw_spin_unlock_irqrestore(&irq_lock, flags);
+
+ irqs_suspended = false;
+ send_sigio_to_self();
+}
+
+static int normal_irq_set_wake(struct irq_data *d, unsigned int on)
+{
+ struct irq_entry *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&irq_lock, flags);
+ list_for_each_entry(entry, &active_fds, list) {
+ enum um_irq_type t;
+
+ for (t = 0; t < NUM_IRQ_TYPES; t++) {
+ if (!entry->reg[t].events)
+ continue;
+
+ if (entry->reg[t].irq != d->irq)
+ continue;
+ entry->reg[t].wakeup = on;
+ goto unlock;
+ }
+ }
+unlock:
+ raw_spin_unlock_irqrestore(&irq_lock, flags);
+ return 0;
+}
+#else
+#define normal_irq_set_wake NULL
+#endif
/*
* irq_chip must define at least enable/disable and ack when
@@ -331,139 +662,67 @@ static void dummy(struct irq_data *d)
{
}
-/* This is used for everything else than the timer. */
+/* This is used for everything other than the timer. */
static struct irq_chip normal_irq_type = {
.name = "SIGIO",
.irq_disable = dummy,
.irq_enable = dummy,
.irq_ack = dummy,
+ .irq_mask = dummy,
+ .irq_unmask = dummy,
+ .irq_set_wake = normal_irq_set_wake,
};
-static struct irq_chip SIGVTALRM_irq_type = {
- .name = "SIGVTALRM",
+static struct irq_chip alarm_irq_type = {
+ .name = "SIGALRM",
.irq_disable = dummy,
.irq_enable = dummy,
.irq_ack = dummy,
+ .irq_mask = dummy,
+ .irq_unmask = dummy,
};
void __init init_IRQ(void)
{
int i;
- irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq);
+ irq_set_chip_and_handler(TIMER_IRQ, &alarm_irq_type, handle_percpu_irq);
- for (i = 1; i < NR_IRQS; i++)
+ for (i = 1; i < UM_LAST_SIGNAL_IRQ; i++)
irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
+ /* Initialize EPOLL Loop */
+ os_setup_epoll();
}
-/*
- * IRQ stack entry and exit:
- *
- * Unlike i386, UML doesn't receive IRQs on the normal kernel stack
- * and switch over to the IRQ stack after some preparation. We use
- * sigaltstack to receive signals on a separate stack from the start.
- * These two functions make sure the rest of the kernel won't be too
- * upset by being on a different stack. The IRQ stack has a
- * thread_info structure at the bottom so that current et al continue
- * to work.
- *
- * to_irq_stack copies the current task's thread_info to the IRQ stack
- * thread_info and sets the tasks's stack to point to the IRQ stack.
- *
- * from_irq_stack copies the thread_info struct back (flags may have
- * been modified) and resets the task's stack pointer.
- *
- * Tricky bits -
- *
- * What happens when two signals race each other? UML doesn't block
- * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal
- * could arrive while a previous one is still setting up the
- * thread_info.
- *
- * There are three cases -
- * The first interrupt on the stack - sets up the thread_info and
- * handles the interrupt
- * A nested interrupt interrupting the copying of the thread_info -
- * can't handle the interrupt, as the stack is in an unknown state
- * A nested interrupt not interrupting the copying of the
- * thread_info - doesn't do any setup, just handles the interrupt
- *
- * The first job is to figure out whether we interrupted stack setup.
- * This is done by xchging the signal mask with thread_info->pending.
- * If the value that comes back is zero, then there is no setup in
- * progress, and the interrupt can be handled. If the value is
- * non-zero, then there is stack setup in progress. In order to have
- * the interrupt handled, we leave our signal in the mask, and it will
- * be handled by the upper handler after it has set up the stack.
- *
- * Next is to figure out whether we are the outer handler or a nested
- * one. As part of setting up the stack, thread_info->real_thread is
- * set to non-NULL (and is reset to NULL on exit). This is the
- * nesting indicator. If it is non-NULL, then the stack is already
- * set up and the handler can run.
- */
-
-static unsigned long pending_mask;
-
-unsigned long to_irq_stack(unsigned long *mask_out)
+int __init arch_probe_nr_irqs(void)
{
- struct thread_info *ti;
- unsigned long mask, old;
- int nested;
-
- mask = xchg(&pending_mask, *mask_out);
- if (mask != 0) {
- /*
- * If any interrupts come in at this point, we want to
- * make sure that their bits aren't lost by our
- * putting our bit in. So, this loop accumulates bits
- * until xchg returns the same value that we put in.
- * When that happens, there were no new interrupts,
- * and pending_mask contains a bit for each interrupt
- * that came in.
- */
- old = *mask_out;
- do {
- old |= mask;
- mask = xchg(&pending_mask, old);
- } while (mask != old);
- return 1;
- }
-
- ti = current_thread_info();
- nested = (ti->real_thread != NULL);
- if (!nested) {
- struct task_struct *task;
- struct thread_info *tti;
-
- task = cpu_tasks[ti->cpu].task;
- tti = task_thread_info(task);
-
- *ti = *tti;
- ti->real_thread = tti;
- task->stack = ti;
- }
-
- mask = xchg(&pending_mask, 0);
- *mask_out |= mask | nested;
- return 0;
+ return NR_IRQS;
}
-unsigned long from_irq_stack(int nested)
+void sigchld_handler(int sig, struct siginfo *unused_si,
+ struct uml_pt_regs *regs, void *mc)
{
- struct thread_info *ti, *to;
- unsigned long mask;
+ do_IRQ(SIGCHLD_IRQ, regs);
+}
- ti = current_thread_info();
+/*
+ * /proc/interrupts printing for arch specific interrupts
+ */
+int arch_show_interrupts(struct seq_file *p, int prec)
+{
+#if IS_ENABLED(CONFIG_SMP)
+ int cpu;
- pending_mask = 1;
+ seq_printf(p, "%*s: ", prec, "RES");
+ for_each_online_cpu(cpu)
+ seq_printf(p, "%10u ", irq_stats(cpu)->irq_resched_count);
+ seq_puts(p, " Rescheduling interrupts\n");
- to = ti->real_thread;
- current->stack = to;
- ti->real_thread = NULL;
- *to = *ti;
+ seq_printf(p, "%*s: ", prec, "CAL");
+ for_each_online_cpu(cpu)
+ seq_printf(p, "%10u ", irq_stats(cpu)->irq_call_count);
+ seq_puts(p, " Function call interrupts\n");
+#endif
- mask = xchg(&pending_mask, 0);
- return mask & ~1;
+ return 0;
}
-
diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c
new file mode 100644
index 000000000000..fc0f543d1d8e
--- /dev/null
+++ b/arch/um/kernel/kmsg_dump.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kmsg_dump.h>
+#include <linux/spinlock.h>
+#include <linux/console.h>
+#include <linux/string.h>
+#include <shared/init.h>
+#include <shared/kern.h>
+#include <os.h>
+
+static void kmsg_dumper_stdout(struct kmsg_dumper *dumper,
+ struct kmsg_dump_detail *detail)
+{
+ static struct kmsg_dump_iter iter;
+ static DEFINE_SPINLOCK(lock);
+ static char line[1024];
+ struct console *con;
+ unsigned long flags;
+ size_t len = 0;
+ int cookie;
+
+ /*
+ * If no consoles are available to output crash information, dump
+ * the kmsg buffer to stdout.
+ */
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ /*
+ * The ttynull console and disabled consoles are ignored
+ * since they cannot output. All other consoles are
+ * expected to output the crash information.
+ */
+ if (strcmp(con->name, "ttynull") != 0 &&
+ console_is_usable(con, console_srcu_read_flags(con), true)) {
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+ if (con)
+ return;
+
+ if (!spin_trylock_irqsave(&lock, flags))
+ return;
+
+ kmsg_dump_rewind(&iter);
+
+ printf("kmsg_dump:\n");
+ while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len)) {
+ line[len] = '\0';
+ printf("%s", line);
+ }
+
+ spin_unlock_irqrestore(&lock, flags);
+}
+
+static struct kmsg_dumper kmsg_dumper = {
+ .dump = kmsg_dumper_stdout
+};
+
+static int __init kmsg_dumper_stdout_init(void)
+{
+ return kmsg_dump_register(&kmsg_dumper);
+}
+
+__uml_postsetup(kmsg_dumper_stdout_init);
diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c
index 543c04756939..96314c31e61c 100644
--- a/arch/um/kernel/ksyms.c
+++ b/arch/um/kernel/ksyms.c
@@ -1,13 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/module.h>
#include <os.h>
-EXPORT_SYMBOL(set_signals);
-EXPORT_SYMBOL(get_signals);
+EXPORT_SYMBOL(um_get_signals);
+EXPORT_SYMBOL(um_set_signals);
EXPORT_SYMBOL(os_stat_fd);
EXPORT_SYMBOL(os_stat_file);
@@ -33,12 +33,16 @@ EXPORT_SYMBOL(os_shutdown_socket);
EXPORT_SYMBOL(os_create_unix_socket);
EXPORT_SYMBOL(os_connect_socket);
EXPORT_SYMBOL(os_accept_connection);
-EXPORT_SYMBOL(os_rcv_fd);
+EXPORT_SYMBOL(os_rcv_fd_msg);
EXPORT_SYMBOL(run_helper);
EXPORT_SYMBOL(os_major);
EXPORT_SYMBOL(os_minor);
EXPORT_SYMBOL(os_makedev);
+EXPORT_SYMBOL(os_eventfd);
+EXPORT_SYMBOL(os_sendmsg_fds);
EXPORT_SYMBOL(add_sigio_fd);
EXPORT_SYMBOL(ignore_sigio_fd);
EXPORT_SYMBOL(sigio_broken);
+
+EXPORT_SYMBOL(syscall);
diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c
new file mode 100644
index 000000000000..cb9d178ab7d8
--- /dev/null
+++ b/arch/um/kernel/load_file.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ */
+#include <linux/memblock.h>
+#include <os.h>
+
+#include "um_arch.h"
+
+static int __init __uml_load_file(const char *filename, void *buf, int size)
+{
+ int fd, n;
+
+ fd = os_open_file(filename, of_read(OPENFLAGS()), 0);
+ if (fd < 0) {
+ printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename,
+ -fd);
+ return -1;
+ }
+ n = os_read_file(fd, buf, size);
+ if (n != size) {
+ printk(KERN_ERR "Read of %d bytes from '%s' failed, "
+ "err = %d\n", size,
+ filename, -n);
+ return -1;
+ }
+
+ os_close_file(fd);
+ return 0;
+}
+
+void *uml_load_file(const char *filename, unsigned long long *size)
+{
+ void *area;
+ int err;
+
+ *size = 0;
+
+ if (!filename)
+ return NULL;
+
+ err = os_file_size(filename, size);
+ if (err)
+ return NULL;
+
+ if (*size == 0) {
+ printk(KERN_ERR "\"%s\" is empty\n", filename);
+ return NULL;
+ }
+
+ area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES);
+
+ if (__uml_load_file(filename, area, *size)) {
+ memblock_free(area, *size);
+ return NULL;
+ }
+
+ return area;
+}
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 7ddb64baf327..39c4a7e21c6f 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -1,29 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/stddef.h>
#include <linux/module.h>
-#include <linux/bootmem.h>
-#include <linux/highmem.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
-#include <asm/fixmap.h>
+#include <linux/init.h>
+#include <asm/sections.h>
#include <asm/page.h>
+#include <asm/pgalloc.h>
#include <as-layout.h>
#include <init.h>
#include <kern.h>
#include <kern_util.h>
#include <mem_user.h>
#include <os.h>
+#include <um_malloc.h>
+#include <linux/sched/task.h>
+#include <linux/kasan.h>
+
+#ifdef CONFIG_KASAN
+void __init kasan_init(void)
+{
+ /*
+ * kasan_map_memory will map all of the required address space and
+ * the host machine will allocate physical memory as necessary.
+ */
+ kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE);
+ init_task.kasan_depth = 0;
+ /*
+ * Since kasan_init() is called before main(),
+ * KASAN is initialized but the enablement is deferred after
+ * jump_label_init(). See arch_mm_preinit().
+ */
+}
+
+static void (*kasan_init_ptr)(void)
+__section(".kasan_init") __used
+= kasan_init;
+#endif
/* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */
unsigned long *empty_zero_page = NULL;
EXPORT_SYMBOL(empty_zero_page);
-/* allocated in paging_init and unchanged thereafter */
-static unsigned long *empty_bad_page = NULL;
/*
* Initialized during boot, and readonly for initializing page tables
@@ -32,202 +55,47 @@ static unsigned long *empty_bad_page = NULL;
pgd_t swapper_pg_dir[PTRS_PER_PGD];
/* Initialized at boot time, and readonly after that */
-unsigned long long highmem;
int kmalloc_ok = 0;
/* Used during early boot */
static unsigned long brk_end;
-#ifdef CONFIG_HIGHMEM
-static void setup_highmem(unsigned long highmem_start,
- unsigned long highmem_len)
+void __init arch_mm_preinit(void)
{
- unsigned long highmem_pfn;
- int i;
+ /* Safe to call after jump_label_init(). Enables KASAN. */
+ kasan_init_generic();
- highmem_pfn = __pa(highmem_start) >> PAGE_SHIFT;
- for (i = 0; i < highmem_len >> PAGE_SHIFT; i++)
- free_highmem_page(&mem_map[highmem_pfn + i]);
-}
-#endif
-
-void __init mem_init(void)
-{
/* clear the zero-page */
memset(empty_zero_page, 0, PAGE_SIZE);
/* Map in the area just after the brk now that kmalloc is about
* to be turned on.
*/
- brk_end = (unsigned long) UML_ROUND_UP(sbrk(0));
+ brk_end = PAGE_ALIGN((unsigned long) sbrk(0));
map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
- free_bootmem(__pa(brk_end), uml_reserved - brk_end);
+ memblock_free((void *)brk_end, uml_reserved - brk_end);
uml_reserved = brk_end;
-
- /* this will put all low memory onto the freelists */
- free_all_bootmem();
- max_low_pfn = totalram_pages;
-#ifdef CONFIG_HIGHMEM
- setup_highmem(end_iomem, highmem);
-#endif
- max_pfn = totalram_pages;
- mem_init_print_info(NULL);
- kmalloc_ok = 1;
-}
-
-/*
- * Create a page table and place a pointer to it in a middle page
- * directory entry.
- */
-static void __init one_page_table_init(pmd_t *pmd)
-{
- if (pmd_none(*pmd)) {
- pte_t *pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- set_pmd(pmd, __pmd(_KERNPG_TABLE +
- (unsigned long) __pa(pte)));
- if (pte != pte_offset_kernel(pmd, 0))
- BUG();
- }
-}
-
-static void __init one_md_table_init(pud_t *pud)
-{
-#ifdef CONFIG_3_LEVEL_PGTABLES
- pmd_t *pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- set_pud(pud, __pud(_KERNPG_TABLE + (unsigned long) __pa(pmd_table)));
- if (pmd_table != pmd_offset(pud, 0))
- BUG();
-#endif
-}
-
-static void __init fixrange_init(unsigned long start, unsigned long end,
- pgd_t *pgd_base)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- int i, j;
- unsigned long vaddr;
-
- vaddr = start;
- i = pgd_index(vaddr);
- j = pmd_index(vaddr);
- pgd = pgd_base + i;
-
- for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) {
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud))
- one_md_table_init(pud);
- pmd = pmd_offset(pud, vaddr);
- for (; (j < PTRS_PER_PMD) && (vaddr < end); pmd++, j++) {
- one_page_table_init(pmd);
- vaddr += PMD_SIZE;
- }
- j = 0;
- }
-}
-
-#ifdef CONFIG_HIGHMEM
-pte_t *kmap_pte;
-pgprot_t kmap_prot;
-
-#define kmap_get_fixmap_pte(vaddr) \
- pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\
- (vaddr)), (vaddr))
-
-static void __init kmap_init(void)
-{
- unsigned long kmap_vstart;
-
- /* cache the first kmap pte */
- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
- kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
-
- kmap_prot = PAGE_KERNEL;
-}
-
-static void __init init_highmem(void)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- unsigned long vaddr;
-
- /*
- * Permanent kmaps:
- */
- vaddr = PKMAP_BASE;
- fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir);
-
- pgd = swapper_pg_dir + pgd_index(vaddr);
- pud = pud_offset(pgd, vaddr);
- pmd = pmd_offset(pud, vaddr);
- pte = pte_offset_kernel(pmd, vaddr);
- pkmap_page_table = pte;
-
- kmap_init();
+ min_low_pfn = PFN_UP(__pa(uml_reserved));
+ max_pfn = max_low_pfn;
}
-#endif /* CONFIG_HIGHMEM */
-static void __init fixaddr_user_init( void)
+void __init mem_init(void)
{
-#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA
- long size = FIXADDR_USER_END - FIXADDR_USER_START;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- phys_t p;
- unsigned long v, vaddr = FIXADDR_USER_START;
-
- if (!size)
- return;
-
- fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir);
- v = (unsigned long) alloc_bootmem_low_pages(size);
- memcpy((void *) v , (void *) FIXADDR_USER_START, size);
- p = __pa(v);
- for ( ; size > 0; size -= PAGE_SIZE, vaddr += PAGE_SIZE,
- p += PAGE_SIZE) {
- pgd = swapper_pg_dir + pgd_index(vaddr);
- pud = pud_offset(pgd, vaddr);
- pmd = pmd_offset(pud, vaddr);
- pte = pte_offset_kernel(pmd, vaddr);
- pte_set_val(*pte, p, PAGE_READONLY);
- }
-#endif
+ kmalloc_ok = 1;
}
void __init paging_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES], vaddr;
- int i;
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
- empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
- empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
- for (i = 0; i < ARRAY_SIZE(zones_size); i++)
- zones_size[i] = 0;
+ empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE,
+ PAGE_SIZE);
+ if (!empty_zero_page)
+ panic("%s: Failed to allocate %lu bytes align=%lx\n",
+ __func__, PAGE_SIZE, PAGE_SIZE);
- zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) -
- (uml_physmem >> PAGE_SHIFT);
-#ifdef CONFIG_HIGHMEM
- zones_size[ZONE_HIGHMEM] = highmem >> PAGE_SHIFT;
-#endif
- free_area_init(zones_size);
-
- /*
- * Fixed mappings, only the page table structure has to be
- * created - mappings will be set by set_fixmap():
- */
- vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
- fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir);
-
- fixaddr_user_init();
-
-#ifdef CONFIG_HIGHMEM
- init_highmem();
-#endif
+ max_zone_pfn[ZONE_NORMAL] = high_physmem >> PAGE_SHIFT;
+ free_area_init(max_zone_pfn);
}
/*
@@ -239,64 +107,49 @@ void free_initmem(void)
{
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
/* Allocate and free page tables. */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
+ pgd_t *pgd = __pgd_alloc(mm, 0);
- if (pgd) {
- memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
+ if (pgd)
memcpy(pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
- }
- return pgd;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- free_page((unsigned long) pgd);
-}
-
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- pte_t *pte;
-
- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
- return pte;
-}
-
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- struct page *pte;
-
- pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
- if (pte)
- pgtable_page_ctor(pte);
- return pte;
-}
-
-#ifdef CONFIG_3_LEVEL_PGTABLES
-pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL);
- if (pmd)
- memset(pmd, 0, PAGE_SIZE);
-
- return pmd;
+ return pgd;
}
-#endif
void *uml_kmalloc(int size, int flags)
{
return kmalloc(size, flags);
}
+
+static const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_READONLY,
+ [VM_WRITE] = PAGE_COPY,
+ [VM_WRITE | VM_READ] = PAGE_COPY,
+ [VM_EXEC] = PAGE_READONLY,
+ [VM_EXEC | VM_READ] = PAGE_READONLY,
+ [VM_EXEC | VM_WRITE] = PAGE_COPY,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_READONLY,
+ [VM_SHARED | VM_WRITE] = PAGE_SHARED,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED,
+ [VM_SHARED | VM_EXEC] = PAGE_READONLY,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED
+};
+DECLARE_VM_GET_PAGE_PROT
+
+void mark_rodata_ro(void)
+{
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
+ unsigned long rodata_end = PFN_ALIGN(__end_rodata);
+
+ os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0);
+}
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index f116db15d402..ae6ca373c261 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -1,16 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/module.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
#include <asm/page.h>
+#include <asm/sections.h>
#include <as-layout.h>
#include <init.h>
#include <kern.h>
+#include <kern_util.h>
#include <mem_user.h>
#include <os.h>
@@ -20,43 +22,6 @@ static int physmem_fd = -1;
unsigned long high_physmem;
EXPORT_SYMBOL(high_physmem);
-extern unsigned long long physmem_size;
-
-int __init init_maps(unsigned long physmem, unsigned long iomem,
- unsigned long highmem)
-{
- struct page *p, *map;
- unsigned long phys_len, phys_pages, highmem_len, highmem_pages;
- unsigned long iomem_len, iomem_pages, total_len, total_pages;
- int i;
-
- phys_pages = physmem >> PAGE_SHIFT;
- phys_len = phys_pages * sizeof(struct page);
-
- iomem_pages = iomem >> PAGE_SHIFT;
- iomem_len = iomem_pages * sizeof(struct page);
-
- highmem_pages = highmem >> PAGE_SHIFT;
- highmem_len = highmem_pages * sizeof(struct page);
-
- total_pages = phys_pages + iomem_pages + highmem_pages;
- total_len = phys_len + iomem_len + highmem_len;
-
- map = alloc_bootmem_low_pages(total_len);
- if (map == NULL)
- return -ENOMEM;
-
- for (i = 0; i < total_pages; i++) {
- p = &map[i];
- memset(p, 0, sizeof(struct page));
- SetPageReserved(p);
- INIT_LIST_HEAD(&p->lru);
- }
-
- max_mapnr = total_pages;
- return 0;
-}
-
void map_memory(unsigned long virt, unsigned long phys, unsigned long len,
int r, int w, int x)
{
@@ -75,25 +40,46 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len,
}
}
-extern int __syscall_stub_start;
-
+/**
+ * setup_physmem() - Setup physical memory for UML
+ * @start: Start address of the physical kernel memory,
+ * i.e start address of the executable image.
+ * @reserve_end: end address of the physical kernel memory.
+ * @len: Length of total physical memory that should be mapped/made
+ * available, in bytes.
+ *
+ * Creates an unlinked temporary file of size (len) and memory maps
+ * it on the last executable image address (uml_reserved).
+ *
+ * The offset is needed as the length of the total physical memory
+ * (len) includes the size of the memory used be the executable image,
+ * but the mapped-to address is the last address of the executable image
+ * (uml_reserved == end address of executable image).
+ *
+ * The memory mapped memory of the temporary file is used as backing memory
+ * of all user space processes/kernel tasks.
+ */
void __init setup_physmem(unsigned long start, unsigned long reserve_end,
- unsigned long len, unsigned long long highmem)
+ unsigned long len)
{
unsigned long reserve = reserve_end - start;
- int pfn = PFN_UP(__pa(reserve_end));
- int delta = (len - reserve) >> PAGE_SHIFT;
- int err, offset, bootmap_size;
+ unsigned long map_size = len - reserve;
+ int err;
+
+ if (len <= reserve) {
+ os_warn("Too few physical memory! Needed=%lu, given=%lu\n",
+ reserve, len);
+ exit(1);
+ }
- physmem_fd = create_mem_file(len + highmem);
+ physmem_fd = create_mem_file(len);
- offset = uml_reserved - uml_physmem;
- err = os_map_memory((void *) uml_reserved, physmem_fd, offset,
- len - offset, 1, 1, 1);
+ err = os_map_memory((void *) reserve_end, physmem_fd, reserve,
+ map_size, 1, 1, 1);
if (err < 0) {
- printf("setup_physmem - mapping %ld bytes of memory at 0x%p "
- "failed - errno = %d\n", len - offset,
- (void *) uml_reserved, err);
+ os_warn("setup_physmem - mapping %lu bytes of memory at 0x%p "
+ "failed - errno = %d\n", map_size,
+ (void *) reserve_end, err);
exit(1);
}
@@ -101,12 +87,14 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
* Special kludge - This page will be mapped in to userspace processes
* from physmem_fd, so it needs to be written out there.
*/
- os_seek_file(physmem_fd, __pa(&__syscall_stub_start));
- os_write_file(physmem_fd, &__syscall_stub_start, PAGE_SIZE);
+ os_seek_file(physmem_fd, __pa(__syscall_stub_start));
+ os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE);
+
+ memblock_add(__pa(start), len);
+ memblock_reserve(__pa(start), reserve);
- bootmap_size = init_bootmem(pfn, pfn + delta);
- free_bootmem(__pa(reserve_end) + bootmap_size,
- len - bootmap_size - reserve);
+ min_low_pfn = PFN_UP(__pa(reserve_end));
+ max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT);
}
int phys_mapping(unsigned long phys, unsigned long long *offset_out)
@@ -117,30 +105,16 @@ int phys_mapping(unsigned long phys, unsigned long long *offset_out)
fd = physmem_fd;
*offset_out = phys;
}
- else if (phys < __pa(end_iomem)) {
- struct iomem_region *region = iomem_regions;
-
- while (region != NULL) {
- if ((phys >= region->phys) &&
- (phys < region->phys + region->size)) {
- fd = region->fd;
- *offset_out = phys - region->phys;
- break;
- }
- region = region->next;
- }
- }
- else if (phys < __pa(end_iomem) + highmem) {
- fd = physmem_fd;
- *offset_out = phys - iomem_size;
- }
return fd;
}
+EXPORT_SYMBOL(phys_mapping);
static int __init uml_mem_setup(char *line, int *add)
{
char *retptr;
+
+ *add = 0;
physmem_size = memparse(line,&retptr);
return 0;
}
@@ -153,63 +127,3 @@ __uml_setup("mem=", uml_mem_setup,
" be more, and the excess, if it's ever used, will just be swapped out.\n"
" Example: mem=64M\n\n"
);
-
-extern int __init parse_iomem(char *str, int *add);
-
-__uml_setup("iomem=", parse_iomem,
-"iomem=<name>,<file>\n"
-" Configure <file> as an IO memory region named <name>.\n\n"
-);
-
-/*
- * This list is constructed in parse_iomem and addresses filled in in
- * setup_iomem, both of which run during early boot. Afterwards, it's
- * unchanged.
- */
-struct iomem_region *iomem_regions;
-
-/* Initialized in parse_iomem and unchanged thereafter */
-int iomem_size;
-
-unsigned long find_iomem(char *driver, unsigned long *len_out)
-{
- struct iomem_region *region = iomem_regions;
-
- while (region != NULL) {
- if (!strcmp(region->driver, driver)) {
- *len_out = region->size;
- return region->virt;
- }
-
- region = region->next;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(find_iomem);
-
-static int setup_iomem(void)
-{
- struct iomem_region *region = iomem_regions;
- unsigned long iomem_start = high_physmem + PAGE_SIZE;
- int err;
-
- while (region != NULL) {
- err = os_map_memory((void *) iomem_start, region->fd, 0,
- region->size, 1, 1, 0);
- if (err)
- printk(KERN_ERR "Mapping iomem region for driver '%s' "
- "failed, errno = %d\n", region->driver, -err);
- else {
- region->virt = iomem_start;
- region->phys = __pa(region->virt);
- }
-
- iomem_start += region->size + PAGE_SIZE;
- region = region->next;
- }
-
- return 0;
-}
-
-__initcall(setup_iomem);
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index bbcef522bcb1..63b38a3f73f7 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -1,7 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk})
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Copyright 2003 PathScale, Inc.
- * Licensed under the GPL
*/
#include <linux/stddef.h>
@@ -13,44 +15,38 @@
#include <linux/proc_fs.h>
#include <linux/ptrace.h>
#include <linux/random.h>
+#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/seq_file.h>
#include <linux/tick.h>
#include <linux/threads.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
#include <asm/current.h>
-#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-#include <asm/uaccess.h>
+#include <asm/switch_to.h>
+#include <asm/exec.h>
+#include <linux/uaccess.h>
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
#include <skas.h>
+#include <registers.h>
+#include <linux/time-internal.h>
+#include <linux/elfcore.h>
/*
* This is a per-cpu array. A processor only modifies its entry and it only
* cares about its entry, so it's OK if another processor is modifying its
* entry.
*/
-struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } };
-
-static inline int external_pid(void)
-{
- /* FIXME: Need to look up userspace_pid by cpu */
- return userspace_pid[0];
-}
-
-int pid_to_processor_id(int pid)
-{
- int i;
-
- for (i = 0; i < ncpus; i++) {
- if (cpu_tasks[i].pid == pid)
- return i;
- }
- return -1;
-}
+struct task_struct *cpu_tasks[NR_CPUS] = {
+ [0 ... NR_CPUS - 1] = &init_task,
+};
+EXPORT_SYMBOL(cpu_tasks);
void free_stack(unsigned long stack, int order)
{
@@ -71,46 +67,35 @@ unsigned long alloc_stack(int order, int atomic)
static inline void set_current(struct task_struct *task)
{
- cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task)
- { external_pid(), task });
+ cpu_tasks[task_thread_info(task)->cpu] = task;
}
-extern void arch_switch_to(struct task_struct *to);
-
-void *__switch_to(struct task_struct *from, struct task_struct *to)
+struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to)
{
to->thread.prev_sched = from;
set_current(to);
- do {
- current->thread.saved_task = NULL;
-
- switch_threads(&from->thread.switch_buf,
- &to->thread.switch_buf);
-
- arch_switch_to(current);
-
- if (current->thread.saved_task)
- show_regs(&(current->thread.regs));
- to = current->thread.saved_task;
- from = current;
- } while (current->thread.saved_task);
+ switch_threads(&from->thread.switch_buf, &to->thread.switch_buf);
+ arch_switch_to(current);
return current->thread.prev_sched;
}
void interrupt_end(void)
{
- if (need_resched())
- schedule();
- if (test_thread_flag(TIF_SIGPENDING))
- do_signal();
- if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
- tracehook_notify_resume(&current->thread.regs);
-}
-
-void exit_thread(void)
-{
+ struct pt_regs *regs = &current->thread.regs;
+ unsigned long thread_flags;
+
+ thread_flags = read_thread_flags();
+ while (thread_flags & _TIF_WORK_MASK) {
+ if (thread_flags & _TIF_NEED_RESCHED)
+ schedule();
+ if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ do_signal(regs);
+ if (thread_flags & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(regs);
+ thread_flags = read_thread_flags();
+ }
}
int get_current_pid(void)
@@ -124,28 +109,26 @@ int get_current_pid(void)
*/
void new_thread_handler(void)
{
- int (*fn)(void *), n;
+ int (*fn)(void *);
void *arg;
if (current->thread.prev_sched != NULL)
schedule_tail(current->thread.prev_sched);
current->thread.prev_sched = NULL;
- fn = current->thread.request.u.thread.proc;
- arg = current->thread.request.u.thread.arg;
+ fn = current->thread.request.thread.proc;
+ arg = current->thread.request.thread.arg;
/*
* callback returns only if the kernel thread execs a process
*/
- n = fn(arg);
+ fn(arg);
userspace(&current->thread.regs.regs);
}
/* Called magically, see new_thread_handler above */
-void fork_handler(void)
+static void fork_handler(void)
{
- force_flush_all();
-
schedule_tail(current->thread.prev_sched);
/*
@@ -160,16 +143,17 @@ void fork_handler(void)
userspace(&current->thread.regs.regs);
}
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct * p)
+int copy_thread(struct task_struct * p, const struct kernel_clone_args *args)
{
+ u64 clone_flags = args->flags;
+ unsigned long sp = args->stack;
+ unsigned long tls = args->tls;
void (*handler)(void);
- int kthread = current->flags & PF_KTHREAD;
int ret = 0;
p->thread = (struct thread_struct) INIT_THREAD;
- if (!kthread) {
+ if (!args->fn) {
memcpy(&p->thread.regs.regs, current_pt_regs(),
sizeof(p->thread.regs.regs));
PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0);
@@ -181,21 +165,21 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
arch_copy_thread(&current->thread.arch, &p->thread.arch);
} else {
get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp);
- p->thread.request.u.thread.proc = (int (*)(void *))sp;
- p->thread.request.u.thread.arg = (void *)arg;
+ p->thread.request.thread.proc = args->fn;
+ p->thread.request.thread.arg = args->fn_arg;
handler = new_thread_handler;
}
new_thread(task_stack_page(p), &p->thread.switch_buf, handler);
- if (!kthread) {
+ if (!args->fn) {
clear_flushed_tls(p);
/*
* Set a new TLS for the child thread?
*/
if (clone_flags & CLONE_SETTLS)
- ret = arch_copy_tls(p);
+ ret = arch_set_tls(p, tls);
}
return ret;
@@ -203,34 +187,50 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
void initial_thread_cb(void (*proc)(void *), void *arg)
{
- int save_kmalloc_ok = kmalloc_ok;
-
- kmalloc_ok = 0;
initial_thread_cb_skas(proc, arg);
- kmalloc_ok = save_kmalloc_ok;
+}
+
+int arch_dup_task_struct(struct task_struct *dst,
+ struct task_struct *src)
+{
+ /* init_task is not dynamically sized (missing FPU state) */
+ if (unlikely(src == &init_task)) {
+ memcpy(dst, src, sizeof(init_task));
+ memset((void *)dst + sizeof(init_task), 0,
+ arch_task_struct_size - sizeof(init_task));
+ } else {
+ memcpy(dst, src, arch_task_struct_size);
+ }
+
+ return 0;
+}
+
+void um_idle_sleep(void)
+{
+ if (time_travel_mode != TT_MODE_OFF)
+ time_travel_sleep();
+ else
+ os_idle_sleep();
}
void arch_cpu_idle(void)
{
- unsigned long long nsecs;
+ um_idle_sleep();
+}
- cpu_tasks[current_thread_info()->cpu].pid = os_getpid();
- nsecs = disable_timer();
- idle_sleep(nsecs);
- local_irq_enable();
+void arch_cpu_idle_prepare(void)
+{
+ os_idle_prepare();
}
-int __cant_sleep(void) {
+int __uml_cant_sleep(void) {
return in_atomic() || irqs_disabled() || in_interrupt();
/* Is in_interrupt() really needed? */
}
-int user_context(unsigned long sp)
+int uml_need_resched(void)
{
- unsigned long stack;
-
- stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER);
- return stack != (unsigned long) current_thread_info();
+ return need_resched();
}
extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end;
@@ -250,127 +250,20 @@ char *uml_strdup(const char *string)
}
EXPORT_SYMBOL(uml_strdup);
-int copy_to_user_proc(void __user *to, void *from, int size)
-{
- return copy_to_user(to, from, size);
-}
-
int copy_from_user_proc(void *to, void __user *from, int size)
{
return copy_from_user(to, from, size);
}
-int clear_user_proc(void __user *buf, int size)
-{
- return clear_user(buf, size);
-}
-
-int strlen_user_proc(char __user *str)
-{
- return strlen_user(str);
-}
-
-int smp_sigio_handler(void)
-{
-#ifdef CONFIG_SMP
- int cpu = current_thread_info()->cpu;
- IPI_handler(cpu);
- if (cpu != 0)
- return 1;
-#endif
- return 0;
-}
-
-int cpu(void)
+int singlestepping(void)
{
- return current_thread_info()->cpu;
-}
-
-static atomic_t using_sysemu = ATOMIC_INIT(0);
-int sysemu_supported;
-
-void set_using_sysemu(int value)
-{
- if (value > sysemu_supported)
- return;
- atomic_set(&using_sysemu, value);
-}
-
-int get_using_sysemu(void)
-{
- return atomic_read(&using_sysemu);
-}
-
-static int sysemu_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%d\n", get_using_sysemu());
- return 0;
-}
-
-static int sysemu_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, sysemu_proc_show, NULL);
-}
-
-static ssize_t sysemu_proc_write(struct file *file, const char __user *buf,
- size_t count, loff_t *pos)
-{
- char tmp[2];
-
- if (copy_from_user(tmp, buf, 1))
- return -EFAULT;
-
- if (tmp[0] >= '0' && tmp[0] <= '2')
- set_using_sysemu(tmp[0] - '0');
- /* We use the first char, but pretend to write everything */
- return count;
-}
-
-static const struct file_operations sysemu_proc_fops = {
- .owner = THIS_MODULE,
- .open = sysemu_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = sysemu_proc_write,
-};
-
-int __init make_proc_sysemu(void)
-{
- struct proc_dir_entry *ent;
- if (!sysemu_supported)
- return 0;
-
- ent = proc_create("sysemu", 0600, NULL, &sysemu_proc_fops);
-
- if (ent == NULL)
- {
- printk(KERN_WARNING "Failed to register /proc/sysemu\n");
- return 0;
- }
-
- return 0;
-}
-
-late_initcall(make_proc_sysemu);
-
-int singlestepping(void * t)
-{
- struct task_struct *task = t ? t : current;
-
- if (!(task->ptrace & PT_DTRACE))
- return 0;
-
- if (task->thread.singlestep_syscall)
- return 1;
-
- return 2;
+ return test_thread_flag(TIF_SINGLESTEP);
}
/*
* Only x86 and x86_64 have an arch_align_stack().
* All other arches have "#define arch_align_stack(x) (x)"
- * in their asm/system.h
+ * in their asm/exec.h
* As this is included in UML from asm-um/system-generic.h,
* we can use it to behave as the subarch does.
*/
@@ -378,19 +271,16 @@ int singlestepping(void * t)
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
+ sp -= get_random_u32_below(8192);
return sp & ~0xf;
}
#endif
-unsigned long get_wchan(struct task_struct *p)
+unsigned long __get_wchan(struct task_struct *p)
{
unsigned long stack_page, sp, ip;
bool seen_sched = 0;
- if ((p == NULL) || (p == current) || (p->state == TASK_RUNNING))
- return 0;
-
stack_page = (unsigned long) task_stack_page(p);
/* Bail if the process has no kernel stack for some reason */
if (stack_page == 0)
@@ -417,11 +307,3 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
-
-int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu)
-{
- int cpu = current_thread_info()->cpu;
-
- return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu);
-}
-
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 694d551c8899..fdbb37b5c399 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -1,21 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/audit.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
-#include <linux/tracehook.h>
-#include <asm/uaccess.h>
-#include <skas_ptrace.h>
-
+#include <linux/uaccess.h>
+#include <asm/ptrace-abi.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
void user_enable_single_step(struct task_struct *child)
{
- child->ptrace |= PT_DTRACE;
- child->thread.singlestep_syscall = 0;
+ set_tsk_thread_flag(child, TIF_SINGLESTEP);
#ifdef SUBARCH_SET_SINGLESTEPPING
SUBARCH_SET_SINGLESTEPPING(child, 1);
@@ -24,8 +23,7 @@ void user_enable_single_step(struct task_struct *child)
void user_disable_single_step(struct task_struct *child)
{
- child->ptrace &= ~PT_DTRACE;
- child->thread.singlestep_syscall = 0;
+ clear_tsk_thread_flag(child, TIF_SINGLESTEP);
#ifdef SUBARCH_SET_SINGLESTEPPING
SUBARCH_SET_SINGLESTEPPING(child, 0);
@@ -40,9 +38,6 @@ void ptrace_disable(struct task_struct *child)
user_disable_single_step(child);
}
-extern int peek_user(struct task_struct * child, long addr, long data);
-extern int poke_user(struct task_struct * child, long addr, long data);
-
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
@@ -68,7 +63,7 @@ long arch_ptrace(struct task_struct *child, long request,
#ifdef PTRACE_GETREGS
case PTRACE_GETREGS: { /* Get all gp regs from the child. */
- if (!access_ok(VERIFY_WRITE, p, MAX_REG_OFFSET)) {
+ if (!access_ok(p, MAX_REG_OFFSET)) {
ret = -EIO;
break;
}
@@ -83,7 +78,7 @@ long arch_ptrace(struct task_struct *child, long request,
#ifdef PTRACE_SETREGS
case PTRACE_SETREGS: { /* Set all gp regs in the child. */
unsigned long tmp = 0;
- if (!access_ok(VERIFY_READ, p, MAX_REG_OFFSET)) {
+ if (!access_ok(p, MAX_REG_OFFSET)) {
ret = -EIO;
break;
}
@@ -104,35 +99,6 @@ long arch_ptrace(struct task_struct *child, long request,
ret = ptrace_set_thread_area(child, addr, vp);
break;
- case PTRACE_FAULTINFO: {
- /*
- * Take the info from thread->arch->faultinfo,
- * but transfer max. sizeof(struct ptrace_faultinfo).
- * On i386, ptrace_faultinfo is smaller!
- */
- ret = copy_to_user(p, &child->thread.arch.faultinfo,
- sizeof(struct ptrace_faultinfo)) ?
- -EIO : 0;
- break;
- }
-
-#ifdef PTRACE_LDT
- case PTRACE_LDT: {
- struct ptrace_ldt ldt;
-
- if (copy_from_user(&ldt, p, sizeof(ldt))) {
- ret = -EIO;
- break;
- }
-
- /*
- * This one is confusing, so just punt and return -EIO for
- * now
- */
- ret = -EIO;
- break;
- }
-#endif
default:
ret = ptrace_request(child, request, addr, data);
if (ret == -EIO)
@@ -143,39 +109,33 @@ long arch_ptrace(struct task_struct *child, long request,
return ret;
}
-static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs,
- int error_code)
+static void send_sigtrap(struct uml_pt_regs *regs, int error_code)
{
- struct siginfo info;
-
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGTRAP;
- info.si_code = TRAP_BRKPT;
-
- /* User-mode eip? */
- info.si_addr = UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL;
-
/* Send us the fake SIGTRAP */
- force_sig_info(SIGTRAP, &info, tsk);
+ force_sig_fault(SIGTRAP, TRAP_BRKPT,
+ /* User-mode eip? */
+ UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL);
}
/*
- * XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and
+ * XXX Check TIF_SINGLESTEP for singlestepping check and
* PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check
*/
-void syscall_trace_enter(struct pt_regs *regs)
+int syscall_trace_enter(struct pt_regs *regs)
{
- audit_syscall_entry(HOST_AUDIT_ARCH,
- UPT_SYSCALL_NR(&regs->regs),
+ audit_syscall_entry(UPT_SYSCALL_NR(&regs->regs),
UPT_SYSCALL_ARG1(&regs->regs),
UPT_SYSCALL_ARG2(&regs->regs),
UPT_SYSCALL_ARG3(&regs->regs),
UPT_SYSCALL_ARG4(&regs->regs));
+ if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+ trace_sys_enter(regs, UPT_SYSCALL_NR(&regs->regs));
+
if (!test_thread_flag(TIF_SYSCALL_TRACE))
- return;
+ return 0;
- tracehook_report_syscall_entry(regs);
+ return ptrace_report_syscall_entry(regs);
}
void syscall_trace_leave(struct pt_regs *regs)
@@ -185,13 +145,16 @@ void syscall_trace_leave(struct pt_regs *regs)
audit_syscall_exit(regs);
/* Fake a debug trap */
- if (ptraced & PT_DTRACE)
- send_sigtrap(current, &regs->regs, 0);
+ if (test_thread_flag(TIF_SINGLESTEP))
+ send_sigtrap(&regs->regs, 0);
+
+ if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+ trace_sys_exit(regs, PT_REGS_SYSCALL_RET(regs));
if (!test_thread_flag(TIF_SYSCALL_TRACE))
return;
- tracehook_report_syscall_exit(regs, 0);
+ ptrace_report_syscall_exit(regs, 0);
/* force do_signal() --> is_syscall() */
if (ptraced & PT_PTRACED)
set_thread_flag(TIF_SIGPENDING);
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index ced8903921ae..680bce4bd8fa 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -1,42 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/oom.h>
+#include <linux/reboot.h>
#include <kern_util.h>
#include <os.h>
#include <skas.h>
void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
static void kill_off_processes(void)
{
- if (proc_mm)
- /*
- * FIXME: need to loop over userspace_pids
- */
- os_kill_ptraced_process(userspace_pid[0], 1);
- else {
- struct task_struct *p;
- int pid;
+ struct task_struct *p;
+ int pid;
- read_lock(&tasklist_lock);
- for_each_process(p) {
- struct task_struct *t;
+ read_lock(&tasklist_lock);
+ for_each_process(p) {
+ struct task_struct *t;
- t = find_lock_task_mm(p);
- if (!t)
- continue;
- pid = t->mm->context.id.u.pid;
- task_unlock(t);
- os_kill_ptraced_process(pid, 1);
- }
- read_unlock(&tasklist_lock);
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ pid = t->mm->context.id.pid;
+ task_unlock(t);
+ os_kill_ptraced_process(pid, 1);
}
+ read_unlock(&tasklist_lock);
}
void uml_cleanup(void)
@@ -62,3 +59,18 @@ void machine_halt(void)
{
machine_power_off();
}
+
+static int sys_power_off_handler(struct sys_off_data *data)
+{
+ machine_power_off();
+ return 0;
+}
+
+static int register_power_off(void)
+{
+ register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ sys_power_off_handler, NULL);
+ return 0;
+}
+__initcall(register_power_off);
diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c
index b5e0cbb34382..4fc04742048a 100644
--- a/arch/um/kernel/sigio.c
+++ b/arch/um/kernel/sigio.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com)
- * Licensed under the GPL
*/
#include <linux/interrupt.h>
@@ -8,42 +8,15 @@
#include <os.h>
#include <sigio.h>
-/* Protected by sigio_lock() called from write_sigio_workaround */
-static int sigio_irq_fd = -1;
-
-static irqreturn_t sigio_interrupt(int irq, void *data)
-{
- char c;
-
- os_read_file(sigio_irq_fd, &c, sizeof(c));
- reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ);
- return IRQ_HANDLED;
-}
-
-int write_sigio_irq(int fd)
-{
- int err;
-
- err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt,
- 0, "write sigio", NULL);
- if (err) {
- printk(KERN_ERR "write_sigio_irq : um_request_irq failed, "
- "err = %d\n", err);
- return -1;
- }
- sigio_irq_fd = fd;
- return 0;
-}
-
/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
-static DEFINE_SPINLOCK(sigio_spinlock);
+static DEFINE_MUTEX(sigio_mutex);
void sigio_lock(void)
{
- spin_lock(&sigio_spinlock);
+ mutex_lock(&sigio_mutex);
}
void sigio_unlock(void)
{
- spin_unlock(&sigio_spinlock);
+ mutex_unlock(&sigio_mutex);
}
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 3e831b3fd07b..a56b44522766 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -1,32 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
+#include <linux/ftrace.h>
#include <asm/siginfo.h>
#include <asm/signal.h>
#include <asm/unistd.h>
#include <frame_kern.h>
#include <kern_util.h>
+#include <os.h>
EXPORT_SYMBOL(block_signals);
EXPORT_SYMBOL(unblock_signals);
+void block_signals_trace(void)
+{
+ block_signals();
+ if (current_thread_info())
+ trace_hardirqs_off();
+}
+
+void unblock_signals_trace(void)
+{
+ if (current_thread_info())
+ trace_hardirqs_on();
+ unblock_signals();
+}
+
+void um_trace_signals_on(void)
+{
+ if (current_thread_info())
+ trace_hardirqs_on();
+}
+
+void um_trace_signals_off(void)
+{
+ if (current_thread_info())
+ trace_hardirqs_off();
+}
+
/*
* OK, we're invoking a handler
*/
-static void handle_signal(struct pt_regs *regs, unsigned long signr,
- struct k_sigaction *ka, siginfo_t *info)
+static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
sigset_t *oldset = sigmask_to_save();
int singlestep = 0;
unsigned long sp;
int err;
- if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
+ if (test_thread_flag(TIF_SINGLESTEP) && (current->ptrace & PT_PTRACED))
singlestep = 1;
/* Did we come from a system call? */
@@ -39,11 +66,11 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr,
break;
case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
+ if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
PT_REGS_SYSCALL_RET(regs) = -EINTR;
break;
}
- /* fallthrough */
+ fallthrough;
case -ERESTARTNOINTR:
PT_REGS_RESTART_SYSCALL(regs);
PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs);
@@ -52,32 +79,28 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr,
}
sp = PT_REGS_SP(regs);
- if ((ka->sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0))
+ if ((ksig->ka.sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0))
sp = current->sas_ss_sp + current->sas_ss_size;
#ifdef CONFIG_ARCH_HAS_SC_SIGNALS
- if (!(ka->sa.sa_flags & SA_SIGINFO))
- err = setup_signal_stack_sc(sp, signr, ka, regs, oldset);
+ if (!(ksig->ka.sa.sa_flags & SA_SIGINFO))
+ err = setup_signal_stack_sc(sp, ksig, regs, oldset);
else
#endif
- err = setup_signal_stack_si(sp, signr, ka, regs, info, oldset);
+ err = setup_signal_stack_si(sp, ksig, regs, oldset);
- if (err)
- force_sigsegv(signr, current);
- else
- signal_delivered(signr, info, ka, regs, singlestep);
+ signal_setup_done(err, ksig, singlestep);
}
-static int kern_do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
{
- struct k_sigaction ka_copy;
- siginfo_t info;
- int sig, handled_sig = 0;
+ struct ksignal ksig;
+ int handled_sig = 0;
- while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) {
+ while (get_signal(&ksig)) {
handled_sig = 1;
/* Whee! Actually deliver the signal. */
- handle_signal(regs, sig, &ka_copy, &info);
+ handle_signal(&ksig, regs);
}
/* Did we come from a system call? */
@@ -98,27 +121,9 @@ static int kern_do_signal(struct pt_regs *regs)
}
/*
- * This closes a way to execute a system call on the host. If
- * you set a breakpoint on a system call instruction and singlestep
- * from it, the tracing thread used to PTRACE_SINGLESTEP the process
- * rather than PTRACE_SYSCALL it, allowing the system call to execute
- * on the host. The tracing thread will check this flag and
- * PTRACE_SYSCALL if necessary.
- */
- if (current->ptrace & PT_DTRACE)
- current->thread.singlestep_syscall =
- is_syscall(PT_REGS_IP(&current->thread.regs));
-
- /*
* if there's no signal to deliver, we just put the saved sigmask
* back
*/
if (!handled_sig)
restore_saved_sigmask();
- return handled_sig;
-}
-
-int do_signal(void)
-{
- return kern_do_signal(&current->thread.regs);
}
diff --git a/arch/um/kernel/skas/.gitignore b/arch/um/kernel/skas/.gitignore
new file mode 100644
index 000000000000..c3409ced0f38
--- /dev/null
+++ b/arch/um/kernel/skas/.gitignore
@@ -0,0 +1,2 @@
+stub_exe
+stub_exe.dbg
diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile
index 0b76d8869c94..3384be42691f 100644
--- a/arch/um/kernel/skas/Makefile
+++ b/arch/um/kernel/skas/Makefile
@@ -1,15 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
-# Licensed under the GPL
#
-obj-y := clone.o mmu.o process.o syscall.o uaccess.o
+obj-y := stub.o mmu.o process.o syscall.o uaccess.o \
+ stub_exe_embed.o
-# clone.o is in the stub, so it can't be built with profiling
+# Stub executable
+
+stub_exe_objs-y := stub_exe.o
+
+stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F)
+
+# Object file containing the ELF executable
+$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe
+
+$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE
+ $(call if_changed,stub_exe)
+
+$(obj)/stub_exe: OBJCOPYFLAGS := -S
+$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE
+ $(call if_changed,objcopy)
+
+quiet_cmd_stub_exe = STUB_EXE $@
+ cmd_stub_exe = $(CC) -nostdlib -o $@ \
+ $(filter-out $(UM_GPROF_OPT) $(UM_GCOV_OPT),$(KBUILD_CFLAGS)) $(STUB_EXE_LDFLAGS) \
+ $(filter %.o,$^)
+
+STUB_EXE_LDFLAGS = -Wl,-n -static
+
+targets += stub_exe.dbg stub_exe $(stub_exe_objs-y)
+
+# end
+
+# stub.o is in the stub, so it can't be built with profiling
# GCC hardened also auto-enables -fpic, but we need %ebx so it can't work ->
# disable it
-CFLAGS_clone.o := $(CFLAGS_NO_HARDENING)
-UNPROFILE_OBJS := clone.o
+CFLAGS_stub.o := $(CFLAGS_NO_HARDENING)
+CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING)
+
+# Clang will call memset() from __builtin_alloca() when stack variable
+# initialization is enabled, which is used in stub_exe.c.
+CFLAGS_stub_exe.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized)
+
+UNPROFILE_OBJS := stub.o stub_exe.o
+KCOV_INSTRUMENT := n
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c
deleted file mode 100644
index 289771dadf81..000000000000
--- a/arch/um/kernel/skas/clone.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <signal.h>
-#include <sched.h>
-#include <asm/unistd.h>
-#include <sys/time.h>
-#include <as-layout.h>
-#include <ptrace_user.h>
-#include <stub-data.h>
-#include <sysdep/stub.h>
-
-/*
- * This is in a separate file because it needs to be compiled with any
- * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled
- *
- * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize
- * on some systems.
- */
-
-void __attribute__ ((__section__ (".__syscall_stub")))
-stub_clone_handler(void)
-{
- struct stub_data *data = (struct stub_data *) STUB_DATA;
- long err;
-
- err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD,
- STUB_DATA + UM_KERN_PAGE_SIZE / 2 - sizeof(void *));
- if (err != 0)
- goto out;
-
- err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
- if (err)
- goto out;
-
- err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL,
- (long) &data->timer, 0);
- if (err)
- goto out;
-
- remap_stack(data->fd, data->offset);
- goto done;
-
- out:
- /*
- * save current result.
- * Parent: pid;
- * child: retcode of mmap already saved and it jumps around this
- * assignment
- */
- data->err = err;
- done:
- trap_myself();
-}
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index ff03067a3b14..00957788591b 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -1,178 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
+
+#include <shared/irq_kern.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/mmu_context.h>
#include <as-layout.h>
#include <os.h>
#include <skas.h>
+#include <stub-data.h>
-extern int __syscall_stub_start;
+/* Ensure the stub_data struct covers the allocated area */
+static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE);
-static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
- unsigned long kernel)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(mm, proc);
- pud = pud_alloc(mm, pgd, proc);
- if (!pud)
- goto out;
+static spinlock_t mm_list_lock;
+static struct list_head mm_list;
- pmd = pmd_alloc(mm, pud, proc);
- if (!pmd)
- goto out_pmd;
+void enter_turnstile(struct mm_id *mm_id) __acquires(turnstile)
+{
+ struct mm_context *ctx = container_of(mm_id, struct mm_context, id);
- pte = pte_alloc_map(mm, NULL, pmd, proc);
- if (!pte)
- goto out_pte;
+ mutex_lock(&ctx->turnstile);
+}
- *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
- *pte = pte_mkread(*pte);
- return 0;
+void exit_turnstile(struct mm_id *mm_id) __releases(turnstile)
+{
+ struct mm_context *ctx = container_of(mm_id, struct mm_context, id);
- out_pte:
- pmd_free(mm, pmd);
- out_pmd:
- pud_free(mm, pud);
- out:
- return -ENOMEM;
+ mutex_unlock(&ctx->turnstile);
}
int init_new_context(struct task_struct *task, struct mm_struct *mm)
{
- struct mm_context *from_mm = NULL;
- struct mm_context *to_mm = &mm->context;
+ struct mm_id *new_id = &mm->context.id;
unsigned long stack = 0;
int ret = -ENOMEM;
- if (skas_needs_stub) {
- stack = get_zeroed_page(GFP_KERNEL);
- if (stack == 0)
- goto out;
- }
+ mutex_init(&mm->context.turnstile);
+ spin_lock_init(&mm->context.sync_tlb_lock);
- to_mm->id.stack = stack;
- if (current->mm != NULL && current->mm != &init_mm)
- from_mm = &current->mm->context;
+ stack = __get_free_pages(GFP_KERNEL | __GFP_ZERO, ilog2(STUB_DATA_PAGES));
+ if (stack == 0)
+ goto out;
- if (proc_mm) {
- ret = new_mm(stack);
- if (ret < 0) {
- printk(KERN_ERR "init_new_context_skas - "
- "new_mm failed, errno = %d\n", ret);
- goto out_free;
- }
- to_mm->id.u.mm_fd = ret;
- }
- else {
- if (from_mm)
- to_mm->id.u.pid = copy_context_skas0(stack,
- from_mm->id.u.pid);
- else to_mm->id.u.pid = start_userspace(stack);
-
- if (to_mm->id.u.pid < 0) {
- ret = to_mm->id.u.pid;
- goto out_free;
- }
+ new_id->stack = stack;
+ new_id->syscall_data_len = 0;
+ new_id->syscall_fd_num = 0;
+
+ scoped_guard(spinlock_irqsave, &mm_list_lock) {
+ /* Insert into list, used for lookups when the child dies */
+ list_add(&mm->context.list, &mm_list);
}
- ret = init_new_ldt(to_mm, from_mm);
- if (ret < 0) {
- printk(KERN_ERR "init_new_context_skas - init_ldt"
- " failed, errno = %d\n", ret);
+ ret = start_userspace(new_id);
+ if (ret < 0)
goto out_free;
- }
+
+ /* Ensure the new MM is clean and nothing unwanted is mapped */
+ unmap(new_id, 0, STUB_START);
return 0;
out_free:
- if (to_mm->id.stack != 0)
- free_page(to_mm->id.stack);
+ free_pages(new_id->stack, ilog2(STUB_DATA_PAGES));
out:
return ret;
}
-void uml_setup_stubs(struct mm_struct *mm)
+void destroy_context(struct mm_struct *mm)
{
- int err, ret;
+ struct mm_context *mmu = &mm->context;
- if (!skas_needs_stub)
+ /*
+ * If init_new_context wasn't called, this will be
+ * zero, resulting in a kill(0), which will result in the
+ * whole UML suddenly dying. Also, cover negative and
+ * 1 cases, since they shouldn't happen either.
+ *
+ * Negative cases happen if the child died unexpectedly.
+ */
+ if (mmu->id.pid >= 0 && mmu->id.pid < 2) {
+ printk(KERN_ERR "corrupt mm_context - pid = %d\n",
+ mmu->id.pid);
return;
+ }
- ret = init_stub_pte(mm, STUB_CODE,
- (unsigned long) &__syscall_stub_start);
- if (ret)
- goto out;
-
- ret = init_stub_pte(mm, STUB_DATA, mm->context.id.stack);
- if (ret)
- goto out;
-
- mm->context.stub_pages[0] = virt_to_page(&__syscall_stub_start);
- mm->context.stub_pages[1] = virt_to_page(mm->context.id.stack);
+ scoped_guard(spinlock_irqsave, &mm_list_lock)
+ list_del(&mm->context.list);
- /* dup_mmap already holds mmap_sem */
- err = install_special_mapping(mm, STUB_START, STUB_END - STUB_START,
- VM_READ | VM_MAYREAD | VM_EXEC |
- VM_MAYEXEC | VM_DONTCOPY,
- mm->context.stub_pages);
- if (err) {
- printk(KERN_ERR "install_special_mapping returned %d\n", err);
- goto out;
+ if (mmu->id.pid > 0) {
+ os_kill_ptraced_process(mmu->id.pid, 1);
+ mmu->id.pid = -1;
}
- return;
-out:
- force_sigsegv(SIGSEGV, current);
+ if (using_seccomp && mmu->id.sock)
+ os_close_file(mmu->id.sock);
+
+ free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));
}
-void arch_exit_mmap(struct mm_struct *mm)
+static irqreturn_t mm_sigchld_irq(int irq, void* dev)
{
- pte_t *pte;
+ struct mm_context *mm_context;
+ pid_t pid;
- pte = virt_to_pte(mm, STUB_CODE);
- if (pte != NULL)
- pte_clear(mm, STUB_CODE, pte);
+ guard(spinlock)(&mm_list_lock);
- pte = virt_to_pte(mm, STUB_DATA);
- if (pte == NULL)
- return;
+ while ((pid = os_reap_child()) > 0) {
+ /*
+ * A child died, check if we have an MM with the PID. This is
+ * only relevant in SECCOMP mode (as ptrace will fail anyway).
+ *
+ * See wait_stub_done_seccomp for more details.
+ */
+ list_for_each_entry(mm_context, &mm_list, list) {
+ if (mm_context->id.pid == pid) {
+ struct stub_data *stub_data;
+ printk("Unexpectedly lost MM child! Affected tasks will segfault.");
+
+ /* Marks the MM as dead */
+ mm_context->id.pid = -1;
+
+ stub_data = (void *)mm_context->id.stack;
+ stub_data->futex = FUTEX_IN_KERN;
+#if IS_ENABLED(CONFIG_SMP)
+ os_futex_wake(&stub_data->futex);
+#endif
+
+ /*
+ * NOTE: Currently executing syscalls by
+ * affected tasks may finish normally.
+ */
+ break;
+ }
+ }
+ }
- pte_clear(mm, STUB_DATA, pte);
+ return IRQ_HANDLED;
}
-void destroy_context(struct mm_struct *mm)
+static int __init init_child_tracking(void)
{
- struct mm_context *mmu = &mm->context;
+ int err;
- if (proc_mm)
- os_close_file(mmu->id.u.mm_fd);
- else {
- /*
- * If init_new_context wasn't called, this will be
- * zero, resulting in a kill(0), which will result in the
- * whole UML suddenly dying. Also, cover negative and
- * 1 cases, since they shouldn't happen either.
- */
- if (mmu->id.u.pid < 2) {
- printk(KERN_ERR "corrupt mm_context - pid = %d\n",
- mmu->id.u.pid);
- return;
- }
- os_kill_ptraced_process(mmu->id.u.pid, 1);
- }
+ spin_lock_init(&mm_list_lock);
+ INIT_LIST_HEAD(&mm_list);
- if (skas_needs_stub)
- free_page(mmu->id.stack);
+ err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL);
+ if (err < 0)
+ panic("Failed to register SIGCHLD IRQ: %d", err);
- free_ldt(mmu);
+ return 0;
}
+early_initcall(init_child_tracking)
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index 4da11b3c8ddb..4a7673b0261a 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -1,73 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/task.h>
+#include <linux/smp-internal.h>
+
+#include <asm/tlbflush.h>
+
#include <as-layout.h>
#include <kern.h>
#include <os.h>
#include <skas.h>
-
-int new_mm(unsigned long stack)
-{
- int fd, err;
-
- fd = os_open_file("/proc/mm", of_cloexec(of_write(OPENFLAGS())), 0);
- if (fd < 0)
- return fd;
-
- if (skas_needs_stub) {
- err = map_stub_pages(fd, STUB_CODE, STUB_DATA, stack);
- if (err) {
- os_close_file(fd);
- return err;
- }
- }
-
- return fd;
-}
+#include <kern_util.h>
extern void start_kernel(void);
static int __init start_kernel_proc(void *unused)
{
- int pid;
-
- block_signals();
- pid = os_getpid();
+ block_signals_trace();
- cpu_tasks[0].pid = pid;
- cpu_tasks[0].task = current;
-#ifdef CONFIG_SMP
- init_cpu_online(get_cpu_mask(0));
-#endif
start_kernel();
return 0;
}
-extern int userspace_pid[];
-
-extern char cpu0_irqstack[];
+char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE);
int __init start_uml(void)
{
- stack_protections((unsigned long) &cpu0_irqstack);
- set_sigstack(cpu0_irqstack, THREAD_SIZE);
- if (proc_mm) {
- userspace_pid[0] = start_userspace(0);
- if (userspace_pid[0] < 0) {
- printf("start_uml - start_userspace returned %d\n",
- userspace_pid[0]);
- exit(1);
- }
- }
+ stack_protections((unsigned long) &cpu_irqstacks[0]);
+ set_sigstack(cpu_irqstacks[0], THREAD_SIZE);
init_new_thread_signals();
- init_task.thread.request.u.thread.proc = start_kernel_proc;
- init_task.thread.request.u.thread.arg = NULL;
+ init_task.thread.request.thread.proc = start_kernel_proc;
+ init_task.thread.request.thread.arg = NULL;
return start_idle_thread(task_stack_page(&init_task),
&init_task.thread.switch_buf);
}
@@ -79,3 +49,31 @@ unsigned long current_stub_stack(void)
return current->mm->context.id.stack;
}
+
+struct mm_id *current_mm_id(void)
+{
+ if (current->mm == NULL)
+ return NULL;
+
+ return &current->mm->context.id;
+}
+
+void current_mm_sync(void)
+{
+ if (current->mm == NULL)
+ return;
+
+ um_tlb_sync(current->mm);
+}
+
+static DEFINE_SPINLOCK(initial_jmpbuf_spinlock);
+
+void initial_jmpbuf_lock(void)
+{
+ spin_lock_irq(&initial_jmpbuf_spinlock);
+}
+
+void initial_jmpbuf_unlock(void)
+{
+ spin_unlock_irq(&initial_jmpbuf_spinlock);
+}
diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c
new file mode 100644
index 000000000000..67cab46a602c
--- /dev/null
+++ b/arch/um/kernel/skas/stub.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
+ */
+
+#include <sysdep/stub.h>
+
+#include <linux/futex.h>
+#include <sys/socket.h>
+#include <errno.h>
+
+/*
+ * Known security issues
+ *
+ * Userspace can jump to this address to execute *any* syscall that is
+ * permitted by the stub. As we will return afterwards, it can do
+ * whatever it likes, including:
+ * - Tricking the kernel into handing out the memory FD
+ * - Using this memory FD to read/write all physical memory
+ * - Running in parallel to the kernel processing a syscall
+ * (possibly creating data races?)
+ * - Blocking e.g. SIGALRM to avoid time based scheduling
+ *
+ * To avoid this, the permitted location for each syscall needs to be
+ * checked for in the SECCOMP filter (which is reasonably simple). Also,
+ * more care will need to go into considerations how the code might be
+ * tricked by using a prepared stack (or even modifying the stack from
+ * another thread in case SMP support is added).
+ *
+ * As for the SIGALRM, the best counter measure will be to check in the
+ * kernel that the process is reporting back the SIGALRM in a timely
+ * fashion.
+ */
+static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
+{
+ struct stub_data *d = get_stub_data();
+ int i;
+ unsigned long res;
+ int fd;
+
+ for (i = 0; i < d->syscall_data_len; i++) {
+ struct stub_syscall *sc = &d->syscall_data[i];
+
+ switch (sc->syscall) {
+ case STUB_SYSCALL_MMAP:
+ if (fd_map)
+ fd = fd_map[sc->mem.fd];
+ else
+ fd = sc->mem.fd;
+
+ res = stub_syscall6(STUB_MMAP_NR,
+ sc->mem.addr, sc->mem.length,
+ sc->mem.prot,
+ MAP_SHARED | MAP_FIXED,
+ fd, sc->mem.offset);
+ if (res != sc->mem.addr) {
+ d->err = res;
+ d->syscall_data_len = i;
+ return -1;
+ }
+ break;
+ case STUB_SYSCALL_MUNMAP:
+ res = stub_syscall2(__NR_munmap,
+ sc->mem.addr, sc->mem.length);
+ if (res) {
+ d->err = res;
+ d->syscall_data_len = i;
+ return -1;
+ }
+ break;
+ default:
+ d->err = -95; /* EOPNOTSUPP */
+ d->syscall_data_len = i;
+ return -1;
+ }
+ }
+
+ d->err = 0;
+ d->syscall_data_len = 0;
+
+ return 0;
+}
+
+void __section(".__syscall_stub")
+stub_syscall_handler(void)
+{
+ syscall_handler(NULL);
+
+ trap_myself();
+}
+
+void __section(".__syscall_stub")
+stub_signal_interrupt(int sig, siginfo_t *info, void *p)
+{
+ struct stub_data *d = get_stub_data();
+ char rcv_data;
+ union {
+ char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
+ struct cmsghdr align;
+ } ctrl = {};
+ struct iovec iov = {
+ .iov_base = &rcv_data,
+ .iov_len = 1,
+ };
+ struct msghdr msghdr = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = &ctrl,
+ .msg_controllen = sizeof(ctrl),
+ };
+ ucontext_t *uc = p;
+ struct cmsghdr *fd_msg;
+ int *fd_map;
+ int num_fds;
+ long res;
+
+ d->signal = sig;
+ d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0];
+ d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0];
+
+restart_wait:
+ d->futex = FUTEX_IN_KERN;
+ do {
+ res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
+ FUTEX_WAKE, 1);
+ } while (res == -EINTR);
+
+ do {
+ res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
+ FUTEX_WAIT, FUTEX_IN_KERN, 0);
+ } while (res == -EINTR || d->futex == FUTEX_IN_KERN);
+
+ if (res < 0 && res != -EAGAIN)
+ stub_syscall1(__NR_exit_group, 1);
+
+ if (d->syscall_data_len) {
+ /* Read passed FDs (if any) */
+ do {
+ res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
+ } while (res == -EINTR);
+
+ /* We should never have a receive error (other than -EAGAIN) */
+ if (res < 0 && res != -EAGAIN)
+ stub_syscall1(__NR_exit_group, 1);
+
+ /* Receive the FDs */
+ num_fds = 0;
+ fd_msg = msghdr.msg_control;
+ fd_map = (void *)&CMSG_DATA(fd_msg);
+ if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
+ num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+ /* Try running queued syscalls. */
+ res = syscall_handler(fd_map);
+
+ while (num_fds)
+ stub_syscall2(__NR_close, fd_map[--num_fds], 0);
+ } else {
+ res = 0;
+ }
+
+ if (res < 0 || d->restart_wait) {
+ /* Report SIGSYS if we restart. */
+ d->signal = SIGSYS;
+ d->restart_wait = 0;
+
+ goto restart_wait;
+ }
+
+ /* Restore arch dependent state that is not part of the mcontext */
+ stub_seccomp_restore_state(&d->arch_data);
+
+ /* Return so that the host modified mcontext is restored. */
+}
+
+void __section(".__syscall_stub")
+stub_signal_restorer(void)
+{
+ /* We must not have anything on the stack when doing rt_sigreturn */
+ stub_syscall0(__NR_rt_sigreturn);
+}
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
new file mode 100644
index 000000000000..cbafaa684e66
--- /dev/null
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -0,0 +1,230 @@
+#include <sys/ptrace.h>
+#include <sys/prctl.h>
+#include <sys/fcntl.h>
+#include <asm/unistd.h>
+#include <sysdep/stub.h>
+#include <stub-data.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <generated/asm-offsets.h>
+
+void _start(void);
+
+noinline static void real_init(void)
+{
+ struct stub_init_data init_data;
+ unsigned long res;
+ struct {
+ void *ss_sp;
+ int ss_flags;
+ size_t ss_size;
+ } stack = {
+ .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
+ };
+ struct {
+ void *sa_handler_;
+ unsigned long sa_flags;
+ void *sa_restorer;
+ unsigned long long sa_mask;
+ } sa = {
+ /* Need to set SA_RESTORER (but the handler never returns) */
+ .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
+ };
+
+ /* set a nice name */
+ stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace");
+
+ /* Make sure this process dies if the kernel dies */
+ stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);
+
+ /* Needed in SECCOMP mode (and safe to do anyway) */
+ stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+
+ /* read information from STDIN and close it */
+ res = stub_syscall3(__NR_read, 0,
+ (unsigned long)&init_data, sizeof(init_data));
+ if (res != sizeof(init_data))
+ stub_syscall1(__NR_exit, 10);
+
+ /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */
+ if (!init_data.seccomp)
+ stub_syscall1(__NR_close, 0);
+ else
+ stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK);
+
+ /* map stub code + data */
+ res = stub_syscall6(STUB_MMAP_NR,
+ init_data.stub_start, UM_KERN_PAGE_SIZE,
+ PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED,
+ init_data.stub_code_fd, init_data.stub_code_offset);
+ if (res != init_data.stub_start)
+ stub_syscall1(__NR_exit, 11);
+
+ res = stub_syscall6(STUB_MMAP_NR,
+ init_data.stub_start + UM_KERN_PAGE_SIZE,
+ STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
+ PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
+ init_data.stub_data_fd, init_data.stub_data_offset);
+ if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
+ stub_syscall1(__NR_exit, 12);
+
+ /* In SECCOMP mode, we only need the signalling FD from now on */
+ if (init_data.seccomp) {
+ res = stub_syscall3(__NR_close_range, 1, ~0U, 0);
+ if (res != 0)
+ stub_syscall1(__NR_exit, 13);
+ }
+
+ /* setup signal stack inside stub data */
+ stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
+ stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
+
+ /* register signal handlers */
+ sa.sa_handler_ = (void *) init_data.signal_handler;
+ sa.sa_restorer = (void *) init_data.signal_restorer;
+ if (!init_data.seccomp) {
+ /* In ptrace mode, the SIGSEGV handler never returns */
+ sa.sa_mask = 0;
+
+ res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+ if (res != 0)
+ stub_syscall1(__NR_exit, 14);
+ } else {
+ /* SECCOMP mode uses rt_sigreturn, need to mask all signals */
+ sa.sa_mask = ~0ULL;
+
+ res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+ if (res != 0)
+ stub_syscall1(__NR_exit, 15);
+
+ res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+ if (res != 0)
+ stub_syscall1(__NR_exit, 16);
+
+ res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+ if (res != 0)
+ stub_syscall1(__NR_exit, 17);
+
+ res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+ if (res != 0)
+ stub_syscall1(__NR_exit, 18);
+
+ res = stub_syscall4(__NR_rt_sigaction, SIGILL,
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+ if (res != 0)
+ stub_syscall1(__NR_exit, 19);
+
+ res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+ if (res != 0)
+ stub_syscall1(__NR_exit, 20);
+ }
+
+ /*
+ * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
+ * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
+ */
+ if (init_data.seccomp) {
+ struct sock_filter filter[] = {
+#if __BITS_PER_LONG > 32
+ /* [0] Load upper 32bit of instruction pointer from seccomp_data */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ (offsetof(struct seccomp_data, instruction_pointer) + 4)),
+
+ /* [1] Jump forward 3 instructions if the upper address is not identical */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3),
+#endif
+ /* [2] Load lower 32bit of instruction pointer from seccomp_data */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ (offsetof(struct seccomp_data, instruction_pointer))),
+
+ /* [3] Mask out lower bits */
+ BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
+
+ /* [4] Jump to [6] if the lower bits are not on the expected page */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0),
+
+ /* [5] Trap call, allow */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
+
+ /* [6,7] Check architecture */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ offsetof(struct seccomp_data, arch)),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
+ UM_SECCOMP_ARCH_NATIVE, 1, 0),
+
+ /* [8] Kill (for architecture check) */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+
+ /* [9] Load syscall number */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+
+ /* [10-16] Check against permitted syscalls */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
+ 7, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg,
+ 6, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close,
+ 5, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
+ 4, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
+ 3, 0),
+#ifdef __i386__
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
+ 2, 0),
+#else
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
+ 2, 0),
+#endif
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
+ 1, 0),
+
+ /* [17] Not one of the permitted syscalls */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+
+ /* [18] Permitted call for the stub */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = sizeof(filter) / sizeof(filter[0]),
+ .filter = filter,
+ };
+
+ if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
+ SECCOMP_FILTER_FLAG_TSYNC,
+ (unsigned long)&prog) != 0)
+ stub_syscall1(__NR_exit, 21);
+
+ /* Fall through, the exit syscall will cause SIGSYS */
+ } else {
+ stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
+
+ stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
+ }
+
+ stub_syscall1(__NR_exit, 30);
+
+ __builtin_unreachable();
+}
+
+__attribute__((naked)) void _start(void)
+{
+ /*
+ * Since the stack after exec() starts at the top-most address,
+ * but that's exactly where we also want to map the stub data
+ * and code, this must:
+ * - push the stack by 1 code and STUB_DATA_PAGES data pages
+ * - call real_init()
+ * This way, real_init() can use the stack normally, while the
+ * original stack further down (higher address) will become
+ * inaccessible after the mmap() calls above.
+ */
+ stub_start(real_init);
+}
diff --git a/arch/um/kernel/skas/stub_exe_embed.S b/arch/um/kernel/skas/stub_exe_embed.S
new file mode 100644
index 000000000000..6d8914fbe8f1
--- /dev/null
+++ b/arch/um/kernel/skas/stub_exe_embed.S
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/linkage.h>
+
+__INITDATA
+
+SYM_DATA_START(stub_exe_start)
+ .incbin "arch/um/kernel/skas/stub_exe"
+SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end)
+
+__FINIT
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index c0681e097432..ba7494f9bfe4 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -1,40 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/kernel.h>
#include <linux/ptrace.h>
+#include <linux/seccomp.h>
#include <kern_util.h>
#include <sysdep/ptrace.h>
-#include <sysdep/syscalls.h>
-
-extern int syscall_table_size;
-#define NR_SYSCALLS (syscall_table_size / sizeof(void *))
+#include <sysdep/ptrace_user.h>
+#include <linux/time-internal.h>
+#include <asm/syscall.h>
+#include <asm/unistd.h>
+#include <asm/delay.h>
void handle_syscall(struct uml_pt_regs *r)
{
struct pt_regs *regs = container_of(r, struct pt_regs, regs);
- long result;
int syscall;
- syscall_trace_enter(regs);
+ /* Initialize the syscall number and default return value. */
+ UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp);
+ PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS);
+
+ if (syscall_trace_enter(regs))
+ goto out;
+
+ /* Do the seccomp check after ptrace; failures should be fast. */
+ if (secure_computing() == -1)
+ goto out;
+
+ syscall = UPT_SYSCALL_NR(r);
/*
- * This should go in the declaration of syscall, but when I do that,
- * strace -f -c bash -c 'ls ; ls' breaks, sometimes not tracing
- * children at all, sometimes hanging when bash doesn't see the first
- * ls exit.
- * The assembly looks functionally the same to me. This is
- * gcc version 4.0.1 20050727 (Red Hat 4.0.1-5)
- * in case it's a compiler bug.
+ * If no time passes, then sched_yield may not actually yield, causing
+ * broken spinlock implementations in userspace (ASAN) to hang for long
+ * periods of time.
*/
- syscall = UPT_SYSCALL_NR(r);
- if ((syscall >= NR_SYSCALLS) || (syscall < 0))
- result = -ENOSYS;
- else result = EXECUTE_SYSCALL(syscall, regs);
+ if ((time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL) &&
+ syscall == __NR_sched_yield)
+ tt_extra_sched_jiffies += 1;
+
+ if (syscall >= 0 && syscall < __NR_syscalls) {
+ unsigned long ret;
+
+ ret = (*sys_call_table[syscall])(UPT_SYSCALL_ARG1(&regs->regs),
+ UPT_SYSCALL_ARG2(&regs->regs),
+ UPT_SYSCALL_ARG3(&regs->regs),
+ UPT_SYSCALL_ARG4(&regs->regs),
+ UPT_SYSCALL_ARG5(&regs->regs),
+ UPT_SYSCALL_ARG6(&regs->regs));
+
+ PT_REGS_SET_SYSCALL_RETURN(regs, ret);
- PT_REGS_SET_SYSCALL_RETURN(regs, result);
+ /*
+ * An error value here can be some form of -ERESTARTSYS
+ * and then we'd just loop. Make any error syscalls take
+ * some time, so that it won't just loop if something is
+ * not ready, and hopefully other things will make some
+ * progress.
+ */
+ if (IS_ERR_VALUE(ret) &&
+ (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)) {
+ um_udelay(1);
+ schedule();
+ }
+ }
+out:
syscall_trace_leave(regs);
}
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index 1d3e0c17340b..198269e384c4 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/err.h>
@@ -10,13 +10,14 @@
#include <linux/sched.h>
#include <asm/current.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <kern_util.h>
+#include <asm/futex.h>
#include <os.h>
pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -27,7 +28,11 @@ pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
if (!pgd_present(*pgd))
return NULL;
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, addr);
if (!pud_present(*pud))
return NULL;
@@ -59,38 +64,38 @@ static pte_t *maybe_map(unsigned long virt, int is_write)
static int do_op_one_page(unsigned long addr, int len, int is_write,
int (*op)(unsigned long addr, int len, void *arg), void *arg)
{
- jmp_buf buf;
struct page *page;
pte_t *pte;
- int n, faulted;
+ int n;
pte = maybe_map(addr, is_write);
if (pte == NULL)
return -1;
page = pte_page(*pte);
+#ifdef CONFIG_64BIT
+ pagefault_disable();
+ addr = (unsigned long) page_address(page) +
+ (addr & ~PAGE_MASK);
+#else
addr = (unsigned long) kmap_atomic(page) +
(addr & ~PAGE_MASK);
+#endif
+ n = (*op)(addr, len, arg);
- current->thread.fault_catcher = &buf;
-
- faulted = UML_SETJMP(&buf);
- if (faulted == 0)
- n = (*op)(addr, len, arg);
- else
- n = -1;
-
- current->thread.fault_catcher = NULL;
-
+#ifdef CONFIG_64BIT
+ pagefault_enable();
+#else
kunmap_atomic((void *)addr);
+#endif
return n;
}
-static int buffer_op(unsigned long addr, int len, int is_write,
- int (*op)(unsigned long, int, void *), void *arg)
+static long buffer_op(unsigned long addr, int len, int is_write,
+ int (*op)(unsigned long, int, void *), void *arg)
{
- int size, remain, n;
+ long size, remain, n;
size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
remain = len;
@@ -139,18 +144,11 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg)
return 0;
}
-int copy_from_user(void *to, const void __user *from, int n)
+unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n)
{
- if (segment_eq(get_fs(), KERNEL_DS)) {
- memcpy(to, (__force void*)from, n);
- return 0;
- }
-
- return access_ok(VERIFY_READ, from, n) ?
- buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to):
- n;
+ return buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to);
}
-EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(raw_copy_from_user);
static int copy_chunk_to_user(unsigned long to, int len, void *arg)
{
@@ -161,18 +159,11 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg)
return 0;
}
-int copy_to_user(void __user *to, const void *from, int n)
+unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n)
{
- if (segment_eq(get_fs(), KERNEL_DS)) {
- memcpy((__force void *) to, from, n);
- return 0;
- }
-
- return access_ok(VERIFY_WRITE, to, n) ?
- buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from) :
- n;
+ return buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from);
}
-EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(raw_copy_to_user);
static int strncpy_chunk_from_user(unsigned long from, int len, void *arg)
{
@@ -188,19 +179,13 @@ static int strncpy_chunk_from_user(unsigned long from, int len, void *arg)
return 0;
}
-int strncpy_from_user(char *dst, const char __user *src, int count)
+long strncpy_from_user(char *dst, const char __user *src, long count)
{
- int n;
+ long n;
char *ptr = dst;
- if (segment_eq(get_fs(), KERNEL_DS)) {
- strncpy(dst, (__force void *) src, count);
- return strnlen(dst, count);
- }
-
- if (!access_ok(VERIFY_READ, src, 1))
+ if (!access_ok(src, 1))
return -EFAULT;
-
n = buffer_op((unsigned long) src, count, 0, strncpy_chunk_from_user,
&ptr);
if (n != 0)
@@ -215,22 +200,11 @@ static int clear_chunk(unsigned long addr, int len, void *unused)
return 0;
}
-int __clear_user(void __user *mem, int len)
+unsigned long __clear_user(void __user *mem, unsigned long len)
{
return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL);
}
-
-int clear_user(void __user *mem, int len)
-{
- if (segment_eq(get_fs(), KERNEL_DS)) {
- memset((__force void*)mem, 0, len);
- return 0;
- }
-
- return access_ok(VERIFY_WRITE, mem, len) ?
- buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL) : len;
-}
-EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__clear_user);
static int strnlen_chunk(unsigned long str, int len, void *arg)
{
@@ -244,16 +218,151 @@ static int strnlen_chunk(unsigned long str, int len, void *arg)
return 0;
}
-int strnlen_user(const void __user *str, int len)
+long strnlen_user(const char __user *str, long len)
{
int count = 0, n;
- if (segment_eq(get_fs(), KERNEL_DS))
- return strnlen((__force char*)str, len) + 1;
-
+ if (!access_ok(str, 1))
+ return -EFAULT;
n = buffer_op((unsigned long) str, len, 0, strnlen_chunk, &count);
if (n == 0)
return count + 1;
- return -EFAULT;
+ return 0;
}
EXPORT_SYMBOL(strnlen_user);
+
+/**
+ * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant
+ * argument and comparison of the previous
+ * futex value with another constant.
+ *
+ * @op: operation to execute
+ * @oparg: argument to operation
+ * @oval: old value at uaddr
+ * @uaddr: pointer to user space address
+ *
+ * Return:
+ * 0 - On success
+ * -EFAULT - User access resulted in a page fault
+ * -EAGAIN - Atomic operation was unable to complete due to contention
+ * -ENOSYS - Operation not supported
+ */
+
+int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
+{
+ int oldval, ret;
+ struct page *page;
+ unsigned long addr = (unsigned long) uaddr;
+ pte_t *pte;
+
+ ret = -EFAULT;
+ if (!access_ok(uaddr, sizeof(*uaddr)))
+ return -EFAULT;
+ preempt_disable();
+ pte = maybe_map(addr, 1);
+ if (pte == NULL)
+ goto out_inuser;
+
+ page = pte_page(*pte);
+#ifdef CONFIG_64BIT
+ pagefault_disable();
+ addr = (unsigned long) page_address(page) +
+ (((unsigned long) addr) & ~PAGE_MASK);
+#else
+ addr = (unsigned long) kmap_atomic(page) +
+ ((unsigned long) addr & ~PAGE_MASK);
+#endif
+ uaddr = (u32 *) addr;
+ oldval = *uaddr;
+
+ ret = 0;
+
+ switch (op) {
+ case FUTEX_OP_SET:
+ *uaddr = oparg;
+ break;
+ case FUTEX_OP_ADD:
+ *uaddr += oparg;
+ break;
+ case FUTEX_OP_OR:
+ *uaddr |= oparg;
+ break;
+ case FUTEX_OP_ANDN:
+ *uaddr &= ~oparg;
+ break;
+ case FUTEX_OP_XOR:
+ *uaddr ^= oparg;
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+#ifdef CONFIG_64BIT
+ pagefault_enable();
+#else
+ kunmap_atomic((void *)addr);
+#endif
+
+out_inuser:
+ preempt_enable();
+
+ if (ret == 0)
+ *oval = oldval;
+
+ return ret;
+}
+EXPORT_SYMBOL(arch_futex_atomic_op_inuser);
+
+/**
+ * futex_atomic_cmpxchg_inatomic() - Compare and exchange the content of the
+ * uaddr with newval if the current value is
+ * oldval.
+ * @uval: pointer to store content of @uaddr
+ * @uaddr: pointer to user space address
+ * @oldval: old value
+ * @newval: new value to store to @uaddr
+ *
+ * Return:
+ * 0 - On success
+ * -EFAULT - User access resulted in a page fault
+ * -EAGAIN - Atomic operation was unable to complete due to contention
+ */
+
+int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
+{
+ struct page *page;
+ pte_t *pte;
+ int ret = -EFAULT;
+
+ if (!access_ok(uaddr, sizeof(*uaddr)))
+ return -EFAULT;
+
+ preempt_disable();
+ pte = maybe_map((unsigned long) uaddr, 1);
+ if (pte == NULL)
+ goto out_inatomic;
+
+ page = pte_page(*pte);
+#ifdef CONFIG_64BIT
+ pagefault_disable();
+ uaddr = page_address(page) + (((unsigned long) uaddr) & ~PAGE_MASK);
+#else
+ uaddr = kmap_atomic(page) + ((unsigned long) uaddr & ~PAGE_MASK);
+#endif
+
+ *uval = *uaddr;
+
+ ret = cmpxchg(uaddr, oldval, newval);
+
+#ifdef CONFIG_64BIT
+ pagefault_enable();
+#else
+ kunmap_atomic(uaddr);
+#endif
+ ret = 0;
+
+out_inatomic:
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(futex_atomic_cmpxchg_inatomic);
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 5c8c3ea7db7b..f1e52b7348fb 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -1,238 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie <tiwei.btw@antgroup.com>
+ *
+ * Based on the previous implementation in TT mode
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
-#include <linux/percpu.h>
-#include <asm/pgalloc.h>
-#include <asm/tlb.h>
-
-#ifdef CONFIG_SMP
-
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/module.h>
+#include <linux/processor.h>
#include <linux/threads.h>
-#include <linux/interrupt.h>
-#include <linux/err.h>
+#include <linux/cpu.h>
#include <linux/hardirq.h>
-#include <asm/smp.h>
-#include <asm/processor.h>
-#include <asm/spinlock.h>
+#include <linux/smp.h>
+#include <linux/smp-internal.h>
+#include <init.h>
#include <kern.h>
-#include <irq_user.h>
#include <os.h>
+#include <smp.h>
-/* Per CPU bogomips and other parameters
- * The only piece used here is the ipi pipe, which is set before SMP is
- * started and never changed.
- */
-struct cpuinfo_um cpu_data[NR_CPUS];
+enum {
+ UML_IPI_RES = 0,
+ UML_IPI_CALL_SINGLE,
+ UML_IPI_CALL,
+ UML_IPI_STOP,
+};
-/* A statistic, can be a little off */
-int num_reschedules_sent = 0;
+void arch_smp_send_reschedule(int cpu)
+{
+ os_send_ipi(cpu, UML_IPI_RES);
+}
-/* Not changed after boot */
-struct task_struct *idle_threads[NR_CPUS];
+void arch_send_call_function_single_ipi(int cpu)
+{
+ os_send_ipi(cpu, UML_IPI_CALL_SINGLE);
+}
-void smp_send_reschedule(int cpu)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
- os_write_file(cpu_data[cpu].ipi_pipe[1], "R", 1);
- num_reschedules_sent++;
+ int cpu;
+
+ for_each_cpu(cpu, mask)
+ os_send_ipi(cpu, UML_IPI_CALL);
}
void smp_send_stop(void)
{
- int i;
+ int cpu, me = smp_processor_id();
- printk(KERN_INFO "Stopping all CPUs...");
- for (i = 0; i < num_online_cpus(); i++) {
- if (i == current_thread->cpu)
+ for_each_online_cpu(cpu) {
+ if (cpu == me)
continue;
- os_write_file(cpu_data[i].ipi_pipe[1], "S", 1);
+ os_send_ipi(cpu, UML_IPI_STOP);
}
- printk(KERN_CONT "done\n");
}
-static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
-static cpumask_t cpu_callin_map = CPU_MASK_NONE;
+static void ipi_handler(int vector, struct uml_pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs((struct pt_regs *)regs);
+ int cpu = raw_smp_processor_id();
+
+ irq_enter();
+
+ if (current->mm)
+ os_alarm_process(current->mm->context.id.pid);
+
+ switch (vector) {
+ case UML_IPI_RES:
+ inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
+ break;
+
+ case UML_IPI_CALL_SINGLE:
+ inc_irq_stat(irq_call_count);
+ generic_smp_call_function_single_interrupt();
+ break;
+
+ case UML_IPI_CALL:
+ inc_irq_stat(irq_call_count);
+ generic_smp_call_function_interrupt();
+ break;
+
+ case UML_IPI_STOP:
+ set_cpu_online(cpu, false);
+ while (1)
+ pause();
+ break;
+
+ default:
+ pr_err("CPU#%d received unknown IPI (vector=%d)!\n", cpu, vector);
+ break;
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+}
-static int idle_proc(void *cpup)
+void uml_ipi_handler(int vector)
{
- int cpu = (int) cpup, err;
+ struct uml_pt_regs r = { .is_user = 0 };
- err = os_pipe(cpu_data[cpu].ipi_pipe, 1, 1);
- if (err < 0)
- panic("CPU#%d failed to create IPI pipe, err = %d", cpu, -err);
+ preempt_disable();
+ ipi_handler(vector, &r);
+ preempt_enable();
+}
- os_set_fd_async(cpu_data[cpu].ipi_pipe[0]);
+/* AP states used only during CPU startup */
+enum {
+ UML_CPU_PAUSED = 0,
+ UML_CPU_RUNNING,
+};
- wmb();
- if (cpu_test_and_set(cpu, cpu_callin_map)) {
- printk(KERN_ERR "huh, CPU#%d already present??\n", cpu);
- BUG();
- }
+static int cpu_states[NR_CPUS];
- while (!cpu_isset(cpu, smp_commenced_mask))
- cpu_relax();
+static int start_secondary(void *unused)
+{
+ int err, cpu = raw_smp_processor_id();
notify_cpu_starting(cpu);
set_cpu_online(cpu, true);
- default_idle();
- return 0;
-}
-static struct task_struct *idle_thread(int cpu)
-{
- struct task_struct *new_task;
-
- current->thread.request.u.thread.proc = idle_proc;
- current->thread.request.u.thread.arg = (void *) cpu;
- new_task = fork_idle(cpu);
- if (IS_ERR(new_task))
- panic("copy_process failed in idle_thread, error = %ld",
- PTR_ERR(new_task));
-
- cpu_tasks[cpu] = ((struct cpu_task)
- { .pid = new_task->thread.mode.tt.extern_pid,
- .task = new_task } );
- idle_threads[cpu] = new_task;
- panic("skas mode doesn't support SMP");
- return new_task;
-}
+ err = um_setup_timer();
+ if (err)
+ panic("CPU#%d failed to setup timer, err = %d", cpu, err);
-void smp_prepare_cpus(unsigned int maxcpus)
-{
- struct task_struct *idle;
- unsigned long waittime;
- int err, cpu, me = smp_processor_id();
- int i;
+ local_irq_enable();
- for (i = 0; i < ncpus; ++i)
- set_cpu_possible(i, true);
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
- set_cpu_online(me, true);
- cpu_set(me, cpu_callin_map);
+ return 0;
+}
- err = os_pipe(cpu_data[me].ipi_pipe, 1, 1);
- if (err < 0)
- panic("CPU#0 failed to create IPI pipe, errno = %d", -err);
+void uml_start_secondary(void *opaque)
+{
+ int cpu = raw_smp_processor_id();
+ struct mm_struct *mm = &init_mm;
+ struct task_struct *idle;
- os_set_fd_async(cpu_data[me].ipi_pipe[0]);
+ stack_protections((unsigned long) &cpu_irqstacks[cpu]);
+ set_sigstack(&cpu_irqstacks[cpu], THREAD_SIZE);
- for (cpu = 1; cpu < ncpus; cpu++) {
- printk(KERN_INFO "Booting processor %d...\n", cpu);
+ set_cpu_present(cpu, true);
+ os_futex_wait(&cpu_states[cpu], UML_CPU_PAUSED);
- idle = idle_thread(cpu);
+ smp_rmb(); /* paired with smp_wmb() in __cpu_up() */
- init_idle(idle, cpu);
+ idle = cpu_tasks[cpu];
+ idle->thread_info.cpu = cpu;
- waittime = 200000000;
- while (waittime-- && !cpu_isset(cpu, cpu_callin_map))
- cpu_relax();
+ mmgrab(mm);
+ idle->active_mm = mm;
- printk(KERN_INFO "%s\n",
- cpu_isset(cpu, cpu_calling_map) ? "done" : "failed");
- }
-}
+ idle->thread.request.thread.proc = start_secondary;
+ idle->thread.request.thread.arg = NULL;
-void smp_prepare_boot_cpu(void)
-{
- set_cpu_online(smp_processor_id(), true);
+ new_thread(task_stack_page(idle), &idle->thread.switch_buf,
+ new_thread_handler);
+ os_start_secondary(opaque, &idle->thread.switch_buf);
}
-int __cpu_up(unsigned int cpu, struct task_struct *tidle)
+void __init smp_prepare_cpus(unsigned int max_cpus)
{
- cpu_set(cpu, smp_commenced_mask);
- while (!cpu_online(cpu))
- mb();
- return 0;
-}
+ int err, cpu, me = smp_processor_id();
+ unsigned long deadline;
-int setup_profiling_timer(unsigned int multiplier)
-{
- printk(KERN_INFO "setup_profiling_timer\n");
- return 0;
-}
+ os_init_smp();
-void smp_call_function_slave(int cpu);
+ for_each_possible_cpu(cpu) {
+ if (cpu == me)
+ continue;
-void IPI_handler(int cpu)
-{
- unsigned char c;
- int fd;
-
- fd = cpu_data[cpu].ipi_pipe[0];
- while (os_read_file(fd, &c, 1) == 1) {
- switch (c) {
- case 'C':
- smp_call_function_slave(cpu);
- break;
-
- case 'R':
- scheduler_ipi();
- break;
-
- case 'S':
- printk(KERN_INFO "CPU#%d stopping\n", cpu);
- while (1)
- pause();
- break;
-
- default:
- printk(KERN_ERR "CPU#%d received unknown IPI [%c]!\n",
- cpu, c);
- break;
+ pr_debug("Booting processor %d...\n", cpu);
+ err = os_start_cpu_thread(cpu);
+ if (err) {
+ pr_crit("CPU#%d failed to start cpu thread, err = %d",
+ cpu, err);
+ continue;
}
+
+ deadline = jiffies + msecs_to_jiffies(1000);
+ spin_until_cond(cpu_present(cpu) ||
+ time_is_before_jiffies(deadline));
+
+ if (!cpu_present(cpu))
+ pr_crit("CPU#%d failed to boot\n", cpu);
}
}
-int hard_smp_processor_id(void)
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
- return pid_to_processor_id(os_getpid());
-}
+ cpu_tasks[cpu] = tidle;
+ smp_wmb(); /* paired with smp_rmb() in uml_start_secondary() */
+ cpu_states[cpu] = UML_CPU_RUNNING;
+ os_futex_wake(&cpu_states[cpu]);
+ spin_until_cond(cpu_online(cpu));
-static DEFINE_SPINLOCK(call_lock);
-static atomic_t scf_started;
-static atomic_t scf_finished;
-static void (*func)(void *info);
-static void *info;
-
-void smp_call_function_slave(int cpu)
-{
- atomic_inc(&scf_started);
- (*func)(info);
- atomic_inc(&scf_finished);
+ return 0;
}
-int smp_call_function(void (*_func)(void *info), void *_info, int wait)
+void __init smp_cpus_done(unsigned int max_cpus)
{
- int cpus = num_online_cpus() - 1;
- int i;
+}
- if (!cpus)
- return 0;
+/* Set in uml_ncpus_setup */
+int uml_ncpus = 1;
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
+void __init prefill_possible_map(void)
+{
+ int cpu;
- spin_lock_bh(&call_lock);
- atomic_set(&scf_started, 0);
- atomic_set(&scf_finished, 0);
- func = _func;
- info = _info;
+ for (cpu = 0; cpu < uml_ncpus; cpu++)
+ set_cpu_possible(cpu, true);
+ for (; cpu < NR_CPUS; cpu++)
+ set_cpu_possible(cpu, false);
+}
- for_each_online_cpu(i)
- os_write_file(cpu_data[i].ipi_pipe[1], "C", 1);
+static int __init uml_ncpus_setup(char *line, int *add)
+{
+ *add = 0;
- while (atomic_read(&scf_started) != cpus)
- barrier();
+ if (kstrtoint(line, 10, &uml_ncpus)) {
+ os_warn("%s: Couldn't parse '%s'\n", __func__, line);
+ return -1;
+ }
- if (wait)
- while (atomic_read(&scf_finished) != cpus)
- barrier();
+ uml_ncpus = clamp(uml_ncpus, 1, NR_CPUS);
- spin_unlock_bh(&call_lock);
return 0;
}
-#endif
+__uml_setup("ncpus=", uml_ncpus_setup,
+"ncpus=<# of desired CPUs>\n"
+" This tells UML how many virtual processors to start. The maximum\n"
+" number of supported virtual processors can be obtained by querying\n"
+" the CONFIG_NR_CPUS option using --showconfig.\n\n"
+);
+
+EXPORT_SYMBOL(uml_curr_cpu);
diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c
new file mode 100644
index 000000000000..fd3b61b3d4d2
--- /dev/null
+++ b/arch/um/kernel/stacktrace.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2013 Richard Weinberger <richard@nod.at>
+ * Copyright (C) 2014 Google Inc., Author: Daniel Walter <dwalter@google.com>
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <asm/stacktrace.h>
+
+void dump_trace(struct task_struct *tsk,
+ const struct stacktrace_ops *ops,
+ void *data)
+{
+ int reliable = 0;
+ unsigned long *sp, bp, addr;
+ struct pt_regs *segv_regs = tsk->thread.segv_regs;
+ struct stack_frame *frame;
+
+ bp = get_frame_pointer(tsk, segv_regs);
+ sp = get_stack_pointer(tsk, segv_regs);
+
+ frame = (struct stack_frame *)bp;
+ while (((long) sp & (THREAD_SIZE-1)) != 0) {
+ addr = READ_ONCE_NOCHECK(*sp);
+ if (__kernel_text_address(addr)) {
+ reliable = 0;
+ if ((unsigned long) sp == bp + sizeof(long)) {
+ frame = frame ? frame->next_frame : NULL;
+ bp = (unsigned long)frame;
+ reliable = 1;
+ }
+ ops->address(data, addr, reliable);
+ }
+ sp++;
+ }
+}
+
+static void save_addr(void *data, unsigned long address, int reliable)
+{
+ struct stack_trace *trace = data;
+
+ if (!reliable)
+ return;
+ if (trace->nr_entries >= trace->max_entries)
+ return;
+
+ trace->entries[trace->nr_entries++] = address;
+}
+
+static const struct stacktrace_ops dump_ops = {
+ .address = save_addr
+};
+
+static void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace)
+{
+ dump_trace(tsk, &dump_ops, trace);
+}
+
+void save_stack_trace(struct stack_trace *trace)
+{
+ __save_stack_trace(current, trace);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+ __save_stack_trace(tsk, trace);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c
deleted file mode 100644
index c1d0ae069b53..000000000000
--- a/arch/um/kernel/syscall.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/utsname.h>
-#include <linux/syscalls.h>
-#include <asm/current.h>
-#include <asm/mman.h>
-#include <asm/uaccess.h>
-#include <asm/unistd.h>
-
-long old_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long offset)
-{
- long err = -EINVAL;
- if (offset & ~PAGE_MASK)
- goto out;
-
- err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT);
- out:
- return err;
-}
diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c
index 0dc4d1c6f98a..13ee5666668d 100644
--- a/arch/um/kernel/sysrq.c
+++ b/arch/um/kernel/sysrq.c
@@ -1,66 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
+ * Copyright (C) 2013 Richard Weinberger <richrd@nod.at>
*/
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
-#include <asm/sysrq.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
-/* Catch non-i386 SUBARCH's. */
-#if !defined(CONFIG_UML_X86) || defined(CONFIG_64BIT)
-void show_trace(struct task_struct *task, unsigned long * stack)
-{
- unsigned long addr;
+#include <asm/stacktrace.h>
+#include <os.h>
- if (!stack) {
- stack = (unsigned long*) &stack;
- WARN_ON(1);
- }
+static void _print_addr(void *data, unsigned long address, int reliable)
+{
+ const char *loglvl = data;
- printk(KERN_INFO "Call Trace: \n");
- while (((long) stack & (THREAD_SIZE-1)) != 0) {
- addr = *stack;
- if (__kernel_text_address(addr)) {
- printk(KERN_INFO "%08lx: [<%08lx>]",
- (unsigned long) stack, addr);
- print_symbol(KERN_CONT " %s", addr);
- printk(KERN_CONT "\n");
- }
- stack++;
- }
- printk(KERN_INFO "\n");
+ printk("%s [<%08lx>] %s%pS\n", loglvl, address, reliable ? "" : "? ",
+ (void *)address);
}
-#endif
-/*Stolen from arch/i386/kernel/traps.c */
-static const int kstack_depth_to_print = 24;
+static const struct stacktrace_ops stackops = {
+ .address = _print_addr
+};
-/* This recently started being used in arch-independent code too, as in
- * kernel/sched/core.c.*/
-void show_stack(struct task_struct *task, unsigned long *esp)
+void show_stack(struct task_struct *task, unsigned long *stack,
+ const char *loglvl)
{
- unsigned long *stack;
+ struct pt_regs *segv_regs = current->thread.segv_regs;
int i;
- if (esp == NULL) {
- if (task != current && task != NULL) {
- esp = (unsigned long *) KSTK_ESP(task);
- } else {
- esp = (unsigned long *) &esp;
- }
- }
+ if (!stack)
+ stack = get_stack_pointer(task, segv_regs);
- stack = esp;
- for (i = 0; i < kstack_depth_to_print; i++) {
+ printk("%sStack:\n", loglvl);
+ for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) {
if (kstack_end(stack))
break;
- if (i && ((i % 8) == 0))
- printk(KERN_INFO " ");
- printk(KERN_CONT "%08lx ", *stack++);
+ if (i && ((i % STACKSLOTS_PER_LINE) == 0))
+ pr_cont("\n");
+ pr_cont(" %08lx", READ_ONCE_NOCHECK(*stack));
+ stack++;
}
- show_trace(task, esp);
+ printk("%sCall Trace:\n", loglvl);
+ dump_trace(task ?: current, &stackops, (void *)loglvl);
}
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 117568d4f64a..b344a36b44eb 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -1,115 +1,1092 @@
+// SPDX-License-Identifier: GPL-2.0
/*
+ * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk})
+ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
+ * Copyright (C) 2012-2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
+ * Copyright (C) 2019 Intel Corporation
*/
#include <linux/clockchips.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
#include <linux/threads.h>
#include <asm/irq.h>
#include <asm/param.h>
#include <kern_util.h>
#include <os.h>
+#include <linux/delay.h>
+#include <linux/time-internal.h>
+#include <linux/um_timetravel.h>
+#include <shared/init.h>
+
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+#include <linux/sched/clock.h>
+
+enum time_travel_mode time_travel_mode;
+EXPORT_SYMBOL_GPL(time_travel_mode);
+
+static bool time_travel_start_set;
+static unsigned long long time_travel_start;
+static unsigned long long time_travel_time;
+static unsigned long long time_travel_shm_offset;
+static LIST_HEAD(time_travel_events);
+static LIST_HEAD(time_travel_irqs);
+static unsigned long long time_travel_timer_interval;
+static unsigned long long time_travel_next_event;
+static struct time_travel_event time_travel_timer_event;
+static int time_travel_ext_fd = -1;
+static unsigned int time_travel_ext_waiting;
+static bool time_travel_ext_prev_request_valid;
+static unsigned long long time_travel_ext_prev_request;
+static unsigned long long *time_travel_ext_free_until;
+static unsigned long long _time_travel_ext_free_until;
+static u16 time_travel_shm_id;
+static struct um_timetravel_schedshm *time_travel_shm;
+static union um_timetravel_schedshm_client *time_travel_shm_client;
+
+unsigned long tt_extra_sched_jiffies;
+
+notrace unsigned long long sched_clock(void)
+{
+ return (unsigned long long)(jiffies - INITIAL_JIFFIES +
+ tt_extra_sched_jiffies)
+ * (NSEC_PER_SEC / HZ);
+}
+
+static void time_travel_set_time(unsigned long long ns)
+{
+ if (unlikely(ns < time_travel_time))
+ panic("time-travel: time goes backwards %lld -> %lld\n",
+ time_travel_time, ns);
+ else if (unlikely(ns >= S64_MAX))
+ panic("The system was going to sleep forever, aborting");
+
+ time_travel_time = ns;
+}
+
+enum time_travel_message_handling {
+ TTMH_IDLE,
+ TTMH_POLL,
+ TTMH_READ,
+ TTMH_READ_START_ACK,
+};
+
+static u64 bc_message;
+int time_travel_should_print_bc_msg;
+
+void _time_travel_print_bc_msg(void)
+{
+ time_travel_should_print_bc_msg = 0;
+ printk(KERN_INFO "time-travel: received broadcast 0x%llx\n", bc_message);
+}
+
+static void time_travel_setup_shm(int fd, u16 id)
+{
+ u32 len;
+
+ time_travel_shm = os_mmap_rw_shared(fd, sizeof(*time_travel_shm));
+
+ if (!time_travel_shm)
+ goto out;
+
+ len = time_travel_shm->len;
+
+ if (time_travel_shm->version != UM_TIMETRAVEL_SCHEDSHM_VERSION ||
+ len < struct_size(time_travel_shm, clients, id + 1)) {
+ os_unmap_memory(time_travel_shm, sizeof(*time_travel_shm));
+ time_travel_shm = NULL;
+ goto out;
+ }
+
+ time_travel_shm = os_mremap_rw_shared(time_travel_shm,
+ sizeof(*time_travel_shm),
+ len);
+ if (!time_travel_shm)
+ goto out;
+
+ time_travel_shm_offset = time_travel_shm->current_time;
+ time_travel_shm_client = &time_travel_shm->clients[id];
+ time_travel_shm_client->capa |= UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE;
+ time_travel_shm_id = id;
+ /* always look at that free_until from now on */
+ time_travel_ext_free_until = &time_travel_shm->free_until;
+out:
+ os_close_file(fd);
+}
+
+static void time_travel_handle_message(struct um_timetravel_msg *msg,
+ enum time_travel_message_handling mode)
+{
+ struct um_timetravel_msg resp = {
+ .op = UM_TIMETRAVEL_ACK,
+ };
+ int ret;
+
+ /*
+ * We can't unlock here, but interrupt signals with a timetravel_handler
+ * (see um_request_irq_tt) get to the timetravel_handler anyway.
+ */
+ if (mode != TTMH_READ) {
+ BUG_ON(mode == TTMH_IDLE && !irqs_disabled());
+
+ while (os_poll(1, &time_travel_ext_fd) != 0) {
+ /* nothing */
+ }
+ }
+
+ if (unlikely(mode == TTMH_READ_START_ACK)) {
+ int fd[UM_TIMETRAVEL_SHARED_MAX_FDS];
+
+ ret = os_rcv_fd_msg(time_travel_ext_fd, fd,
+ ARRAY_SIZE(fd), msg, sizeof(*msg));
+ if (ret == sizeof(*msg)) {
+ time_travel_setup_shm(fd[UM_TIMETRAVEL_SHARED_MEMFD],
+ msg->time & UM_TIMETRAVEL_START_ACK_ID);
+ /* we don't use the logging for now */
+ os_close_file(fd[UM_TIMETRAVEL_SHARED_LOGFD]);
+ }
+ } else {
+ ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg));
+ }
+
+ if (ret == 0)
+ panic("time-travel external link is broken\n");
+ if (ret != sizeof(*msg))
+ panic("invalid time-travel message - %d bytes\n", ret);
+
+ switch (msg->op) {
+ default:
+ WARN_ONCE(1, "time-travel: unexpected message %lld\n",
+ (unsigned long long)msg->op);
+ break;
+ case UM_TIMETRAVEL_ACK:
+ return;
+ case UM_TIMETRAVEL_RUN:
+ time_travel_set_time(msg->time);
+ if (time_travel_shm) {
+ /* no request right now since we're running */
+ time_travel_shm_client->flags &=
+ ~UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN;
+ /* no ack for shared memory RUN */
+ return;
+ }
+ break;
+ case UM_TIMETRAVEL_FREE_UNTIL:
+ /* not supposed to get this with shm, but ignore it */
+ if (time_travel_shm)
+ break;
+ time_travel_ext_free_until = &_time_travel_ext_free_until;
+ _time_travel_ext_free_until = msg->time;
+ break;
+ case UM_TIMETRAVEL_BROADCAST:
+ bc_message = msg->time;
+ time_travel_should_print_bc_msg = 1;
+ break;
+ }
+
+ resp.seq = msg->seq;
+ os_write_file(time_travel_ext_fd, &resp, sizeof(resp));
+}
+
+static u64 time_travel_ext_req(u32 op, u64 time)
+{
+ static int seq;
+ int mseq = ++seq;
+ struct um_timetravel_msg msg = {
+ .op = op,
+ .time = time,
+ .seq = mseq,
+ };
+
+ /*
+ * We need to block even the timetravel handlers of SIGIO here and
+ * only restore their use when we got the ACK - otherwise we may
+ * (will) get interrupted by that, try to queue the IRQ for future
+ * processing and thus send another request while we're still waiting
+ * for an ACK, but the peer doesn't know we got interrupted and will
+ * send the ACKs in the same order as the message, but we'd need to
+ * see them in the opposite order ...
+ *
+ * This wouldn't matter *too* much, but some ACKs carry the
+ * current time (for UM_TIMETRAVEL_GET) and getting another
+ * ACK without a time would confuse us a lot!
+ *
+ * The sequence number assignment that happens here lets us
+ * debug such message handling issues more easily.
+ */
+ block_signals_hard();
+ os_write_file(time_travel_ext_fd, &msg, sizeof(msg));
+
+ /* no ACK expected for WAIT in shared memory mode */
+ if (msg.op == UM_TIMETRAVEL_WAIT && time_travel_shm)
+ goto done;
+
+ while (msg.op != UM_TIMETRAVEL_ACK)
+ time_travel_handle_message(&msg,
+ op == UM_TIMETRAVEL_START ?
+ TTMH_READ_START_ACK :
+ TTMH_READ);
+
+ if (msg.seq != mseq)
+ panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n",
+ msg.op, msg.seq, mseq, msg.time);
+
+ if (op == UM_TIMETRAVEL_GET)
+ time_travel_set_time(msg.time);
+done:
+ unblock_signals_hard();
+
+ return msg.time;
+}
+
+void __time_travel_wait_readable(int fd)
+{
+ int fds[2] = { fd, time_travel_ext_fd };
+ int ret;
+
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ return;
+
+ while ((ret = os_poll(2, fds))) {
+ struct um_timetravel_msg msg;
+
+ if (ret == 1)
+ time_travel_handle_message(&msg, TTMH_READ);
+ }
+}
+EXPORT_SYMBOL_GPL(__time_travel_wait_readable);
+
+static void time_travel_ext_update_request(unsigned long long time)
+{
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ return;
+
+ /* asked for exactly this time previously */
+ if (time_travel_ext_prev_request_valid &&
+ time == time_travel_ext_prev_request)
+ return;
+
+ /*
+ * if we're running and are allowed to run past the request
+ * then we don't need to update it either
+ *
+ * Note for shm we ignore FREE_UNTIL messages and leave the pointer
+ * to shared memory, and for non-shm the offset is 0.
+ */
+ if (!time_travel_ext_waiting && time_travel_ext_free_until &&
+ time < (*time_travel_ext_free_until - time_travel_shm_offset))
+ return;
+
+ time_travel_ext_prev_request = time;
+ time_travel_ext_prev_request_valid = true;
+
+ if (time_travel_shm) {
+ union um_timetravel_schedshm_client *running;
+
+ running = &time_travel_shm->clients[time_travel_shm->running_id];
+
+ if (running->capa & UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE) {
+ time_travel_shm_client->flags |=
+ UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN;
+ time += time_travel_shm_offset;
+ time_travel_shm_client->req_time = time;
+ if (time < time_travel_shm->free_until)
+ time_travel_shm->free_until = time;
+ return;
+ }
+ }
+
+ time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time);
+}
+
+void __time_travel_propagate_time(void)
+{
+ static unsigned long long last_propagated;
+
+ if (time_travel_shm) {
+ if (time_travel_shm->running_id != time_travel_shm_id)
+ panic("time-travel: setting time while not running\n");
+ time_travel_shm->current_time = time_travel_time +
+ time_travel_shm_offset;
+ return;
+ }
+
+ if (last_propagated == time_travel_time)
+ return;
+
+ time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time);
+ last_propagated = time_travel_time;
+}
+EXPORT_SYMBOL_GPL(__time_travel_propagate_time);
+
+/* returns true if we must do a wait to the simtime device */
+static bool time_travel_ext_request(unsigned long long time)
+{
+ /*
+ * If we received an external sync point ("free until") then we
+ * don't have to request/wait for anything until then, unless
+ * we're already waiting.
+ *
+ * Note for shm we ignore FREE_UNTIL messages and leave the pointer
+ * to shared memory, and for non-shm the offset is 0.
+ */
+ if (!time_travel_ext_waiting && time_travel_ext_free_until &&
+ time < (*time_travel_ext_free_until - time_travel_shm_offset))
+ return false;
+
+ time_travel_ext_update_request(time);
+ return true;
+}
+
+static void time_travel_ext_wait(bool idle)
+{
+ struct um_timetravel_msg msg = {
+ .op = UM_TIMETRAVEL_ACK,
+ };
+
+ time_travel_ext_prev_request_valid = false;
+ if (!time_travel_shm)
+ time_travel_ext_free_until = NULL;
+ time_travel_ext_waiting++;
+
+ time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1);
+
+ /*
+ * Here we are deep in the idle loop, so we have to break out of the
+ * kernel abstraction in a sense and implement this in terms of the
+ * UML system waiting on the VQ interrupt while sleeping, when we get
+ * the signal it'll call time_travel_ext_vq_notify_done() completing the
+ * call.
+ */
+ while (msg.op != UM_TIMETRAVEL_RUN)
+ time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL);
+
+ time_travel_ext_waiting--;
+
+ /* we might request more stuff while polling - reset when we run */
+ time_travel_ext_prev_request_valid = false;
+}
+
+static void time_travel_ext_get_time(void)
+{
+ if (time_travel_shm)
+ time_travel_set_time(time_travel_shm->current_time -
+ time_travel_shm_offset);
+ else
+ time_travel_ext_req(UM_TIMETRAVEL_GET, -1);
+}
+
+static void __time_travel_update_time(unsigned long long ns, bool idle)
+{
+ if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns))
+ time_travel_ext_wait(idle);
+ else
+ time_travel_set_time(ns);
+}
+
+static struct time_travel_event *time_travel_first_event(void)
+{
+ return list_first_entry_or_null(&time_travel_events,
+ struct time_travel_event,
+ list);
+}
+
+static void __time_travel_add_event(struct time_travel_event *e,
+ unsigned long long time)
+{
+ struct time_travel_event *tmp;
+ bool inserted = false;
+ unsigned long flags;
+
+ if (e->pending)
+ return;
+
+ e->pending = true;
+ e->time = time;
+
+ local_irq_save(flags);
+ list_for_each_entry(tmp, &time_travel_events, list) {
+ /*
+ * Add the new entry before one with higher time,
+ * or if they're equal and both on stack, because
+ * in that case we need to unwind the stack in the
+ * right order, and the later event (timer sleep
+ * or such) must be dequeued first.
+ */
+ if ((tmp->time > e->time) ||
+ (tmp->time == e->time && tmp->onstack && e->onstack)) {
+ list_add_tail(&e->list, &tmp->list);
+ inserted = true;
+ break;
+ }
+ }
+
+ if (!inserted)
+ list_add_tail(&e->list, &time_travel_events);
+
+ tmp = time_travel_first_event();
+ time_travel_ext_update_request(tmp->time);
+ time_travel_next_event = tmp->time;
+ local_irq_restore(flags);
+}
+
+static void time_travel_add_event(struct time_travel_event *e,
+ unsigned long long time)
+{
+ if (WARN_ON(!e->fn))
+ return;
+
+ __time_travel_add_event(e, time);
+}
+
+void time_travel_add_event_rel(struct time_travel_event *e,
+ unsigned long long delay_ns)
+{
+ time_travel_add_event(e, time_travel_time + delay_ns);
+}
+
+static void time_travel_periodic_timer(struct time_travel_event *e)
+{
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + time_travel_timer_interval);
+
+ /* clock tick; decrease extra jiffies by keeping sched_clock constant */
+ if (tt_extra_sched_jiffies > 0)
+ tt_extra_sched_jiffies -= 1;
+
+ deliver_alarm();
+}
+
+void deliver_time_travel_irqs(void)
+{
+ struct time_travel_event *e;
+ unsigned long flags;
+
+ /*
+ * Don't do anything for most cases. Note that because here we have
+ * to disable IRQs (and re-enable later) we'll actually recurse at
+ * the end of the function, so this is strictly necessary.
+ */
+ if (likely(list_empty(&time_travel_irqs)))
+ return;
+
+ local_irq_save(flags);
+ irq_enter();
+ while ((e = list_first_entry_or_null(&time_travel_irqs,
+ struct time_travel_event,
+ list))) {
+ list_del(&e->list);
+ e->pending = false;
+ e->fn(e);
+ }
+ irq_exit();
+ local_irq_restore(flags);
+}
+
+static void time_travel_deliver_event(struct time_travel_event *e)
+{
+ if (e == &time_travel_timer_event) {
+ /*
+ * deliver_alarm() does the irq_enter/irq_exit
+ * by itself, so must handle it specially here
+ */
+ e->fn(e);
+ } else if (irqs_disabled()) {
+ list_add_tail(&e->list, &time_travel_irqs);
+ /*
+ * set pending again, it was set to false when the
+ * event was deleted from the original list, but
+ * now it's still pending until we deliver the IRQ.
+ */
+ e->pending = true;
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ irq_enter();
+ e->fn(e);
+ irq_exit();
+ local_irq_restore(flags);
+ }
+}
+
+bool time_travel_del_event(struct time_travel_event *e)
+{
+ unsigned long flags;
+
+ if (!e->pending)
+ return false;
+ local_irq_save(flags);
+ list_del(&e->list);
+ e->pending = false;
+ local_irq_restore(flags);
+ return true;
+}
+
+static void time_travel_update_time(unsigned long long next, bool idle)
+{
+ struct time_travel_event ne = {
+ .onstack = true,
+ };
+ struct time_travel_event *e;
+ bool finished = idle;
+
+ /* add it without a handler - we deal with that specifically below */
+ __time_travel_add_event(&ne, next);
+
+ do {
+ e = time_travel_first_event();
+
+ BUG_ON(!e);
+ __time_travel_update_time(e->time, idle);
+
+ /* new events may have been inserted while we were waiting */
+ if (e == time_travel_first_event()) {
+ BUG_ON(!time_travel_del_event(e));
+ BUG_ON(time_travel_time != e->time);
+
+ if (e == &ne) {
+ finished = true;
+ } else {
+ if (e->onstack)
+ panic("On-stack event dequeued outside of the stack! time=%lld, event time=%lld, event=%pS\n",
+ time_travel_time, e->time, e);
+ time_travel_deliver_event(e);
+ }
+ }
+
+ e = time_travel_first_event();
+ if (e)
+ time_travel_ext_update_request(e->time);
+ } while (ne.pending && !finished);
+
+ time_travel_del_event(&ne);
+}
+
+static void time_travel_update_time_rel(unsigned long long offs)
+{
+ unsigned long flags;
+
+ /*
+ * Disable interrupts before calculating the new time so
+ * that a real timer interrupt (signal) can't happen at
+ * a bad time e.g. after we read time_travel_time but
+ * before we've completed updating the time.
+ */
+ local_irq_save(flags);
+ time_travel_update_time(time_travel_time + offs, false);
+ local_irq_restore(flags);
+}
+
+void time_travel_ndelay(unsigned long nsec)
+{
+ /*
+ * Not strictly needed to use _rel() version since this is
+ * only used in INFCPU/EXT modes, but it doesn't hurt and
+ * is more readable too.
+ */
+ time_travel_update_time_rel(nsec);
+}
+EXPORT_SYMBOL(time_travel_ndelay);
+
+void time_travel_add_irq_event(struct time_travel_event *e)
+{
+ BUG_ON(time_travel_mode != TT_MODE_EXTERNAL);
+
+ time_travel_ext_get_time();
+ /*
+ * We could model interrupt latency here, for now just
+ * don't have any latency at all and request the exact
+ * same time (again) to run the interrupt...
+ */
+ time_travel_add_event(e, time_travel_time);
+}
+EXPORT_SYMBOL_GPL(time_travel_add_irq_event);
+
+static void time_travel_oneshot_timer(struct time_travel_event *e)
+{
+ /* clock tick; decrease extra jiffies by keeping sched_clock constant */
+ if (tt_extra_sched_jiffies > 0)
+ tt_extra_sched_jiffies -= 1;
+
+ deliver_alarm();
+}
+
+void time_travel_sleep(void)
+{
+ /*
+ * Wait "forever" (using S64_MAX because there are some potential
+ * wrapping issues, especially with the current TT_MODE_EXTERNAL
+ * controller application.
+ */
+ unsigned long long next = S64_MAX;
+ int cpu = raw_smp_processor_id();
+
+ if (time_travel_mode == TT_MODE_BASIC)
+ os_timer_disable(cpu);
+
+ time_travel_update_time(next, true);
+
+ if (time_travel_mode == TT_MODE_BASIC &&
+ time_travel_timer_event.pending) {
+ if (time_travel_timer_event.fn == time_travel_periodic_timer) {
+ /*
+ * This is somewhat wrong - we should get the first
+ * one sooner like the os_timer_one_shot() below...
+ */
+ os_timer_set_interval(cpu, time_travel_timer_interval);
+ } else {
+ os_timer_one_shot(cpu, time_travel_timer_event.time - next);
+ }
+ }
+}
+
+static void time_travel_handle_real_alarm(void)
+{
+ time_travel_set_time(time_travel_next_event);
+
+ time_travel_del_event(&time_travel_timer_event);
+
+ if (time_travel_timer_event.fn == time_travel_periodic_timer)
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time +
+ time_travel_timer_interval);
+}
+
+static void time_travel_set_interval(unsigned long long interval)
+{
+ time_travel_timer_interval = interval;
+}
+
+static int time_travel_connect_external(const char *socket)
+{
+ const char *sep;
+ unsigned long long id = (unsigned long long)-1;
+ int rc;
+
+ if ((sep = strchr(socket, ':'))) {
+ char buf[25] = {};
+ if (sep - socket > sizeof(buf) - 1)
+ goto invalid_number;
+
+ memcpy(buf, socket, sep - socket);
+ if (kstrtoull(buf, 0, &id)) {
+invalid_number:
+ panic("time-travel: invalid external ID in string '%s'\n",
+ socket);
+ return -EINVAL;
+ }
+
+ socket = sep + 1;
+ }
+
+ rc = os_connect_socket(socket);
+ if (rc < 0) {
+ panic("time-travel: failed to connect to external socket %s\n",
+ socket);
+ return rc;
+ }
+
+ time_travel_ext_fd = rc;
+
+ time_travel_ext_req(UM_TIMETRAVEL_START, id);
+
+ return 1;
+}
+
+static void time_travel_set_start(void)
+{
+ if (time_travel_start_set)
+ return;
+
+ switch (time_travel_mode) {
+ case TT_MODE_EXTERNAL:
+ time_travel_start = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1);
+ /* controller gave us the *current* time, so adjust by that */
+ time_travel_ext_get_time();
+ time_travel_start -= time_travel_time;
+ break;
+ case TT_MODE_INFCPU:
+ case TT_MODE_BASIC:
+ if (!time_travel_start_set)
+ time_travel_start = os_persistent_clock_emulation();
+ break;
+ case TT_MODE_OFF:
+ /* we just read the host clock with os_persistent_clock_emulation() */
+ break;
+ }
+
+ time_travel_start_set = true;
+}
+#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */
+#define time_travel_start_set 0
+#define time_travel_start 0
+#define time_travel_time 0
+#define time_travel_ext_waiting 0
+
+static inline void time_travel_update_time(unsigned long long ns, bool idle)
+{
+}
+
+static inline void time_travel_update_time_rel(unsigned long long offs)
+{
+}
+
+static inline void time_travel_handle_real_alarm(void)
+{
+}
+
+static void time_travel_set_interval(unsigned long long interval)
+{
+}
+
+static inline void time_travel_set_start(void)
+{
+}
+
+/* fail link if this actually gets used */
+extern u64 time_travel_ext_req(u32 op, u64 time);
+
+/* these are empty macros so the struct/fn need not exist */
+#define time_travel_add_event(e, time) do { } while (0)
+/* externally not usable - redefine here so we can */
+#undef time_travel_del_event
+#define time_travel_del_event(e) do { } while (0)
+#endif
+
+static struct clock_event_device timer_clockevent[NR_CPUS];
void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
{
unsigned long flags;
+ /*
+ * In basic time-travel mode we still get real interrupts
+ * (signals) but since we don't read time from the OS, we
+ * must update the simulated time here to the expiry when
+ * we get a signal.
+ * This is not the case in inf-cpu mode, since there we
+ * never get any real signals from the OS.
+ */
+ if (time_travel_mode == TT_MODE_BASIC)
+ time_travel_handle_real_alarm();
+
local_irq_save(flags);
do_IRQ(TIMER_IRQ, regs);
local_irq_restore(flags);
}
-static void itimer_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int itimer_shutdown(struct clock_event_device *evt)
{
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- set_interval();
- break;
+ int cpu = evt - &timer_clockevent[0];
- case CLOCK_EVT_MODE_SHUTDOWN:
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_ONESHOT:
- disable_timer();
- break;
+ if (time_travel_mode != TT_MODE_OFF)
+ time_travel_del_event(&time_travel_timer_event);
- case CLOCK_EVT_MODE_RESUME:
- break;
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
+ os_timer_disable(cpu);
+
+ return 0;
+}
+
+static int itimer_set_periodic(struct clock_event_device *evt)
+{
+ unsigned long long interval = NSEC_PER_SEC / HZ;
+ int cpu = evt - &timer_clockevent[0];
+
+ if (time_travel_mode != TT_MODE_OFF) {
+ time_travel_del_event(&time_travel_timer_event);
+ time_travel_set_event_fn(&time_travel_timer_event,
+ time_travel_periodic_timer);
+ time_travel_set_interval(interval);
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + interval);
}
+
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
+ os_timer_set_interval(cpu, interval);
+
+ return 0;
}
static int itimer_next_event(unsigned long delta,
struct clock_event_device *evt)
{
- return timer_one_shot(delta + 1);
+ delta += 1;
+
+ if (time_travel_mode != TT_MODE_OFF) {
+ time_travel_del_event(&time_travel_timer_event);
+ time_travel_set_event_fn(&time_travel_timer_event,
+ time_travel_oneshot_timer);
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + delta);
+ }
+
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
+ return os_timer_one_shot(raw_smp_processor_id(), delta);
+
+ return 0;
}
-static struct clock_event_device itimer_clockevent = {
- .name = "itimer",
- .rating = 250,
- .cpumask = cpu_all_mask,
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = itimer_set_mode,
- .set_next_event = itimer_next_event,
- .shift = 32,
- .irq = 0,
+static int itimer_one_shot(struct clock_event_device *evt)
+{
+ return itimer_next_event(0, evt);
+}
+
+static struct clock_event_device _timer_clockevent = {
+ .name = "posix-timer",
+ .rating = 250,
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT,
+ .set_state_shutdown = itimer_shutdown,
+ .set_state_periodic = itimer_set_periodic,
+ .set_state_oneshot = itimer_one_shot,
+ .set_next_event = itimer_next_event,
+ .shift = 0,
+ .max_delta_ns = 0xffffffff,
+ .max_delta_ticks = 0xffffffff,
+ .min_delta_ns = TIMER_MIN_DELTA,
+ .min_delta_ticks = TIMER_MIN_DELTA, // microsecond resolution should be enough for anyone, same as 640K RAM
+ .irq = 0,
+ .mult = 1,
};
static irqreturn_t um_timer(int irq, void *dev)
{
- (*itimer_clockevent.event_handler)(&itimer_clockevent);
+ int cpu = raw_smp_processor_id();
+ struct clock_event_device *evt = &timer_clockevent[cpu];
+
+ /*
+ * Interrupt the (possibly) running userspace process, technically this
+ * should only happen if userspace is currently executing.
+ * With infinite CPU time-travel, we can only get here when userspace
+ * is not executing. Do not notify there and avoid spurious scheduling.
+ */
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL &&
+ get_current()->mm)
+ os_alarm_process(get_current()->mm->context.id.pid);
+
+ evt->event_handler(evt);
return IRQ_HANDLED;
}
-static cycle_t itimer_read(struct clocksource *cs)
+static u64 timer_read(struct clocksource *cs)
{
- return os_nsecs() / 1000;
+ if (time_travel_mode != TT_MODE_OFF) {
+ /*
+ * We make reading the timer cost a bit so that we don't get
+ * stuck in loops that expect time to move more than the
+ * exact requested sleep amount, e.g. python's socket server,
+ * see https://bugs.python.org/issue37026.
+ *
+ * However, don't do that when we're in interrupt or such as
+ * then we might recurse into our own processing, and get to
+ * even more waiting, and that's not good - it messes up the
+ * "what do I do next" and onstack event we use to know when
+ * to return from time_travel_update_time().
+ */
+ if (!irqs_disabled() && !in_interrupt() && !in_softirq() &&
+ !time_travel_ext_waiting)
+ time_travel_update_time_rel(TIMER_MULTIPLIER);
+ return time_travel_time / TIMER_MULTIPLIER;
+ }
+
+ return os_nsecs() / TIMER_MULTIPLIER;
}
-static struct clocksource itimer_clocksource = {
- .name = "itimer",
+static struct clocksource timer_clocksource = {
+ .name = "timer",
.rating = 300,
- .read = itimer_read,
+ .read = timer_read,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static void __init setup_itimer(void)
+int um_setup_timer(void)
{
+ int cpu = raw_smp_processor_id();
+ struct clock_event_device *evt = &timer_clockevent[cpu];
int err;
- err = request_irq(TIMER_IRQ, um_timer, 0, "timer", NULL);
+ err = os_timer_create();
+ if (err)
+ return err;
+
+ memcpy(evt, &_timer_clockevent, sizeof(*evt));
+ evt->cpumask = cpumask_of(cpu);
+ clockevents_register_device(evt);
+
+ return 0;
+}
+
+static void __init um_timer_init(void)
+{
+ int err;
+
+ err = request_irq(TIMER_IRQ, um_timer, IRQF_TIMER, "hr timer", NULL);
if (err != 0)
printk(KERN_ERR "register_timer : request_irq failed - "
"errno = %d\n", -err);
- itimer_clockevent.mult = div_sc(HZ, NSEC_PER_SEC, 32);
- itimer_clockevent.max_delta_ns =
- clockevent_delta2ns(60 * HZ, &itimer_clockevent);
- itimer_clockevent.min_delta_ns =
- clockevent_delta2ns(1, &itimer_clockevent);
- err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC);
+ err = um_setup_timer();
+ if (err) {
+ printk(KERN_ERR "creation of timer failed - errno = %d\n", -err);
+ return;
+ }
+
+ err = clocksource_register_hz(&timer_clocksource, NSEC_PER_SEC/TIMER_MULTIPLIER);
if (err) {
printk(KERN_ERR "clocksource_register_hz returned %d\n", err);
return;
}
- clockevents_register_device(&itimer_clockevent);
}
-void read_persistent_clock(struct timespec *ts)
+void read_persistent_clock64(struct timespec64 *ts)
{
- long long nsecs = os_nsecs();
+ long long nsecs;
+
+ time_travel_set_start();
- set_normalized_timespec(ts, nsecs / NSEC_PER_SEC,
- nsecs % NSEC_PER_SEC);
+ if (time_travel_mode != TT_MODE_OFF)
+ nsecs = time_travel_start + time_travel_time;
+ else
+ nsecs = os_persistent_clock_emulation();
+
+ set_normalized_timespec64(ts, nsecs / NSEC_PER_SEC,
+ nsecs % NSEC_PER_SEC);
}
void __init time_init(void)
{
- timer_init();
- late_time_init = setup_itimer;
+ timer_set_signal_handler();
+ late_time_init = um_timer_init;
+}
+
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+unsigned long calibrate_delay_is_known(void)
+{
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)
+ return 1;
+ return 0;
+}
+
+static int setup_time_travel(char *str)
+{
+ if (strcmp(str, "=inf-cpu") == 0) {
+ time_travel_mode = TT_MODE_INFCPU;
+ _timer_clockevent.name = "time-travel-timer-infcpu";
+ timer_clocksource.name = "time-travel-clock";
+ return 1;
+ }
+
+ if (strncmp(str, "=ext:", 5) == 0) {
+ time_travel_mode = TT_MODE_EXTERNAL;
+ _timer_clockevent.name = "time-travel-timer-external";
+ timer_clocksource.name = "time-travel-clock-external";
+ return time_travel_connect_external(str + 5);
+ }
+
+ if (!*str) {
+ time_travel_mode = TT_MODE_BASIC;
+ _timer_clockevent.name = "time-travel-timer";
+ timer_clocksource.name = "time-travel-clock";
+ return 1;
+ }
+
+ return -EINVAL;
+}
+
+__setup("time-travel", setup_time_travel);
+__uml_help(setup_time_travel,
+"time-travel\n"
+" This option just enables basic time travel mode, in which the clock/timers\n"
+" inside the UML instance skip forward when there's nothing to do, rather than\n"
+" waiting for real time to elapse. However, instance CPU speed is limited by\n"
+" the real CPU speed, so e.g. a 10ms timer will always fire after ~10ms wall\n"
+" clock (but quicker when there's nothing to do).\n"
+"\n"
+"time-travel=inf-cpu\n"
+" This enables time travel mode with infinite processing power, in which there\n"
+" are no wall clock timers, and any CPU processing happens - as seen from the\n"
+" guest - instantly. This can be useful for accurate simulation regardless of\n"
+" debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n"
+" easily lead to getting stuck (e.g. if anything in the system busy loops).\n"
+"\n"
+"time-travel=ext:[ID:]/path/to/socket\n"
+" This enables time travel mode similar to =inf-cpu, except the system will\n"
+" use the given socket to coordinate with a central scheduler, in order to\n"
+" have more than one system simultaneously be on simulated time. The virtio\n"
+" driver code in UML knows about this so you can also simulate networks and\n"
+" devices using it, assuming the device has the right capabilities.\n"
+" The optional ID is a 64-bit integer that's sent to the central scheduler.\n\n");
+
+static int setup_time_travel_start(char *str)
+{
+ int err;
+
+ err = kstrtoull(str, 0, &time_travel_start);
+ if (err)
+ return err;
+
+ time_travel_start_set = 1;
+ return 1;
+}
+
+__setup("time-travel-start=", setup_time_travel_start);
+__uml_help(setup_time_travel_start,
+"time-travel-start=<nanoseconds>\n"
+" Configure the UML instance's wall clock to start at this value rather than\n"
+" the host's wall clock at the time of UML boot.\n\n");
+
+static struct kobject *bc_time_kobject;
+
+static ssize_t bc_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "0x%llx", bc_message);
+}
+
+static ssize_t bc_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int ret;
+ u64 user_bc_message;
+
+ ret = kstrtou64(buf, 0, &user_bc_message);
+ if (ret)
+ return ret;
+
+ bc_message = user_bc_message;
+
+ time_travel_ext_req(UM_TIMETRAVEL_BROADCAST, bc_message);
+ pr_info("um: time: sent broadcast message: 0x%llx\n", bc_message);
+ return count;
+}
+
+static struct kobj_attribute bc_attribute = __ATTR(bc-message, 0660, bc_show, bc_store);
+
+static int __init um_bc_start(void)
+{
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ return 0;
+
+ bc_time_kobject = kobject_create_and_add("um-ext-time", kernel_kobj);
+ if (!bc_time_kobject)
+ return 0;
+
+ if (sysfs_create_file(bc_time_kobject, &bc_attribute.attr))
+ pr_debug("failed to create the bc file in /sys/kernel/um_time");
+
+ return 0;
}
+late_initcall(um_bc_start);
+#endif
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 9472079471bb..39608cccf2c6 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -1,212 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/mm.h>
#include <linux/module.h>
-#include <linux/sched.h>
-#include <asm/pgtable.h>
+#include <linux/sched/signal.h>
+
#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
#include <as-layout.h>
#include <mem_user.h>
#include <os.h>
#include <skas.h>
+#include <kern_util.h>
-struct host_vm_change {
- struct host_vm_op {
- enum { NONE, MMAP, MUNMAP, MPROTECT } type;
- union {
- struct {
- unsigned long addr;
- unsigned long len;
- unsigned int prot;
- int fd;
- __u64 offset;
- } mmap;
- struct {
- unsigned long addr;
- unsigned long len;
- } munmap;
- struct {
- unsigned long addr;
- unsigned long len;
- unsigned int prot;
- } mprotect;
- } u;
- } ops[1];
- int index;
- struct mm_id *id;
- void *data;
- int force;
-};
-
-#define INIT_HVC(mm, force) \
- ((struct host_vm_change) \
- { .ops = { { .type = NONE } }, \
- .id = &mm->context.id, \
- .data = NULL, \
- .index = 0, \
- .force = force })
-
-static int do_ops(struct host_vm_change *hvc, int end,
- int finished)
-{
- struct host_vm_op *op;
- int i, ret = 0;
-
- for (i = 0; i < end && !ret; i++) {
- op = &hvc->ops[i];
- switch (op->type) {
- case MMAP:
- ret = map(hvc->id, op->u.mmap.addr, op->u.mmap.len,
- op->u.mmap.prot, op->u.mmap.fd,
- op->u.mmap.offset, finished, &hvc->data);
- break;
- case MUNMAP:
- ret = unmap(hvc->id, op->u.munmap.addr,
- op->u.munmap.len, finished, &hvc->data);
- break;
- case MPROTECT:
- ret = protect(hvc->id, op->u.mprotect.addr,
- op->u.mprotect.len, op->u.mprotect.prot,
- finished, &hvc->data);
- break;
- default:
- printk(KERN_ERR "Unknown op type %d in do_ops\n",
- op->type);
- BUG();
- break;
- }
- }
+struct vm_ops {
+ struct mm_id *mm_idp;
- return ret;
-}
+ int (*mmap)(struct mm_id *mm_idp,
+ unsigned long virt, unsigned long len, int prot,
+ int phys_fd, unsigned long long offset);
+ int (*unmap)(struct mm_id *mm_idp,
+ unsigned long virt, unsigned long len);
+};
-static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len,
- unsigned int prot, struct host_vm_change *hvc)
+static int kern_map(struct mm_id *mm_idp,
+ unsigned long virt, unsigned long len, int prot,
+ int phys_fd, unsigned long long offset)
{
- __u64 offset;
- struct host_vm_op *last;
- int fd, ret = 0;
-
- fd = phys_mapping(phys, &offset);
- if (hvc->index != 0) {
- last = &hvc->ops[hvc->index - 1];
- if ((last->type == MMAP) &&
- (last->u.mmap.addr + last->u.mmap.len == virt) &&
- (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) &&
- (last->u.mmap.offset + last->u.mmap.len == offset)) {
- last->u.mmap.len += len;
- return 0;
- }
- }
-
- if (hvc->index == ARRAY_SIZE(hvc->ops)) {
- ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
- hvc->index = 0;
- }
-
- hvc->ops[hvc->index++] = ((struct host_vm_op)
- { .type = MMAP,
- .u = { .mmap = { .addr = virt,
- .len = len,
- .prot = prot,
- .fd = fd,
- .offset = offset }
- } });
- return ret;
+ /* TODO: Why is executable needed to be always set in the kernel? */
+ return os_map_memory((void *)virt, phys_fd, offset, len,
+ prot & UM_PROT_READ, prot & UM_PROT_WRITE,
+ 1);
}
-static int add_munmap(unsigned long addr, unsigned long len,
- struct host_vm_change *hvc)
+static int kern_unmap(struct mm_id *mm_idp,
+ unsigned long virt, unsigned long len)
{
- struct host_vm_op *last;
- int ret = 0;
-
- if (hvc->index != 0) {
- last = &hvc->ops[hvc->index - 1];
- if ((last->type == MUNMAP) &&
- (last->u.munmap.addr + last->u.mmap.len == addr)) {
- last->u.munmap.len += len;
- return 0;
- }
- }
-
- if (hvc->index == ARRAY_SIZE(hvc->ops)) {
- ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
- hvc->index = 0;
- }
-
- hvc->ops[hvc->index++] = ((struct host_vm_op)
- { .type = MUNMAP,
- .u = { .munmap = { .addr = addr,
- .len = len } } });
- return ret;
+ return os_unmap_memory((void *)virt, len);
}
-static int add_mprotect(unsigned long addr, unsigned long len,
- unsigned int prot, struct host_vm_change *hvc)
+void report_enomem(void)
{
- struct host_vm_op *last;
- int ret = 0;
-
- if (hvc->index != 0) {
- last = &hvc->ops[hvc->index - 1];
- if ((last->type == MPROTECT) &&
- (last->u.mprotect.addr + last->u.mprotect.len == addr) &&
- (last->u.mprotect.prot == prot)) {
- last->u.mprotect.len += len;
- return 0;
- }
- }
-
- if (hvc->index == ARRAY_SIZE(hvc->ops)) {
- ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
- hvc->index = 0;
- }
-
- hvc->ops[hvc->index++] = ((struct host_vm_op)
- { .type = MPROTECT,
- .u = { .mprotect = { .addr = addr,
- .len = len,
- .prot = prot } } });
- return ret;
+ printk(KERN_ERR "UML ran out of memory on the host side! "
+ "This can happen due to a memory limitation or "
+ "vm.max_map_count has been reached.\n");
}
-#define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1))
-
static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
- struct host_vm_change *hvc)
+ struct vm_ops *ops)
{
pte_t *pte;
- int r, w, x, prot, ret = 0;
+ int ret = 0;
pte = pte_offset_kernel(pmd, addr);
do {
- if ((addr >= STUB_START) && (addr < STUB_END))
+ if (!pte_needsync(*pte))
continue;
- r = pte_read(*pte);
- w = pte_write(*pte);
- x = pte_exec(*pte);
- if (!pte_young(*pte)) {
- r = 0;
- w = 0;
- } else if (!pte_dirty(*pte))
- w = 0;
-
- prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) |
- (x ? UM_PROT_EXEC : 0));
- if (hvc->force || pte_newpage(*pte)) {
- if (pte_present(*pte))
- ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK,
- PAGE_SIZE, prot, hvc);
- else
- ret = add_munmap(addr, PAGE_SIZE, hvc);
- } else if (pte_newprot(*pte))
- ret = add_mprotect(addr, PAGE_SIZE, prot, hvc);
+ if (pte_present(*pte)) {
+ __u64 offset;
+ unsigned long phys = pte_val(*pte) & PAGE_MASK;
+ int fd = phys_mapping(phys, &offset);
+ int r, w, x, prot;
+
+ r = pte_read(*pte);
+ w = pte_write(*pte);
+ x = pte_exec(*pte);
+ if (!pte_young(*pte)) {
+ r = 0;
+ w = 0;
+ } else if (!pte_dirty(*pte))
+ w = 0;
+
+ prot = (r ? UM_PROT_READ : 0) |
+ (w ? UM_PROT_WRITE : 0) |
+ (x ? UM_PROT_EXEC : 0);
+
+ ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE,
+ prot, fd, offset);
+ } else
+ ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE);
+
*pte = pte_mkuptodate(*pte);
} while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret));
return ret;
@@ -214,7 +91,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
static inline int update_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end,
- struct host_vm_change *hvc)
+ struct vm_ops *ops)
{
pmd_t *pmd;
unsigned long next;
@@ -224,314 +101,125 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr,
do {
next = pmd_addr_end(addr, end);
if (!pmd_present(*pmd)) {
- if (hvc->force || pmd_newpage(*pmd)) {
- ret = add_munmap(addr, next - addr, hvc);
+ if (pmd_needsync(*pmd)) {
+ ret = ops->unmap(ops->mm_idp, addr,
+ next - addr);
pmd_mkuptodate(*pmd);
}
}
- else ret = update_pte_range(pmd, addr, next, hvc);
+ else ret = update_pte_range(pmd, addr, next, ops);
} while (pmd++, addr = next, ((addr < end) && !ret));
return ret;
}
-static inline int update_pud_range(pgd_t *pgd, unsigned long addr,
+static inline int update_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end,
- struct host_vm_change *hvc)
+ struct vm_ops *ops)
{
pud_t *pud;
unsigned long next;
int ret = 0;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (!pud_present(*pud)) {
- if (hvc->force || pud_newpage(*pud)) {
- ret = add_munmap(addr, next - addr, hvc);
+ if (pud_needsync(*pud)) {
+ ret = ops->unmap(ops->mm_idp, addr,
+ next - addr);
pud_mkuptodate(*pud);
}
}
- else ret = update_pmd_range(pud, addr, next, hvc);
+ else ret = update_pmd_range(pud, addr, next, ops);
} while (pud++, addr = next, ((addr < end) && !ret));
return ret;
}
-void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
- unsigned long end_addr, int force)
+static inline int update_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end,
+ struct vm_ops *ops)
{
- pgd_t *pgd;
- struct host_vm_change hvc;
- unsigned long addr = start_addr, next;
+ p4d_t *p4d;
+ unsigned long next;
int ret = 0;
- hvc = INIT_HVC(mm, force);
- pgd = pgd_offset(mm, addr);
+ p4d = p4d_offset(pgd, addr);
do {
- next = pgd_addr_end(addr, end_addr);
- if (!pgd_present(*pgd)) {
- if (force || pgd_newpage(*pgd)) {
- ret = add_munmap(addr, next - addr, &hvc);
- pgd_mkuptodate(*pgd);
+ next = p4d_addr_end(addr, end);
+ if (!p4d_present(*p4d)) {
+ if (p4d_needsync(*p4d)) {
+ ret = ops->unmap(ops->mm_idp, addr,
+ next - addr);
+ p4d_mkuptodate(*p4d);
}
- }
- else ret = update_pud_range(pgd, addr, next, &hvc);
- } while (pgd++, addr = next, ((addr < end_addr) && !ret));
-
- if (!ret)
- ret = do_ops(&hvc, hvc.index, 1);
-
- /* This is not an else because ret is modified above */
- if (ret) {
- printk(KERN_ERR "fix_range_common: failed, killing current "
- "process\n");
- force_sig(SIGKILL, current);
- }
+ } else
+ ret = update_pud_range(p4d, addr, next, ops);
+ } while (p4d++, addr = next, ((addr < end) && !ret));
+ return ret;
}
-static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
+int um_tlb_sync(struct mm_struct *mm)
{
- struct mm_struct *mm;
pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- unsigned long addr, last;
- int updated = 0, err;
-
- mm = &init_mm;
- for (addr = start; addr < end;) {
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd)) {
- last = ADD_ROUND(addr, PGDIR_SIZE);
- if (last > end)
- last = end;
- if (pgd_newpage(*pgd)) {
- updated = 1;
- err = os_unmap_memory((void *) addr,
- last - addr);
- if (err < 0)
- panic("munmap failed, errno = %d\n",
- -err);
- }
- addr = last;
- continue;
- }
-
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud)) {
- last = ADD_ROUND(addr, PUD_SIZE);
- if (last > end)
- last = end;
- if (pud_newpage(*pud)) {
- updated = 1;
- err = os_unmap_memory((void *) addr,
- last - addr);
- if (err < 0)
- panic("munmap failed, errno = %d\n",
- -err);
- }
- addr = last;
- continue;
- }
-
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd)) {
- last = ADD_ROUND(addr, PMD_SIZE);
- if (last > end)
- last = end;
- if (pmd_newpage(*pmd)) {
- updated = 1;
- err = os_unmap_memory((void *) addr,
- last - addr);
- if (err < 0)
- panic("munmap failed, errno = %d\n",
- -err);
- }
- addr = last;
- continue;
- }
-
- pte = pte_offset_kernel(pmd, addr);
- if (!pte_present(*pte) || pte_newpage(*pte)) {
- updated = 1;
- err = os_unmap_memory((void *) addr,
- PAGE_SIZE);
- if (err < 0)
- panic("munmap failed, errno = %d\n",
- -err);
- if (pte_present(*pte))
- map_memory(addr,
- pte_val(*pte) & PAGE_MASK,
- PAGE_SIZE, 1, 1, 1);
- }
- else if (pte_newprot(*pte)) {
- updated = 1;
- os_protect_memory((void *) addr, PAGE_SIZE, 1, 1, 1);
- }
- addr += PAGE_SIZE;
- }
- return updated;
-}
+ struct vm_ops ops;
+ unsigned long addr, next;
+ int ret = 0;
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- struct mm_struct *mm = vma->vm_mm;
- void *flush = NULL;
- int r, w, x, prot, err = 0;
- struct mm_id *mm_id;
-
- address &= PAGE_MASK;
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- goto kill;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- goto kill;
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- goto kill;
-
- pte = pte_offset_kernel(pmd, address);
-
- r = pte_read(*pte);
- w = pte_write(*pte);
- x = pte_exec(*pte);
- if (!pte_young(*pte)) {
- r = 0;
- w = 0;
- } else if (!pte_dirty(*pte)) {
- w = 0;
- }
+ guard(spinlock_irqsave)(&mm->context.sync_tlb_lock);
- mm_id = &mm->context.id;
- prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) |
- (x ? UM_PROT_EXEC : 0));
- if (pte_newpage(*pte)) {
- if (pte_present(*pte)) {
- unsigned long long offset;
- int fd;
+ if (mm->context.sync_tlb_range_to == 0)
+ return 0;
- fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset);
- err = map(mm_id, address, PAGE_SIZE, prot, fd, offset,
- 1, &flush);
- }
- else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush);
+ ops.mm_idp = &mm->context.id;
+ if (mm == &init_mm) {
+ ops.mmap = kern_map;
+ ops.unmap = kern_unmap;
+ } else {
+ ops.mmap = map;
+ ops.unmap = unmap;
}
- else if (pte_newprot(*pte))
- err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush);
-
- if (err)
- goto kill;
- *pte = pte_mkuptodate(*pte);
-
- return;
-
-kill:
- printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address);
- force_sig(SIGKILL, current);
-}
-
-pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address)
-{
- return pgd_offset(mm, address);
-}
-
-pud_t *pud_offset_proc(pgd_t *pgd, unsigned long address)
-{
- return pud_offset(pgd, address);
-}
-
-pmd_t *pmd_offset_proc(pud_t *pud, unsigned long address)
-{
- return pmd_offset(pud, address);
-}
+ addr = mm->context.sync_tlb_range_from;
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, mm->context.sync_tlb_range_to);
+ if (!pgd_present(*pgd)) {
+ if (pgd_needsync(*pgd)) {
+ ret = ops.unmap(ops.mm_idp, addr,
+ next - addr);
+ pgd_mkuptodate(*pgd);
+ }
+ } else
+ ret = update_p4d_range(pgd, addr, next, &ops);
+ } while (pgd++, addr = next,
+ ((addr < mm->context.sync_tlb_range_to) && !ret));
-pte_t *pte_offset_proc(pmd_t *pmd, unsigned long address)
-{
- return pte_offset_kernel(pmd, address);
-}
+ if (ret == -ENOMEM)
+ report_enomem();
-pte_t *addr_pte(struct task_struct *task, unsigned long addr)
-{
- pgd_t *pgd = pgd_offset(task->mm, addr);
- pud_t *pud = pud_offset(pgd, addr);
- pmd_t *pmd = pmd_offset(pud, addr);
+ mm->context.sync_tlb_range_from = 0;
+ mm->context.sync_tlb_range_to = 0;
- return pte_offset_map(pmd, addr);
+ return ret;
}
void flush_tlb_all(void)
{
- flush_tlb_mm(current->mm);
-}
-
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
- flush_tlb_kernel_range_common(start, end);
-}
-
-void flush_tlb_kernel_vm(void)
-{
- flush_tlb_kernel_range_common(start_vm, end_vm);
-}
-
-void __flush_tlb_one(unsigned long addr)
-{
- flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE);
-}
-
-static void fix_range(struct mm_struct *mm, unsigned long start_addr,
- unsigned long end_addr, int force)
-{
- fix_range_common(mm, start_addr, end_addr, force);
-}
-
-void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
-{
- if (vma->vm_mm == NULL)
- flush_tlb_kernel_range_common(start, end);
- else fix_range(vma->vm_mm, start, end, 0);
-}
-EXPORT_SYMBOL(flush_tlb_range);
-
-void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
-{
/*
* Don't bother flushing if this address space is about to be
* destroyed.
*/
- if (atomic_read(&mm->mm_users) == 0)
+ if (atomic_read(&current->mm->mm_users) == 0)
return;
- fix_range(mm, start, end, 0);
+ flush_tlb_mm(current->mm);
}
void flush_tlb_mm(struct mm_struct *mm)
{
- struct vm_area_struct *vma = mm->mmap;
-
- while (vma != NULL) {
- fix_range(mm, vma->vm_start, vma->vm_end, 0);
- vma = vma->vm_next;
- }
-}
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
-void force_flush_all(void)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = mm->mmap;
-
- while (vma != NULL) {
- fix_range(mm, vma->vm_start, vma->vm_end, 1);
- vma = vma->vm_next;
- }
+ for_each_vma(vmi, vma)
+ um_tlb_mark_sync(mm, vma->vm_start, vma->vm_end);
}
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 089f3987e273..177615820a4c 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -1,14 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <linux/mm.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/hardirq.h>
#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/sched/debug.h>
#include <asm/current.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <arch.h>
#include <as-layout.h>
@@ -17,7 +18,123 @@
#include <skas.h>
/*
- * Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by
+ * NOTE: UML does not have exception tables. As such, this is almost a copy
+ * of the code in mm/memory.c, only adjusting the logic to simply check whether
+ * we are coming from the kernel instead of doing an additional lookup in the
+ * exception table.
+ * We can do this simplification because we never get here if the exception was
+ * fixable.
+ */
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (!is_user)
+ return false;
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically a
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
+{
+ mmap_read_unlock(mm);
+ if (!is_user)
+ return false;
+
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalend to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+static struct vm_area_struct *
+um_lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, bool is_user)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, is_user))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, is_user))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+
+/*
+ * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
* segv().
*/
int handle_page_fault(unsigned long address, unsigned long ip,
@@ -25,79 +142,68 @@ int handle_page_fault(unsigned long address, unsigned long ip,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte;
int err = -EFAULT;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
- (is_write ? FAULT_FLAG_WRITE : 0);
+ unsigned int flags = FAULT_FLAG_DEFAULT;
*code_out = SEGV_MAPERR;
/*
- * If the fault was during atomic operation, don't take the fault, just
+ * If the fault was with pagefaults disabled, don't take the fault, just
* fail.
*/
- if (in_atomic())
+ if (faulthandler_disabled())
goto out_nosemaphore;
+ if (is_user)
+ flags |= FAULT_FLAG_USER;
retry:
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, address);
+ vma = um_lock_mm_and_find_vma(mm, address, is_user);
if (!vma)
- goto out;
- else if (vma->vm_start <= address)
- goto good_area;
- else if (!(vma->vm_flags & VM_GROWSDOWN))
- goto out;
- else if (is_user && !ARCH_IS_STACKGROW(address))
- goto out;
- else if (expand_stack(vma, address))
- goto out;
+ goto out_nosemaphore;
-good_area:
*code_out = SEGV_ACCERR;
- if (is_write && !(vma->vm_flags & VM_WRITE))
- goto out;
-
- /* Don't require VM_READ|VM_EXEC for write faults! */
- if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC)))
- goto out;
+ if (is_write) {
+ if (!(vma->vm_flags & VM_WRITE))
+ goto out;
+ flags |= FAULT_FLAG_WRITE;
+ } else {
+ /* Don't require VM_READ|VM_EXEC for write faults! */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ goto out;
+ }
do {
- int fault;
+ vm_fault_t fault;
- fault = handle_mm_fault(mm, vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
goto out_nosemaphore;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM) {
goto out_of_memory;
+ } else if (fault & VM_FAULT_SIGSEGV) {
+ goto out;
} else if (fault & VM_FAULT_SIGBUS) {
err = -EACCES;
goto out;
}
BUG();
}
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR)
- current->maj_flt++;
- else
- current->min_flt++;
- if (fault & VM_FAULT_RETRY) {
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
-
- goto retry;
- }
+ if (fault & VM_FAULT_RETRY) {
+ flags |= FAULT_FLAG_TRIED;
+
+ goto retry;
}
- pgd = pgd_offset(mm, address);
- pud = pud_offset(pgd, address);
- pmd = pmd_offset(pud, address);
+ pmd = pmd_off(mm, address);
pte = pte_offset_kernel(pmd, address);
} while (!pte_present(*pte));
err = 0;
@@ -112,9 +218,9 @@ good_area:
#if 0
WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte)));
#endif
- flush_tlb_page(vma, address);
+
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_nosemaphore:
return err;
@@ -123,11 +229,12 @@ out_of_memory:
* We ran out of memory, call the OOM killer, and return the userspace
* (which will retry the fault, or kill us if we got oom-killed).
*/
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
+ if (!is_user)
+ goto out_nosemaphore;
pagefault_out_of_memory();
return 0;
}
-EXPORT_SYMBOL(handle_page_fault);
static void show_segv_info(struct uml_pt_regs *regs)
{
@@ -140,7 +247,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
if (!printk_ratelimit())
return;
- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
+ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
(void *)UPT_IP(regs), (void *)UPT_SP(regs),
@@ -152,19 +259,14 @@ static void show_segv_info(struct uml_pt_regs *regs)
static void bad_segv(struct faultinfo fi, unsigned long ip)
{
- struct siginfo si;
-
- si.si_signo = SIGSEGV;
- si.si_code = SEGV_ACCERR;
- si.si_addr = (void __user *) FAULT_ADDRESS(fi);
current->thread.arch.faultinfo = fi;
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi));
}
void fatal_sigsegv(void)
{
- force_sigsegv(SIGSEGV, current);
- do_signal();
+ force_fatal_sig(SIGSEGV);
+ do_signal(&current->thread.regs);
/*
* This is to tell gcc that we're not returning - do_signal
* can, in general, return, but in this case, it's not, since
@@ -173,7 +275,19 @@ void fatal_sigsegv(void)
os_dump_core();
}
-void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
+/**
+ * segv_handler() - the SIGSEGV handler
+ * @sig: the signal number
+ * @unused_si: the signal info struct; unused in this handler
+ * @regs: the ptrace register information
+ * @mc: the mcontext of the signal
+ *
+ * The handler first extracts the faultinfo from the UML ptrace regs struct.
+ * If the userfault did not happen in an UML userspace process, bad_segv is called.
+ * Otherwise the signal did happen in a cloned userspace process, handle it.
+ */
+void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
+ void *mc)
{
struct faultinfo * fi = UPT_FAULTINFO(regs);
@@ -182,7 +296,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
bad_segv(*fi, UPT_IP(regs));
return;
}
- segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs);
+ segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc);
}
/*
@@ -192,26 +306,55 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
* give us bad data!
*/
unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
- struct uml_pt_regs *regs)
+ struct uml_pt_regs *regs, void *mc)
{
- struct siginfo si;
- jmp_buf *catcher;
+ int si_code;
int err;
int is_write = FAULT_WRITE(fi);
unsigned long address = FAULT_ADDRESS(fi);
- if (!is_user && (address >= start_vm) && (address < end_vm)) {
- flush_tlb_kernel_vm();
- return 0;
+ if (!is_user && regs)
+ current->thread.segv_regs = container_of(regs, struct pt_regs, regs);
+
+ if (!is_user && address >= start_vm && address < end_vm) {
+ /*
+ * Kernel has pending updates from set_ptes that were not
+ * flushed yet. Syncing them should fix the pagefault (if not
+ * we'll get here again and panic).
+ */
+ err = um_tlb_sync(&init_mm);
+ if (err == -ENOMEM)
+ report_enomem();
+ if (err)
+ panic("Failed to sync kernel TLBs: %d", err);
+ goto out;
+ }
+ else if (current->pagefault_disabled) {
+ if (!mc) {
+ show_regs(container_of(regs, struct pt_regs, regs));
+ panic("Segfault with pagefaults disabled but no mcontext");
+ }
+ if (!current->thread.segv_continue) {
+ show_regs(container_of(regs, struct pt_regs, regs));
+ panic("Segfault without recovery target");
+ }
+ mc_set_rip(mc, current->thread.segv_continue);
+ current->thread.segv_continue = NULL;
+ goto out;
}
else if (current->mm == NULL) {
show_regs(container_of(regs, struct pt_regs, regs));
panic("Segfault with no mm");
}
+ else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) {
+ show_regs(container_of(regs, struct pt_regs, regs));
+ panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx",
+ address, ip);
+ }
- if (SEGV_IS_FIXABLE(&fi) || SEGV_MAYBE_FIXABLE(&fi))
+ if (SEGV_IS_FIXABLE(&fi))
err = handle_page_fault(address, ip, is_write, is_user,
- &si.si_code);
+ &si_code);
else {
err = -EFAULT;
/*
@@ -222,17 +365,10 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
address = 0;
}
- catcher = current->thread.fault_catcher;
if (!err)
- return 0;
- else if (catcher != NULL) {
- current->thread.fault_addr = (void *) address;
- UML_LONGJMP(catcher, 1);
- }
- else if (current->thread.fault_addr != NULL)
- panic("fault_addr set but no fault catcher");
+ goto out;
else if (!is_user && arch_fixup(ip, regs))
- return 0;
+ goto out;
if (!is_user) {
show_regs(container_of(regs, struct pt_regs, regs));
@@ -243,27 +379,25 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
show_segv_info(regs);
if (err == -EACCES) {
- si.si_signo = SIGBUS;
- si.si_errno = 0;
- si.si_code = BUS_ADRERR;
- si.si_addr = (void __user *)address;
current->thread.arch.faultinfo = fi;
- force_sig_info(SIGBUS, &si, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
} else {
BUG_ON(err != -EFAULT);
- si.si_signo = SIGSEGV;
- si.si_addr = (void __user *) address;
current->thread.arch.faultinfo = fi;
- force_sig_info(SIGSEGV, &si, current);
+ force_sig_fault(SIGSEGV, si_code, (void __user *) address);
}
+
+out:
+ if (regs)
+ current->thread.segv_regs = NULL;
+
return 0;
}
-void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
+void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs,
+ void *mc)
{
- struct faultinfo *fi;
- struct siginfo clean_si;
-
+ int code, err;
if (!UPT_IS_USER(regs)) {
if (sig == SIGBUS)
printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
@@ -273,44 +407,24 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs)
arch_examine_signal(sig, regs);
- memset(&clean_si, 0, sizeof(clean_si));
- clean_si.si_signo = si->si_signo;
- clean_si.si_errno = si->si_errno;
- clean_si.si_code = si->si_code;
- switch (sig) {
- case SIGILL:
- case SIGFPE:
- case SIGSEGV:
- case SIGBUS:
- case SIGTRAP:
- fi = UPT_FAULTINFO(regs);
- clean_si.si_addr = (void __user *) FAULT_ADDRESS(*fi);
+ /* Is the signal layout for the signal known?
+ * Signal data must be scrubbed to prevent information leaks.
+ */
+ code = si->si_code;
+ err = si->si_errno;
+ if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
+ struct faultinfo *fi = UPT_FAULTINFO(regs);
current->thread.arch.faultinfo = *fi;
-#ifdef __ARCH_SI_TRAPNO
- clean_si.si_trapno = si->si_trapno;
-#endif
- break;
- default:
- printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d)\n",
- sig, si->si_code);
+ force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi));
+ } else {
+ printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
+ sig, code, err);
+ force_sig(sig);
}
-
- force_sig_info(sig, &clean_si, current);
-}
-
-void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs)
-{
- if (current->thread.fault_catcher != NULL)
- UML_LONGJMP(current->thread.fault_catcher, 1);
- else
- relay_signal(sig, si, regs);
}
-void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
+void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
+ void *mc)
{
do_IRQ(WINCH_IRQ, regs);
}
-
-void trap_init(void)
-{
-}
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 87df5e3acc26..e2b24e1ecfa6 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -1,19 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
+#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/ctype.h>
#include <linux/module.h>
+#include <linux/panic_notifier.h>
#include <linux/seq_file.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/utsname.h>
#include <linux/sched.h>
-#include <asm/pgtable.h>
+#include <linux/sched/task.h>
+#include <linux/kmsg_dump.h>
+#include <linux/suspend.h>
+#include <linux/random.h>
+#include <linux/smp-internal.h>
+
#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/text-patching.h>
#include <as-layout.h>
#include <arch.h>
#include <init.h>
@@ -22,7 +34,10 @@
#include <mem_user.h>
#include <os.h>
-#define DEFAULT_COMMAND_LINE "root=98:0"
+#include "um_arch.h"
+
+#define DEFAULT_COMMAND_LINE_ROOT "root=98:0"
+#define DEFAULT_COMMAND_LINE_CONSOLE "console=tty0"
/* Changed in add_arg and setup_arch, which run before SMP is started */
static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 };
@@ -30,7 +45,7 @@ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 };
static void __init add_arg(char *arg)
{
if (strlen(command_line) + strlen(arg) + 1 > COMMAND_LINE_SIZE) {
- printf("add_arg: Too many command line arguments!\n");
+ os_warn("add_arg: Too many command line arguments!\n");
exit(1);
}
if (strlen(command_line) > 0)
@@ -40,43 +55,42 @@ static void __init add_arg(char *arg)
/*
* These fields are initialized at boot time and not changed.
- * XXX This structure is used only in the non-SMP case. Maybe this
- * should be moved to smp.c.
*/
struct cpuinfo_um boot_cpu_data = {
.loops_per_jiffy = 0,
- .ipi_pipe = { -1, -1 }
+ .cache_alignment = L1_CACHE_BYTES,
+ .x86_capability = { 0 }
};
-union thread_union cpu0_irqstack
- __attribute__((__section__(".data..init_irqstack"))) =
- { INIT_THREAD_INFO(init_task) };
+EXPORT_SYMBOL(boot_cpu_data);
-unsigned long thread_saved_pc(struct task_struct *task)
-{
- /* FIXME: Need to look up userspace_pid by cpu */
- return os_process_pc(userspace_pid[0]);
-}
/* Changed in setup_arch, which is called in early boot */
static char host_info[(__NEW_UTS_LEN + 1) * 5];
static int show_cpuinfo(struct seq_file *m, void *v)
{
- int index = 0;
+ int i = 0;
-#ifdef CONFIG_SMP
- index = (struct cpuinfo_um *) v - cpu_data;
- if (!cpu_online(index))
+#if IS_ENABLED(CONFIG_SMP)
+ i = (uintptr_t) v - 1;
+ if (!cpu_online(i))
return 0;
#endif
- seq_printf(m, "processor\t: %d\n", index);
+ seq_printf(m, "processor\t: %d\n", i);
seq_printf(m, "vendor_id\t: User Mode Linux\n");
seq_printf(m, "model name\t: UML\n");
seq_printf(m, "mode\t\t: skas\n");
seq_printf(m, "host\t\t: %s\n", host_info);
- seq_printf(m, "bogomips\t: %lu.%02lu\n\n",
+ seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU)));
+ seq_printf(m, "flags\t\t:");
+ for (i = 0; i < 32*NCAPINTS; i++)
+ if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL))
+ seq_printf(m, " %s", x86_cap_flags[i]);
+ seq_printf(m, "\n");
+ seq_printf(m, "cache_alignment\t: %d\n", boot_cpu_data.cache_alignment);
+ seq_printf(m, "bogomips\t: %lu.%02lu\n",
loops_per_jiffy/(500000/HZ),
(loops_per_jiffy/(5000/HZ)) % 100);
@@ -85,7 +99,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static void *c_start(struct seq_file *m, loff_t *pos)
{
- return *pos < NR_CPUS ? cpu_data + *pos : NULL;
+ if (*pos < nr_cpu_ids)
+ return (void *)(uintptr_t)(*pos + 1);
+ return NULL;
}
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
@@ -113,14 +129,13 @@ unsigned long uml_reserved; /* Also modified in mem_init */
unsigned long start_vm;
unsigned long end_vm;
-/* Set in uml_ncpus_setup */
-int ncpus = 1;
-
/* Set in early boot */
-static int have_root __initdata = 0;
+static int have_root __initdata;
+static int have_console __initdata;
/* Set in uml_mem_setup and modified in linux_main */
-long long physmem_size = 32 * 1024 * 1024;
+unsigned long long physmem_size = 64 * 1024 * 1024;
+EXPORT_SYMBOL(physmem_size);
static const char *usage_string =
"User Mode Linux v%s\n"
@@ -128,6 +143,7 @@ static const char *usage_string =
static int __init uml_version_setup(char *line, int *add)
{
+ /* Explicitly use printf() to show version in stdout */
printf("%s\n", init_utsname()->release);
exit(0);
@@ -154,42 +170,24 @@ __uml_setup("root=", uml_root_setup,
" root=/dev/ubd5\n\n"
);
-static int __init no_skas_debug_setup(char *line, int *add)
+static int __init uml_console_setup(char *line, int *add)
{
- printf("'debug' is not necessary to gdb UML in skas mode - run \n");
- printf("'gdb linux'\n");
-
+ have_console = 1;
return 0;
}
-__uml_setup("debug", no_skas_debug_setup,
-"debug\n"
-" this flag is not needed to run gdb on UML in skas mode\n\n"
+__uml_setup("console=", uml_console_setup,
+"console=<preferred console>\n"
+" Specify the preferred console output driver\n\n"
);
-#ifdef CONFIG_SMP
-static int __init uml_ncpus_setup(char *line, int *add)
-{
- if (!sscanf(line, "%d", &ncpus)) {
- printf("Couldn't parse [%s]\n", line);
- return -1;
- }
-
- return 0;
-}
-
-__uml_setup("ncpus=", uml_ncpus_setup,
-"ncpus=<# of desired CPUs>\n"
-" This tells an SMP kernel how many virtual processors to start.\n\n"
-);
-#endif
-
static int __init Usage(char *line, int *add)
{
const char **p;
printf(usage_string, init_utsname()->release);
p = &__uml_help_start;
+ /* Explicitly use printf() to show help in stdout */
while (p < &__uml_help_end) {
printf("%s", *p);
p++;
@@ -233,42 +231,86 @@ static void __init uml_postsetup(void)
static int panic_exit(struct notifier_block *self, unsigned long unused1,
void *unused2)
{
+ kmsg_dump(KMSG_DUMP_PANIC);
bust_spinlocks(1);
- show_regs(&(current->thread.regs));
bust_spinlocks(0);
uml_exitcode = 1;
os_dump_core();
- return 0;
+
+ return NOTIFY_DONE;
}
static struct notifier_block panic_exit_notifier = {
- .notifier_call = panic_exit,
- .next = NULL,
- .priority = 0
+ .notifier_call = panic_exit,
+ .priority = INT_MAX - 1, /* run as 2nd notifier, won't return */
};
+void uml_finishsetup(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &panic_exit_notifier);
+
+ uml_postsetup();
+
+ new_thread_handler();
+}
+
/* Set during early boot */
+unsigned long stub_start;
unsigned long task_size;
EXPORT_SYMBOL(task_size);
-unsigned long host_task_size;
-
unsigned long brk_start;
-unsigned long end_iomem;
-EXPORT_SYMBOL(end_iomem);
#define MIN_VMALLOC (32 * 1024 * 1024)
-extern char __binary_start;
+static void __init parse_host_cpu_flags(char *line)
+{
+ int i;
+ for (i = 0; i < 32*NCAPINTS; i++) {
+ if ((x86_cap_flags[i] != NULL) && strstr(line, x86_cap_flags[i]))
+ set_cpu_cap(&boot_cpu_data, i);
+ }
+}
+
+static void __init parse_cache_line(char *line)
+{
+ long res;
+ char *to_parse = strstr(line, ":");
+ if (to_parse) {
+ to_parse++;
+ while (*to_parse != 0 && isspace(*to_parse)) {
+ to_parse++;
+ }
+ if (kstrtoul(to_parse, 10, &res) == 0 && is_power_of_2(res))
+ boot_cpu_data.cache_alignment = res;
+ else
+ boot_cpu_data.cache_alignment = L1_CACHE_BYTES;
+ }
+}
-int __init linux_main(int argc, char **argv)
+static unsigned long __init get_top_address(char **envp)
+{
+ unsigned long top_addr = (unsigned long) &top_addr;
+ int i;
+
+ /* The earliest variable should be after the program name in ELF */
+ for (i = 0; envp[i]; i++) {
+ if ((unsigned long) envp[i] > top_addr)
+ top_addr = (unsigned long) envp[i];
+ }
+
+ return PAGE_ALIGN(top_addr + 1);
+}
+
+int __init linux_main(int argc, char **argv, char **envp)
{
unsigned long avail, diff;
unsigned long virtmem_size, max_physmem;
+ unsigned long host_task_size;
unsigned long stack;
unsigned int i;
int add;
- char * mode;
for (i = 1; i < argc; i++) {
if ((i == 1) && (argv[i][0] == ' '))
@@ -279,26 +321,31 @@ int __init linux_main(int argc, char **argv)
add_arg(argv[i]);
}
if (have_root == 0)
- add_arg(DEFAULT_COMMAND_LINE);
+ add_arg(DEFAULT_COMMAND_LINE_ROOT);
+
+ if (have_console == 0)
+ add_arg(DEFAULT_COMMAND_LINE_CONSOLE);
+
+ host_task_size = get_top_address(envp);
+ /* reserve a few pages for the stubs */
+ stub_start = host_task_size - STUB_SIZE;
+ host_task_size = stub_start;
+
+ /* Limit TASK_SIZE to what is addressable by the page table */
+ task_size = host_task_size;
+ if (task_size > (unsigned long long) PTRS_PER_PGD * PGDIR_SIZE)
+ task_size = PTRS_PER_PGD * PGDIR_SIZE;
- host_task_size = os_get_top_address();
/*
* TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps
* out
*/
- task_size = host_task_size & PGDIR_MASK;
+ task_size = task_size & PGDIR_MASK;
/* OS sanity checks that need to happen before the kernel runs */
os_early_checks();
- can_do_skas();
-
- if (proc_mm && ptrace_faultinfo)
- mode = "SKAS3";
- else
- mode = "SKAS0";
-
- printf("UML running in %s mode\n", mode);
+ get_host_cpu_features(parse_host_cpu_flags, parse_cache_line);
brk_start = (unsigned long) sbrk(0);
@@ -307,54 +354,32 @@ int __init linux_main(int argc, char **argv)
* so they actually get what they asked for. This should
* add zero for non-exec shield users
*/
-
- diff = UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end);
+ diff = PAGE_ALIGN(brk_start) - PAGE_ALIGN((unsigned long) &_end);
if (diff > 1024 * 1024) {
- printf("Adding %ld bytes to physical memory to account for "
- "exec-shield gap\n", diff);
- physmem_size += UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end);
+ os_info("Adding %ld bytes to physical memory to account for "
+ "exec-shield gap\n", diff);
+ physmem_size += diff;
}
- uml_physmem = (unsigned long) &__binary_start & PAGE_MASK;
+ uml_physmem = (unsigned long) __binary_start & PAGE_MASK;
/* Reserve up to 4M after the current brk */
uml_reserved = ROUND_4M(brk_start) + (1 << 22);
setup_machinename(init_utsname()->machine);
- highmem = 0;
- iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK;
- max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC;
-
- /*
- * Zones have to begin on a 1 << MAX_ORDER page boundary,
- * so this makes sure that's true for highmem
- */
- max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1);
- if (physmem_size + iomem_size > max_physmem) {
- highmem = physmem_size + iomem_size - max_physmem;
- physmem_size -= highmem;
-#ifndef CONFIG_HIGHMEM
- highmem = 0;
- printf("CONFIG_HIGHMEM not enabled - physical memory shrunk "
- "to %Lu bytes\n", physmem_size);
-#endif
+ physmem_size = PAGE_ALIGN(physmem_size);
+ max_physmem = TASK_SIZE - uml_physmem - MIN_VMALLOC;
+ if (physmem_size > max_physmem) {
+ physmem_size = max_physmem;
+ os_info("Physical memory size shrunk to %llu bytes\n",
+ physmem_size);
}
high_physmem = uml_physmem + physmem_size;
- end_iomem = high_physmem + iomem_size;
- high_memory = (void *) end_iomem;
start_vm = VMALLOC_START;
- setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem);
- if (init_maps(physmem_size, iomem_size, highmem)) {
- printf("Failed to allocate mem_map for %Lu bytes of physical "
- "memory and %Lu bytes of highmem\n", physmem_size,
- highmem);
- exit(1);
- }
-
virtmem_size = physmem_size;
stack = (unsigned long) argv;
stack &= ~(1024 * 1024 - 1);
@@ -364,39 +389,70 @@ int __init linux_main(int argc, char **argv)
end_vm = start_vm + virtmem_size;
if (virtmem_size < physmem_size)
- printf("Kernel virtual memory size shrunk to %lu bytes\n",
- virtmem_size);
+ os_info("Kernel virtual memory size shrunk to %lu bytes\n",
+ virtmem_size);
- atomic_notifier_chain_register(&panic_notifier_list,
- &panic_exit_notifier);
-
- uml_postsetup();
+ arch_task_struct_size = sizeof(struct task_struct) + host_fp_size;
- stack_protections((unsigned long) &init_thread_info);
os_flush_stdout();
return start_uml();
}
+int __init __weak read_initrd(void)
+{
+ return 0;
+}
+
void __init setup_arch(char **cmdline_p)
{
+ u8 rng_seed[32];
+
+ stack_protections((unsigned long) init_task.stack);
+ setup_physmem(uml_physmem, uml_reserved, physmem_size);
+ uml_dtb_init();
+ read_initrd();
+
paging_init();
- strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
setup_hostinfo(host_info, sizeof host_info);
+ prefill_possible_map();
+
+ if (os_getrandom(rng_seed, sizeof(rng_seed), 0) == sizeof(rng_seed)) {
+ add_bootloader_randomness(rng_seed, sizeof(rng_seed));
+ memzero_explicit(rng_seed, sizeof(rng_seed));
+ }
}
-void __init check_bugs(void)
+void __init arch_cpu_finalize_init(void)
{
arch_check_bugs();
os_check_bugs();
}
+void apply_seal_endbr(s32 *start, s32 *end)
+{
+}
+
+void apply_retpolines(s32 *start, s32 *end)
+{
+}
+
+void apply_returns(s32 *start, s32 *end)
+{
+}
+
+void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi)
+{
+}
+
void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
}
-#ifdef CONFIG_SMP
+#if IS_ENABLED(CONFIG_SMP)
void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
void *text, void *text_end)
@@ -407,3 +463,90 @@ void alternatives_smp_module_del(struct module *mod)
{
}
#endif
+
+void *text_poke(void *addr, const void *opcode, size_t len)
+{
+ /*
+ * In UML, the only reference to this function is in
+ * apply_relocate_add(), which shouldn't ever actually call this
+ * because UML doesn't have live patching.
+ */
+ WARN_ON(1);
+
+ return memcpy(addr, opcode, len);
+}
+
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ return text_poke(addr, opcode, len);
+}
+
+void smp_text_poke_sync_each_cpu(void)
+{
+}
+
+void uml_pm_wake(void)
+{
+ pm_system_wakeup();
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int um_suspend_valid(suspend_state_t state)
+{
+ return state == PM_SUSPEND_MEM;
+}
+
+static int um_suspend_prepare(void)
+{
+ um_irqs_suspend();
+ return 0;
+}
+
+static int um_suspend_enter(suspend_state_t state)
+{
+ if (WARN_ON(state != PM_SUSPEND_MEM))
+ return -EINVAL;
+
+ /*
+ * This is identical to the idle sleep, but we've just
+ * (during suspend) turned off all interrupt sources
+ * except for the ones we want, so now we can only wake
+ * up on something we actually want to wake up on. All
+ * timing has also been suspended.
+ */
+ um_idle_sleep();
+ return 0;
+}
+
+static void um_suspend_finish(void)
+{
+ um_irqs_resume();
+}
+
+const struct platform_suspend_ops um_suspend_ops = {
+ .valid = um_suspend_valid,
+ .prepare = um_suspend_prepare,
+ .enter = um_suspend_enter,
+ .finish = um_suspend_finish,
+};
+
+static int init_pm_wake_signal(void)
+{
+ /*
+ * In external time-travel mode we can't use signals to wake up
+ * since that would mess with the scheduling. We'll have to do
+ * some additional work to support wakeup on virtio devices or
+ * similar, perhaps implementing a fake RTC controller that can
+ * trigger wakeup (and request the appropriate scheduling from
+ * the external scheduler when going to suspend.)
+ */
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ register_pm_wake_signal();
+
+ suspend_set_ops(&um_suspend_ops);
+
+ return 0;
+}
+
+late_initcall(init_pm_wake_signal);
+#endif
diff --git a/arch/um/kernel/um_arch.h b/arch/um/kernel/um_arch.h
new file mode 100644
index 000000000000..46e731ab9dfc
--- /dev/null
+++ b/arch/um/kernel/um_arch.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __UML_ARCH_H__
+#define __UML_ARCH_H__
+
+extern void * __init uml_load_file(const char *filename, unsigned long long *size);
+
+#ifdef CONFIG_OF
+extern void __init uml_dtb_init(void);
+#else
+static inline void uml_dtb_init(void) { }
+#endif
+
+extern int __init read_initrd(void);
+
+#endif
diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c
index f6cc3bd61781..72bc60ade347 100644
--- a/arch/um/kernel/umid.c
+++ b/arch/um/kernel/umid.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
*/
#include <asm/errno.h>
@@ -9,21 +9,21 @@
#include <os.h>
/* Changed by set_umid_arg */
-static int umid_inited = 0;
+static int umid_inited;
static int __init set_umid_arg(char *name, int *add)
{
int err;
if (umid_inited) {
- printf("umid already set\n");
+ os_warn("umid already set\n");
return 0;
}
*add = 0;
err = set_umid(name);
if (err == -EEXIST)
- printf("umid '%s' already in use\n", name);
+ os_warn("umid '%s' already in use\n", name);
else if (!err)
umid_inited = 1;
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 6899195602b7..a409d4b66114 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -1,4 +1,5 @@
-#include <asm-generic/vmlinux.lds.h>
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/vmlinux.lds.h>
#include <asm/page.h>
OUTPUT_FORMAT(ELF_FORMAT)
@@ -6,6 +7,12 @@ OUTPUT_ARCH(ELF_ARCH)
ENTRY(_start)
jiffies = jiffies_64;
+VERSION {
+ {
+ local: *;
+ };
+}
+
SECTIONS
{
/* This must contain the right address - not quite the default ELF one.*/
@@ -18,10 +25,10 @@ SECTIONS
__binary_start = START;
. = START + SIZEOF_HEADERS;
+ . = ALIGN(PAGE_SIZE);
_text = .;
INIT_TEXT_SECTION(0)
- . = ALIGN(PAGE_SIZE);
.text :
{
@@ -29,6 +36,8 @@ SECTIONS
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
+ IRQENTRY_TEXT
+ SOFTIRQENTRY_TEXT
*(.fixup)
/* .gnu.warning sections are handled specially by elf32.em. */
*(.gnu.warning)
@@ -68,8 +77,6 @@ SECTIONS
.data :
{
INIT_TASK_DATA(KERNEL_STACK_SIZE)
- . = ALIGN(KERNEL_STACK_SIZE);
- *(.data..init_irqstack)
DATA_DATA
*(.gnu.linkonce.d*)
CONSTRUCTORS
@@ -85,6 +92,7 @@ SECTIONS
}
.got : { *(.got.plt) *(.got) }
+ .eh_frame : { KEEP (*(.eh_frame)) }
.dynamic : { *(.dynamic) }
.tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) }
.tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) }
@@ -104,8 +112,8 @@ SECTIONS
PROVIDE (end = .);
STABS_DEBUG
-
DWARF_DEBUG
+ ELF_DETAILS
DISCARDS
}
diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S
index 16e49bfa2b42..53d719c04ba9 100644
--- a/arch/um/kernel/vmlinux.lds.S
+++ b/arch/um/kernel/vmlinux.lds.S
@@ -1,4 +1,4 @@
-
+#define RUNTIME_DISCARD_EXIT
KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER);
#ifdef CONFIG_LD_SCRIPT_STATIC