Diffstat (limited to 'arch/um/kernel')
44 files changed, 3957 insertions, 2403 deletions
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index babe21826e3e..be60bc451b3f 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -1,30 +1,37 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux,intel}.com) -# Licensed under the GPL # +# Don't instrument UML-specific code; without this, we may crash when +# accessing the instrumentation buffer for the first time from the +# kernel. +KCOV_INSTRUMENT := n + CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ -DELF_ARCH=$(LDS_ELF_ARCH) \ -DELF_FORMAT=$(LDS_ELF_FORMAT) \ $(LDS_EXTRA) -extra-y := vmlinux.lds -clean-files := +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ - signal.o smp.o syscall.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o skas/ + signal.o sysrq.o time.o tlb.o trap.o \ + um_arch.o umid.o kmsg_dump.o capflags.o skas/ +obj-y += load_file.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o -obj-$(CONFIG_GCOV) += gmon_syms.o +obj-$(CONFIG_OF) += dtb.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_SMP) += smp.o USER_OBJS := config.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules -targets := config.c config.tmp +targets := config.c config.tmp capflags.c # Be careful with the below Sed code - sed is pitfall-rich! # We use sed to lower build requirements, for "embedded" builders for instance. @@ -39,6 +46,15 @@ quiet_cmd_quote1 = QUOTE $@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE $(call if_changed,quote2) +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ + +cpufeature = $(src)/../../x86/include/asm/cpufeatures.h +vmxfeature = $(src)/../../x86/include/asm/vmxfeatures.h + +$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/../../x86/kernel/cpu/mkcapflags.sh FORCE + $(call if_changed,mkcapflags) + quiet_cmd_quote2 = QUOTE $@ cmd_quote2 = sed -e '/CONFIG/{' \ -e 's/"CONFIG"//' \ diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c index 1fb12235ab9c..d620b6f6de9b 100644 --- a/arch/um/kernel/asm-offsets.c +++ b/arch/um/kernel/asm-offsets.c @@ -1 +1,49 @@ -#include <sysdep/kernel-offsets.h> +/* SPDX-License-Identifier: GPL-2.0 */ +#define COMPILE_OFFSETS +#include <linux/stddef.h> +#include <linux/sched.h> +#include <linux/elf.h> +#include <linux/crypto.h> +#include <linux/kbuild.h> +#include <linux/audit.h> +#include <linux/fs.h> +#include <asm/mman.h> +#include <asm/seccomp.h> +#include <asm/extable.h> + +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + +void foo(void) +{ + DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE); + + DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE); + DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK); + DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT); + + DEFINE(UM_GFP_KERNEL, GFP_KERNEL); + DEFINE(UM_GFP_ATOMIC, GFP_ATOMIC); + + DEFINE(UM_THREAD_SIZE, THREAD_SIZE); + + DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); + DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); + + DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES); + + DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE); + + DEFINE(HOSTFS_ATTR_MODE, ATTR_MODE); + DEFINE(HOSTFS_ATTR_UID, ATTR_UID); + DEFINE(HOSTFS_ATTR_GID, ATTR_GID); + DEFINE(HOSTFS_ATTR_SIZE, ATTR_SIZE); + DEFINE(HOSTFS_ATTR_ATIME, ATTR_ATIME); + DEFINE(HOSTFS_ATTR_MTIME, ATTR_MTIME); + 
DEFINE(HOSTFS_ATTR_CTIME, ATTR_CTIME); + DEFINE(HOSTFS_ATTR_ATIME_SET, ATTR_ATIME_SET); + DEFINE(HOSTFS_ATTR_MTIME_SET, ATTR_MTIME_SET); + + DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr)); + DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry)); +} diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in index 972bf1659564..3ece3c3b31cc 100644 --- a/arch/um/kernel/config.c.in +++ b/arch/um/kernel/config.c.in @@ -1,6 +1,6 @@ -/* +// SPDX-License-Identifier: GPL-2.0 +/* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #include <stdio.h> diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c new file mode 100644 index 000000000000..47cd3d869fb2 --- /dev/null +++ b/arch/um/kernel/dtb.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/init.h> +#include <linux/of_fdt.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <init.h> + +#include "um_arch.h" + +static char *dtb __initdata; + +void uml_dtb_init(void) +{ + long long size; + void *area; + + area = uml_load_file(dtb, &size); + if (area) { + if (!early_init_dt_scan(area, __pa(area))) { + pr_err("invalid DTB %s\n", dtb); + memblock_free(area, size); + return; + } + + early_init_fdt_scan_reserved_mem(); + } + + unflatten_device_tree(); +} + +static int __init uml_dtb_setup(char *line, int *add) +{ + *add = 0; + dtb = line; + return 0; +} + +__uml_setup("dtb=", uml_dtb_setup, +"dtb=<file>\n" +" Boot the kernel with the devicetree blob from the specified file.\n\n" +); diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index adde088aeeff..a36b7918a011 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -1,4 +1,4 @@ -#include <asm-generic/vmlinux.lds.h> +#include <asm/vmlinux.lds.h> #include <asm/page.h> OUTPUT_FORMAT(ELF_FORMAT) @@ -6,6 +6,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { PROVIDE (__executable_start = START); @@ -69,6 +75,8 @@ SECTIONS TEXT_TEXT SCHED_TEXT LOCK_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.stub .text.* .gnu.linkonce.t.*) /* .gnu.warning sections are handled specially by elf32.em. */ @@ -100,12 +108,14 @@ SECTIONS be empty, which isn't pretty. */ . = ALIGN(32 / 8); .preinit_array : { *(.preinit_array) } - .init_array : { *(.init_array) } + .init_array : { + *(.kasan_init) + *(.init_array.*) + *(.init_array) + } .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) @@ -161,8 +171,11 @@ SECTIONS PROVIDE (end = .); STABS_DEBUG - DWARF_DEBUG + ELF_DETAILS DISCARDS } + +ASSERT(__syscall_stub_end - __syscall_stub_start <= PAGE_SIZE, + "STUB code must not be larger than one page"); diff --git a/arch/um/kernel/early_printk.c b/arch/um/kernel/early_printk.c index 4a0800bc37b2..c350c2331bbe 100644 --- a/arch/um/kernel/early_printk.c +++ b/arch/um/kernel/early_printk.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/kernel.h> diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 0d7103c9eff3..13812fa97eee 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -1,50 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/stddef.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/ptrace.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/slab.h> #include <asm/current.h> #include <asm/processor.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <as-layout.h> #include <mem_user.h> +#include <registers.h> #include <skas.h> #include <os.h> void flush_thread(void) { - void *data = NULL; - int ret; - arch_flush_thread(¤t->thread.arch); - ret = unmap(¤t->mm->context.id, 0, STUB_START, 0, &data); - ret = ret || unmap(¤t->mm->context.id, STUB_END, - host_task_size - STUB_END, 1, &data); - if (ret) { - printk(KERN_ERR "flush_thread - clearing address space failed, " - "err = %d\n", ret); - force_sig(SIGKILL, current); - } get_safe_registers(current_pt_regs()->regs.gp, current_pt_regs()->regs.fp); - - __switch_mm(¤t->mm->context.id); } void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) { PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; - current->ptrace &= ~PT_DTRACE; -#ifdef SUBARCH_EXECVE1 - SUBARCH_EXECVE1(regs->regs); -#endif + clear_thread_flag(TIF_SINGLESTEP); } EXPORT_SYMBOL(start_thread); diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c index 829df49dee99..43edc2aa57e4 100644 --- a/arch/um/kernel/exitcode.c +++ b/arch/um/kernel/exitcode.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/ctype.h> @@ -10,7 +10,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * If read and write race, the read will still atomically read a valid @@ -40,9 +40,11 @@ static ssize_t exitcode_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { char *end, buf[sizeof("nnnnn\0")]; + size_t size; int tmp; - if (copy_from_user(buf, buffer, count)) + size = min(count, sizeof(buf)); + if (copy_from_user(buf, buffer, size)) return -EFAULT; tmp = simple_strtol(buf, &end, 0); @@ -53,20 +55,19 @@ static ssize_t exitcode_proc_write(struct file *file, return count; } -static const struct file_operations exitcode_proc_fops = { - .owner = THIS_MODULE, - .open = exitcode_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = exitcode_proc_write, +static const struct proc_ops exitcode_proc_ops = { + .proc_open = exitcode_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = exitcode_proc_write, }; static int make_proc_exitcode(void) { struct proc_dir_entry *ent; - ent = proc_create("exitcode", 0600, NULL, &exitcode_proc_fops); + ent = proc_create("exitcode", 0600, NULL, &exitcode_proc_ops); if (ent == NULL) { printk(KERN_WARNING "make_proc_exitcode : Failed to register " "/proc/exitcode\n"); diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c deleted file mode 100644 index 1bf61266da8e..000000000000 --- a/arch/um/kernel/gmon_syms.c +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Copyright 
(C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/module.h> - -extern void __bb_init_func(void *) __attribute__((weak)); -EXPORT_SYMBOL(__bb_init_func); diff --git a/arch/um/kernel/gprof_syms.c b/arch/um/kernel/gprof_syms.c index 74ddb44288a3..84d536908775 100644 --- a/arch/um/kernel/gprof_syms.c +++ b/arch/um/kernel/gprof_syms.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 55cead809b18..99dba827461c 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -1,46 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/initrd.h> #include <asm/types.h> #include <init.h> #include <os.h> +#include "um_arch.h" + /* Changed by uml_initrd_setup, which is a setup */ static char *initrd __initdata = NULL; -static int load_initrd(char *filename, void *buf, int size); -static int __init read_initrd(void) +int __init read_initrd(void) { + unsigned long long size; void *area; - long long size; - int err; - - if (initrd == NULL) - return 0; - - err = os_file_size(initrd, &size); - if (err) - return 0; - - /* - * This is necessary because alloc_bootmem craps out if you - * ask for no memory. - */ - if (size == 0) { - printk(KERN_ERR "\"%s\" is a zero-size initrd\n", initrd); - return 0; - } - area = alloc_bootmem(size); - if (area == NULL) + if (!initrd) return 0; - if (load_initrd(initrd, area, size) == -1) + area = uml_load_file(initrd, &size); + if (!area) return 0; initrd_start = (unsigned long) area; @@ -48,10 +32,9 @@ static int __init read_initrd(void) return 0; } -__uml_postsetup(read_initrd); - static int __init uml_initrd_setup(char *line, int *add) { + *add = 0; initrd = line; return 0; } @@ -61,25 +44,3 @@ __uml_setup("initrd=", uml_initrd_setup, " This is used to boot UML from an initrd image. The argument is the\n" " name of the file containing the image.\n\n" ); - -static int load_initrd(char *filename, void *buf, int size) -{ - int fd, n; - - fd = os_open_file(filename, of_read(OPENFLAGS()), 0); - if (fd < 0) { - printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename, - -fd); - return -1; - } - n = os_read_file(fd, buf, size); - if (n != size) { - printk(KERN_ERR "Read of %d bytes from '%s' failed, " - "err = %d\n", size, - filename, -n); - return -1; - } - - os_close_file(fd); - return 0; -} diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 36e12f0cefd5..f4b13f15a9c1 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -1,6 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2017 - Cambridge Greys Ltd + * Copyright (C) 2011 - 2014 Cisco Systems Inc * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL * Derived (i.e. 
mostly copied) from arch/i386/kernel/irq.c: * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar */ @@ -16,245 +18,424 @@ #include <as-layout.h> #include <kern_util.h> #include <os.h> +#include <irq_user.h> +#include <irq_kern.h> +#include <linux/time-internal.h> +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); + +#define irq_stats(x) (&per_cpu(irq_stat, x)) + +/* When epoll triggers we do not know why it did so + * we can also have different IRQs for read and write. + * This is why we keep a small irq_reg array for each fd - + * one entry per IRQ type + */ +struct irq_reg { + void *id; + int irq; + /* it's cheaper to store this than to query it */ + int events; + bool active; + bool pending; + bool wakeup; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + bool pending_event; + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *); + struct time_travel_event event; +#endif +}; + +struct irq_entry { + struct list_head list; + int fd; + struct irq_reg reg[NUM_IRQ_TYPES]; + bool suspended; + bool sigio_workaround; +}; + +static DEFINE_RAW_SPINLOCK(irq_lock); +static LIST_HEAD(active_fds); +static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); +static bool irqs_suspended; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static bool irqs_pending; +#endif + +static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) +{ /* - * This list is accessed under irq_lock, except in sigio_handler, - * where it is safe from being modified. IRQ handlers won't change it - - * if an IRQ source has vanished, it will be freed by free_irqs just - * before returning from sigio_handler. That will process a separate - * list of irqs to free, with its own locking, coming back here to - * remove list elements, taking the irq_lock to do so. + * irq->active guards against reentry + * irq->pending accumulates pending requests + * if pending is raised the irq_handler is re-run + * until pending is cleared */ -static struct irq_fd *active_fds = NULL; -static struct irq_fd **last_irq_ptr = &active_fds; + if (irq->active) { + irq->active = false; -extern void free_irqs(void); + do { + irq->pending = false; + do_IRQ(irq->irq, regs); + } while (irq->pending); + + irq->active = true; + } else { + irq->pending = true; + } +} -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static void irq_event_handler(struct time_travel_event *ev) { - struct irq_fd *irq_fd; - int n; + struct irq_reg *reg = container_of(ev, struct irq_reg, event); - if (smp_sigio_handler()) + /* do nothing if suspended; just cause a wakeup and mark as pending */ + if (irqs_suspended) { + irqs_pending = true; + reg->pending_event = true; return; + } - while (1) { - n = os_waiting_for_events(active_fds); - if (n <= 0) { - if (n == -EINTR) - continue; - else break; - } + generic_handle_irq(reg->irq); +} + +static bool irq_do_timetravel_handler(struct irq_entry *entry, + enum um_irq_type t) +{ + struct irq_reg *reg = &entry->reg[t]; + + if (!reg->timetravel_handler) + return false; + + /* + * Handle all messages - we might get multiple even while + * interrupts are already suspended, due to suspend order + * etc. Note that time_travel_add_irq_event() will not add + * an event twice, if it's pending already "first wins". 
+ */ + reg->timetravel_handler(reg->irq, entry->fd, reg->id, ®->event); - for (irq_fd = active_fds; irq_fd != NULL; - irq_fd = irq_fd->next) { - if (irq_fd->current_events != 0) { - irq_fd->current_events = 0; - do_IRQ(irq_fd->irq, regs); + if (!reg->event.pending) + return false; + + return true; +} + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ + struct irq_entry *entry; + + if (!irqs_pending || timetravel_handlers_only) + return; + + irqs_pending = false; + + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + struct irq_reg *reg = &entry->reg[t]; + + /* + * Any timetravel_handler was invoked already, just + * directly run the IRQ. + */ + if (reg->pending_event) { + irq_enter(); + generic_handle_irq(reg->irq); + irq_exit(); + reg->pending_event = false; } } } - - free_irqs(); +} +#else +static bool irq_do_timetravel_handler(struct irq_entry *entry, + enum um_irq_type t) +{ + return false; } -static DEFINE_SPINLOCK(irq_lock); +static void irq_do_pending_events(bool timetravel_handlers_only) +{ +} +#endif -static int activate_fd(int irq, int fd, int type, void *dev_id) +static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t, + struct uml_pt_regs *regs, + bool timetravel_handlers_only) { - struct pollfd *tmp_pfd; - struct irq_fd *new_fd, *irq_fd; - unsigned long flags; - int events, err, n; + struct irq_reg *reg = &entry->reg[t]; - err = os_set_fd_async(fd); - if (err < 0) - goto out; + if (!reg->events) + return; - err = -ENOMEM; - new_fd = kmalloc(sizeof(struct irq_fd), GFP_KERNEL); - if (new_fd == NULL) - goto out; + if (os_epoll_triggered(idx, reg->events) <= 0) + return; - if (type == IRQ_READ) - events = UM_POLLIN | UM_POLLPRI; - else events = UM_POLLOUT; - *new_fd = ((struct irq_fd) { .next = NULL, - .id = dev_id, - .fd = fd, - .type = type, - .irq = irq, - .events = events, - .current_events = 0 } ); - - err = -EBUSY; - spin_lock_irqsave(&irq_lock, flags); - for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) { - if ((irq_fd->fd == fd) && (irq_fd->type == type)) { - printk(KERN_ERR "Registering fd %d twice\n", fd); - printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq); - printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id, - dev_id); - goto out_unlock; - } + if (irq_do_timetravel_handler(entry, t)) + return; + + /* + * If we're called to only run time-travel handlers then don't + * actually proceed but mark sigio as pending (if applicable). + * For suspend/resume, timetravel_handlers_only may be true + * despite time-travel not being configured and used. + */ + if (timetravel_handlers_only) { +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + reg->pending_event = true; + irqs_pending = true; + mark_sigio_pending(); +#endif + return; } - if (type == IRQ_WRITE) - fd = -1; + irq_io_loop(reg, regs); +} - tmp_pfd = NULL; - n = 0; +static void _sigio_handler(struct uml_pt_regs *regs, + bool timetravel_handlers_only) +{ + struct irq_entry *irq_entry; + int n, i; - while (1) { - n = os_create_pollfd(fd, events, tmp_pfd, n); - if (n == 0) - break; - - /* - * n > 0 - * It means we couldn't put new pollfd to current pollfds - * and tmp_fds is NULL or too small for new pollfds array. - * Needed size is equal to n as minimum. - * - * Here we have to drop the lock in order to call - * kmalloc, which might sleep. - * If something else came in and changed the pollfds array - * so we will not be able to put new pollfd struct to pollfds - * then we free the buffer tmp_fds and try again. 
- */ - spin_unlock_irqrestore(&irq_lock, flags); - kfree(tmp_pfd); + if (timetravel_handlers_only && !um_irq_timetravel_handler_used()) + return; - tmp_pfd = kmalloc(n, GFP_KERNEL); - if (tmp_pfd == NULL) - goto out_kfree; + /* Flush out pending events that were ignored due to time-travel. */ + if (!irqs_suspended) + irq_do_pending_events(timetravel_handlers_only); - spin_lock_irqsave(&irq_lock, flags); - } + while (1) { + /* This is now lockless - epoll keeps back-referencesto the irqs + * which have trigger it so there is no need to walk the irq + * list and lock it every time. We avoid locking by turning off + * IO for a specific fd by executing os_del_epoll_fd(fd) before + * we do any changes to the actual data structures + */ + n = os_waiting_for_events_epoll(); - *last_irq_ptr = new_fd; - last_irq_ptr = &new_fd->next; + if (n <= 0) { + if (n == -EINTR) + continue; + else + break; + } - spin_unlock_irqrestore(&irq_lock, flags); + for (i = 0; i < n ; i++) { + enum um_irq_type t; - /* - * This calls activate_fd, so it has to be outside the critical - * section. - */ - maybe_sigio_broken(fd, (type == IRQ_READ)); + irq_entry = os_epoll_get_data_pointer(i); - return 0; + for (t = 0; t < NUM_IRQ_TYPES; t++) + sigio_reg_handler(i, irq_entry, t, regs, + timetravel_handlers_only); + } + } - out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); - out_kfree: - kfree(new_fd); - out: - return err; + if (!timetravel_handlers_only) + free_irqs(); } -static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg) +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { - unsigned long flags; - - spin_lock_irqsave(&irq_lock, flags); - os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr); - spin_unlock_irqrestore(&irq_lock, flags); + preempt_disable(); + _sigio_handler(regs, irqs_suspended); + preempt_enable(); } -struct irq_and_dev { - int irq; - void *dev; -}; - -static int same_irq_and_dev(struct irq_fd *irq, void *d) +static struct irq_entry *get_irq_entry_by_fd(int fd) { - struct irq_and_dev *data = d; + struct irq_entry *walk; - return ((irq->irq == data->irq) && (irq->id == data->dev)); + lockdep_assert_held(&irq_lock); + + list_for_each_entry(walk, &active_fds, list) { + if (walk->fd == fd) + return walk; + } + + return NULL; } -static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { - struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq, - .dev = dev }); + if (!to_free) + return; - free_irq_by_cb(same_irq_and_dev, &data); + if (remove) + os_del_epoll_fd(to_free->fd); + list_del(&to_free->list); } -static int same_fd(struct irq_fd *irq, void *fd) +static bool update_irq_entry(struct irq_entry *entry) { - return (irq->fd == *((int *)fd)); + enum um_irq_type i; + int events = 0; + + for (i = 0; i < NUM_IRQ_TYPES; i++) + events |= entry->reg[i].events; + + if (events) { + /* will modify (instead of add) if needed */ + os_add_epoll_fd(events, entry->fd, entry); + return true; + } + + os_del_epoll_fd(entry->fd); + return false; } -void free_irq_by_fd(int fd) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - free_irq_by_cb(same_fd, &fd); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } -/* Must be called with irq_lock held */ -static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) +static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, + 
void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) { - struct irq_fd *irq; - int i = 0; - int fdi; + struct irq_entry *irq_entry, *to_free = NULL; + int err, events = os_event_mask(type); + unsigned long flags; - for (irq = active_fds; irq != NULL; irq = irq->next) { - if ((irq->fd == fd) && (irq->irq == irqnum)) - break; - i++; - } - if (irq == NULL) { - printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n", - fd); + err = os_set_fd_async(fd); + if (err < 0) goto out; + + raw_spin_lock_irqsave(&irq_lock, flags); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { +already: + /* cannot register the same FD twice with the same type */ + if (WARN_ON(irq_entry->reg[type].events)) { + err = -EALREADY; + goto out_unlock; + } + + /* temporarily disable to avoid IRQ-side locking */ + os_del_epoll_fd(fd); + } else { + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM; + } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; + } + irq_entry = new; + irq_entry->fd = fd; + list_add_tail(&irq_entry->list, &active_fds); + maybe_sigio_broken(fd); } - fdi = os_get_pollfd(i); - if ((fdi != -1) && (fdi != fd)) { - printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds " - "and pollfds, fd %d vs %d, need %d\n", irq->fd, - fdi, fd); - irq = NULL; - goto out; + + irq_entry->reg[type].id = dev_id; + irq_entry->reg[type].irq = irq; + irq_entry->reg[type].active = true; + irq_entry->reg[type].events = events; + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + if (um_irq_timetravel_handler_used()) { + irq_entry->reg[type].timetravel_handler = timetravel_handler; + irq_entry->reg[type].event.fn = irq_event_handler; } - *index_out = i; - out: - return irq; +#endif + + WARN_ON(!update_irq_entry(irq_entry)); + err = 0; +out_unlock: + raw_spin_unlock_irqrestore(&irq_lock, flags); +out: + kfree(to_free); + return err; +} + +/* + * Remove the entry or entries for a specific FD, if you + * don't want to remove all the possible entries then use + * um_free_irq() or deactivate_fd() instead. 
+ */ +void free_irq_by_fd(int fd) +{ + struct irq_entry *to_free; + unsigned long flags; + + raw_spin_lock_irqsave(&irq_lock, flags); + to_free = get_irq_entry_by_fd(fd); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } +EXPORT_SYMBOL(free_irq_by_fd); -void reactivate_fd(int fd, int irqnum) +static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_fd *irq; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - int i; - spin_lock_irqsave(&irq_lock, flags); - irq = find_irq_by_fd(fd, irqnum, &i); - if (irq == NULL) { - spin_unlock_irqrestore(&irq_lock, flags); - return; - } - os_set_pollfd(i, irq->fd); - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type i; - add_sigio_fd(fd); + for (i = 0; i < NUM_IRQ_TYPES; i++) { + struct irq_reg *reg = &entry->reg[i]; + + if (!reg->events) + continue; + if (reg->irq != irq) + continue; + if (reg->id != dev) + continue; + + os_del_epoll_fd(entry->fd); + reg->events = 0; + to_free = update_or_remove_irq_entry(entry); + goto out; + } + } +out: + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) { - struct irq_fd *irq; + struct irq_entry *entry; unsigned long flags; - int i; + enum um_irq_type i; - spin_lock_irqsave(&irq_lock, flags); - irq = find_irq_by_fd(fd, irqnum, &i); - if (irq == NULL) { - spin_unlock_irqrestore(&irq_lock, flags); - return; + os_del_epoll_fd(fd); + + raw_spin_lock_irqsave(&irq_lock, flags); + entry = get_irq_entry_by_fd(fd); + if (!entry) + goto out; + + for (i = 0; i < NUM_IRQ_TYPES; i++) { + if (!entry->reg[i].events) + continue; + if (entry->reg[i].irq == irqnum) + entry->reg[i].events = 0; } - os_set_pollfd(i, -1); - spin_unlock_irqrestore(&irq_lock, flags); + entry = update_or_remove_irq_entry(entry); +out: + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -268,17 +449,18 @@ EXPORT_SYMBOL(deactivate_fd); */ int deactivate_all_fds(void) { - struct irq_fd *irq; - int err; + struct irq_entry *entry; - for (irq = active_fds; irq != NULL; irq = irq->next) { - err = os_clear_fd_async(irq->fd); - if (err) - return err; - } - /* If there is a signal already queued, after unblocking ignore it */ + /* Stop IO. 
The IRQ loop has no lock so this is our + * only way of making sure we are safe to dispose + * of all IRQ handlers + */ os_set_ioignore(); + /* we can no longer call kfree() here so just deactivate */ + list_for_each_entry(entry, &active_fds, list) + os_del_epoll_fd(entry->fd); + os_close_epoll_fd(); return 0; } @@ -297,31 +479,180 @@ unsigned int do_IRQ(int irq, struct uml_pt_regs *regs) return 1; } -void um_free_irq(unsigned int irq, void *dev) +void um_free_irq(int irq, void *dev) { + if (WARN(irq < 0 || irq > UM_LAST_SIGNAL_IRQ, + "freeing invalid irq %d", irq)) + return; + free_irq_by_irq_and_dev(irq, dev); free_irq(irq, dev); + clear_bit(irq, irqs_allocated); } EXPORT_SYMBOL(um_free_irq); -int um_request_irq(unsigned int irq, int fd, int type, - irq_handler_t handler, - unsigned long irqflags, const char * devname, - void *dev_id) +static int +_um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) { int err; + if (irq == UM_IRQ_ALLOC) { + int i; + + for (i = UM_FIRST_DYN_IRQ; i < NR_IRQS; i++) { + if (!test_and_set_bit(i, irqs_allocated)) { + irq = i; + break; + } + } + } + + if (irq < 0) + return -ENOSPC; + if (fd != -1) { - err = activate_fd(irq, fd, type, dev_id); + err = activate_fd(irq, fd, type, dev_id, timetravel_handler); if (err) - return err; + goto error; } - return request_irq(irq, handler, irqflags, devname, dev_id); + err = request_irq(irq, handler, irqflags, devname, dev_id); + if (err < 0) + goto error; + + return irq; +error: + clear_bit(irq, irqs_allocated); + return err; } +int um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + return _um_request_irq(irq, fd, type, handler, irqflags, + devname, dev_id, NULL); +} EXPORT_SYMBOL(um_request_irq); -EXPORT_SYMBOL(reactivate_fd); + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +int um_request_irq_tt(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) +{ + return _um_request_irq(irq, fd, type, handler, irqflags, + devname, dev_id, timetravel_handler); +} +EXPORT_SYMBOL(um_request_irq_tt); + +void sigio_run_timetravel_handlers(void) +{ + _sigio_handler(NULL, true); +} +#endif + +#ifdef CONFIG_PM_SLEEP +void um_irqs_suspend(void) +{ + struct irq_entry *entry; + unsigned long flags; + + irqs_suspended = true; + + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + bool clear = true; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + /* + * For the SIGIO_WRITE_IRQ, which is used to handle the + * SIGIO workaround thread, we need special handling: + * enable wake for it itself, but below we tell it about + * any FDs that should be suspended. 
+ */ + if (entry->reg[t].wakeup || + entry->reg[t].irq == SIGIO_WRITE_IRQ +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + || entry->reg[t].timetravel_handler +#endif + ) { + clear = false; + break; + } + } + + if (clear) { + entry->suspended = true; + os_clear_fd_async(entry->fd); + entry->sigio_workaround = + !__ignore_sigio_fd(entry->fd); + } + } + raw_spin_unlock_irqrestore(&irq_lock, flags); +} + +void um_irqs_resume(void) +{ + struct irq_entry *entry; + unsigned long flags; + + + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + if (entry->suspended) { + int err = os_set_fd_async(entry->fd); + + WARN(err < 0, "os_set_fd_async returned %d\n", err); + entry->suspended = false; + + if (entry->sigio_workaround) { + err = __add_sigio_fd(entry->fd); + WARN(err < 0, "add_sigio_returned %d\n", err); + } + } + } + raw_spin_unlock_irqrestore(&irq_lock, flags); + + irqs_suspended = false; + send_sigio_to_self(); +} + +static int normal_irq_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_entry *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + if (entry->reg[t].irq != d->irq) + continue; + entry->reg[t].wakeup = on; + goto unlock; + } + } +unlock: + raw_spin_unlock_irqrestore(&irq_lock, flags); + return 0; +} +#else +#define normal_irq_set_wake NULL +#endif /* * irq_chip must define at least enable/disable and ack when @@ -331,139 +662,67 @@ static void dummy(struct irq_data *d) { } -/* This is used for everything else than the timer. */ +/* This is used for everything other than the timer. */ static struct irq_chip normal_irq_type = { .name = "SIGIO", .irq_disable = dummy, .irq_enable = dummy, .irq_ack = dummy, + .irq_mask = dummy, + .irq_unmask = dummy, + .irq_set_wake = normal_irq_set_wake, }; -static struct irq_chip SIGVTALRM_irq_type = { - .name = "SIGVTALRM", +static struct irq_chip alarm_irq_type = { + .name = "SIGALRM", .irq_disable = dummy, .irq_enable = dummy, .irq_ack = dummy, + .irq_mask = dummy, + .irq_unmask = dummy, }; void __init init_IRQ(void) { int i; - irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); + irq_set_chip_and_handler(TIMER_IRQ, &alarm_irq_type, handle_percpu_irq); - for (i = 1; i < NR_IRQS; i++) + for (i = 1; i < UM_LAST_SIGNAL_IRQ; i++) irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); + /* Initialize EPOLL Loop */ + os_setup_epoll(); } -/* - * IRQ stack entry and exit: - * - * Unlike i386, UML doesn't receive IRQs on the normal kernel stack - * and switch over to the IRQ stack after some preparation. We use - * sigaltstack to receive signals on a separate stack from the start. - * These two functions make sure the rest of the kernel won't be too - * upset by being on a different stack. The IRQ stack has a - * thread_info structure at the bottom so that current et al continue - * to work. - * - * to_irq_stack copies the current task's thread_info to the IRQ stack - * thread_info and sets the tasks's stack to point to the IRQ stack. - * - * from_irq_stack copies the thread_info struct back (flags may have - * been modified) and resets the task's stack pointer. - * - * Tricky bits - - * - * What happens when two signals race each other? 
UML doesn't block - * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal - * could arrive while a previous one is still setting up the - * thread_info. - * - * There are three cases - - * The first interrupt on the stack - sets up the thread_info and - * handles the interrupt - * A nested interrupt interrupting the copying of the thread_info - - * can't handle the interrupt, as the stack is in an unknown state - * A nested interrupt not interrupting the copying of the - * thread_info - doesn't do any setup, just handles the interrupt - * - * The first job is to figure out whether we interrupted stack setup. - * This is done by xchging the signal mask with thread_info->pending. - * If the value that comes back is zero, then there is no setup in - * progress, and the interrupt can be handled. If the value is - * non-zero, then there is stack setup in progress. In order to have - * the interrupt handled, we leave our signal in the mask, and it will - * be handled by the upper handler after it has set up the stack. - * - * Next is to figure out whether we are the outer handler or a nested - * one. As part of setting up the stack, thread_info->real_thread is - * set to non-NULL (and is reset to NULL on exit). This is the - * nesting indicator. If it is non-NULL, then the stack is already - * set up and the handler can run. - */ - -static unsigned long pending_mask; - -unsigned long to_irq_stack(unsigned long *mask_out) +int __init arch_probe_nr_irqs(void) { - struct thread_info *ti; - unsigned long mask, old; - int nested; - - mask = xchg(&pending_mask, *mask_out); - if (mask != 0) { - /* - * If any interrupts come in at this point, we want to - * make sure that their bits aren't lost by our - * putting our bit in. So, this loop accumulates bits - * until xchg returns the same value that we put in. - * When that happens, there were no new interrupts, - * and pending_mask contains a bit for each interrupt - * that came in. 
- */ - old = *mask_out; - do { - old |= mask; - mask = xchg(&pending_mask, old); - } while (mask != old); - return 1; - } - - ti = current_thread_info(); - nested = (ti->real_thread != NULL); - if (!nested) { - struct task_struct *task; - struct thread_info *tti; - - task = cpu_tasks[ti->cpu].task; - tti = task_thread_info(task); - - *ti = *tti; - ti->real_thread = tti; - task->stack = ti; - } - - mask = xchg(&pending_mask, 0); - *mask_out |= mask | nested; - return 0; + return NR_IRQS; } -unsigned long from_irq_stack(int nested) +void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc) { - struct thread_info *ti, *to; - unsigned long mask; + do_IRQ(SIGCHLD_IRQ, regs); +} - ti = current_thread_info(); +/* + * /proc/interrupts printing for arch specific interrupts + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ +#if IS_ENABLED(CONFIG_SMP) + int cpu; - pending_mask = 1; + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", irq_stats(cpu)->irq_resched_count); + seq_puts(p, " Rescheduling interrupts\n"); - to = ti->real_thread; - current->stack = to; - ti->real_thread = NULL; - *to = *ti; + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", irq_stats(cpu)->irq_call_count); + seq_puts(p, " Function call interrupts\n"); +#endif - mask = xchg(&pending_mask, 0); - return mask & ~1; + return 0; } - diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c new file mode 100644 index 000000000000..fc0f543d1d8e --- /dev/null +++ b/arch/um/kernel/kmsg_dump.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kmsg_dump.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/string.h> +#include <shared/init.h> +#include <shared/kern.h> +#include <os.h> + +static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + struct kmsg_dump_detail *detail) +{ + static struct kmsg_dump_iter iter; + static DEFINE_SPINLOCK(lock); + static char line[1024]; + struct console *con; + unsigned long flags; + size_t len = 0; + int cookie; + + /* + * If no consoles are available to output crash information, dump + * the kmsg buffer to stdout. + */ + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + /* + * The ttynull console and disabled consoles are ignored + * since they cannot output. All other consoles are + * expected to output the crash information. 
+ */ + if (strcmp(con->name, "ttynull") != 0 && + console_is_usable(con, console_srcu_read_flags(con), true)) { + break; + } + } + console_srcu_read_unlock(cookie); + if (con) + return; + + if (!spin_trylock_irqsave(&lock, flags)) + return; + + kmsg_dump_rewind(&iter); + + printf("kmsg_dump:\n"); + while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len)) { + line[len] = '\0'; + printf("%s", line); + } + + spin_unlock_irqrestore(&lock, flags); +} + +static struct kmsg_dumper kmsg_dumper = { + .dump = kmsg_dumper_stdout +}; + +static int __init kmsg_dumper_stdout_init(void) +{ + return kmsg_dump_register(&kmsg_dumper); +} + +__uml_postsetup(kmsg_dumper_stdout_init); diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 543c04756939..96314c31e61c 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -1,13 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> #include <os.h> -EXPORT_SYMBOL(set_signals); -EXPORT_SYMBOL(get_signals); +EXPORT_SYMBOL(um_get_signals); +EXPORT_SYMBOL(um_set_signals); EXPORT_SYMBOL(os_stat_fd); EXPORT_SYMBOL(os_stat_file); @@ -33,12 +33,16 @@ EXPORT_SYMBOL(os_shutdown_socket); EXPORT_SYMBOL(os_create_unix_socket); EXPORT_SYMBOL(os_connect_socket); EXPORT_SYMBOL(os_accept_connection); -EXPORT_SYMBOL(os_rcv_fd); +EXPORT_SYMBOL(os_rcv_fd_msg); EXPORT_SYMBOL(run_helper); EXPORT_SYMBOL(os_major); EXPORT_SYMBOL(os_minor); EXPORT_SYMBOL(os_makedev); +EXPORT_SYMBOL(os_eventfd); +EXPORT_SYMBOL(os_sendmsg_fds); EXPORT_SYMBOL(add_sigio_fd); EXPORT_SYMBOL(ignore_sigio_fd); EXPORT_SYMBOL(sigio_broken); + +EXPORT_SYMBOL(syscall); diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c new file mode 100644 index 000000000000..cb9d178ab7d8 --- /dev/null +++ b/arch/um/kernel/load_file.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ +#include <linux/memblock.h> +#include <os.h> + +#include "um_arch.h" + +static int __init __uml_load_file(const char *filename, void *buf, int size) +{ + int fd, n; + + fd = os_open_file(filename, of_read(OPENFLAGS()), 0); + if (fd < 0) { + printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename, + -fd); + return -1; + } + n = os_read_file(fd, buf, size); + if (n != size) { + printk(KERN_ERR "Read of %d bytes from '%s' failed, " + "err = %d\n", size, + filename, -n); + return -1; + } + + os_close_file(fd); + return 0; +} + +void *uml_load_file(const char *filename, unsigned long long *size) +{ + void *area; + int err; + + *size = 0; + + if (!filename) + return NULL; + + err = os_file_size(filename, size); + if (err) + return NULL; + + if (*size == 0) { + printk(KERN_ERR "\"%s\" is empty\n", filename); + return NULL; + } + + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); + + if (__uml_load_file(filename, area, *size)) { + memblock_free(area, *size); + return NULL; + } + + return area; +} diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 7ddb64baf327..39c4a7e21c6f 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -1,29 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/stddef.h> #include <linux/module.h> -#include <linux/bootmem.h> -#include <linux/highmem.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/swap.h> #include 
<linux/slab.h> -#include <asm/fixmap.h> +#include <linux/init.h> +#include <asm/sections.h> #include <asm/page.h> +#include <asm/pgalloc.h> #include <as-layout.h> #include <init.h> #include <kern.h> #include <kern_util.h> #include <mem_user.h> #include <os.h> +#include <um_malloc.h> +#include <linux/sched/task.h> +#include <linux/kasan.h> + +#ifdef CONFIG_KASAN +void __init kasan_init(void) +{ + /* + * kasan_map_memory will map all of the required address space and + * the host machine will allocate physical memory as necessary. + */ + kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE); + init_task.kasan_depth = 0; + /* + * Since kasan_init() is called before main(), + * KASAN is initialized but the enablement is deferred after + * jump_label_init(). See arch_mm_preinit(). + */ +} + +static void (*kasan_init_ptr)(void) +__section(".kasan_init") __used += kasan_init; +#endif /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ unsigned long *empty_zero_page = NULL; EXPORT_SYMBOL(empty_zero_page); -/* allocated in paging_init and unchanged thereafter */ -static unsigned long *empty_bad_page = NULL; /* * Initialized during boot, and readonly for initializing page tables @@ -32,202 +55,47 @@ static unsigned long *empty_bad_page = NULL; pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* Initialized at boot time, and readonly after that */ -unsigned long long highmem; int kmalloc_ok = 0; /* Used during early boot */ static unsigned long brk_end; -#ifdef CONFIG_HIGHMEM -static void setup_highmem(unsigned long highmem_start, - unsigned long highmem_len) +void __init arch_mm_preinit(void) { - unsigned long highmem_pfn; - int i; + /* Safe to call after jump_label_init(). Enables KASAN. */ + kasan_init_generic(); - highmem_pfn = __pa(highmem_start) >> PAGE_SHIFT; - for (i = 0; i < highmem_len >> PAGE_SHIFT; i++) - free_highmem_page(&mem_map[highmem_pfn + i]); -} -#endif - -void __init mem_init(void) -{ /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); /* Map in the area just after the brk now that kmalloc is about * to be turned on. */ - brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); + brk_end = PAGE_ALIGN((unsigned long) sbrk(0)); map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); - free_bootmem(__pa(brk_end), uml_reserved - brk_end); + memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; - - /* this will put all low memory onto the freelists */ - free_all_bootmem(); - max_low_pfn = totalram_pages; -#ifdef CONFIG_HIGHMEM - setup_highmem(end_iomem, highmem); -#endif - max_pfn = totalram_pages; - mem_init_print_info(NULL); - kmalloc_ok = 1; -} - -/* - * Create a page table and place a pointer to it in a middle page - * directory entry. 
- */ -static void __init one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(_KERNPG_TABLE + - (unsigned long) __pa(pte))); - if (pte != pte_offset_kernel(pmd, 0)) - BUG(); - } -} - -static void __init one_md_table_init(pud_t *pud) -{ -#ifdef CONFIG_3_LEVEL_PGTABLES - pmd_t *pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pud(pud, __pud(_KERNPG_TABLE + (unsigned long) __pa(pmd_table))); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#endif -} - -static void __init fixrange_init(unsigned long start, unsigned long end, - pgd_t *pgd_base) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - int i, j; - unsigned long vaddr; - - vaddr = start; - i = pgd_index(vaddr); - j = pmd_index(vaddr); - pgd = pgd_base + i; - - for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) - one_md_table_init(pud); - pmd = pmd_offset(pud, vaddr); - for (; (j < PTRS_PER_PMD) && (vaddr < end); pmd++, j++) { - one_page_table_init(pmd); - vaddr += PMD_SIZE; - } - j = 0; - } -} - -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -pgprot_t kmap_prot; - -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\ - (vaddr)), (vaddr)) - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -static void __init init_highmem(void) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - /* - * Permanent kmaps: - */ - vaddr = PKMAP_BASE; - fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; - - kmap_init(); + min_low_pfn = PFN_UP(__pa(uml_reserved)); + max_pfn = max_low_pfn; } -#endif /* CONFIG_HIGHMEM */ -static void __init fixaddr_user_init( void) +void __init mem_init(void) { -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA - long size = FIXADDR_USER_END - FIXADDR_USER_START; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - phys_t p; - unsigned long v, vaddr = FIXADDR_USER_START; - - if (!size) - return; - - fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir); - v = (unsigned long) alloc_bootmem_low_pages(size); - memcpy((void *) v , (void *) FIXADDR_USER_START, size); - p = __pa(v); - for ( ; size > 0; size -= PAGE_SIZE, vaddr += PAGE_SIZE, - p += PAGE_SIZE) { - pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pte_set_val(*pte, p, PAGE_READONLY); - } -#endif + kmalloc_ok = 1; } void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES], vaddr; - int i; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(zones_size); i++) - zones_size[i] = 0; + empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, + PAGE_SIZE); + if (!empty_zero_page) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); - zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) - - (uml_physmem >> PAGE_SHIFT); -#ifdef 
CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = highmem >> PAGE_SHIFT; -#endif - free_area_init(zones_size); - - /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); - - fixaddr_user_init(); - -#ifdef CONFIG_HIGHMEM - init_highmem(); -#endif + max_zone_pfn[ZONE_NORMAL] = high_physmem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } /* @@ -239,64 +107,49 @@ void free_initmem(void) { } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* Allocate and free page tables. */ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *pgd = __pgd_alloc(mm, 0); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (pgd) memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - free_page((unsigned long) pgd); -} - -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte; - - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - return pte; -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - - pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (pte) - pgtable_page_ctor(pte); - return pte; -} - -#ifdef CONFIG_3_LEVEL_PGTABLES -pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) -{ - pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL); - if (pmd) - memset(pmd, 0, PAGE_SIZE); - - return pmd; + return pgd; } -#endif void *uml_kmalloc(int size, int flags) { return kmalloc(size, flags); } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED +}; +DECLARE_VM_GET_PAGE_PROT + +void mark_rodata_ro(void) +{ + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + + os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0); +} diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index f116db15d402..ae6ca373c261 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -1,16 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <asm/page.h> +#include <asm/sections.h> #include <as-layout.h> #include <init.h> #include <kern.h> +#include <kern_util.h> #include <mem_user.h> #include <os.h> @@ -20,43 +22,6 @@ static int physmem_fd = -1; unsigned long high_physmem; 
EXPORT_SYMBOL(high_physmem); -extern unsigned long long physmem_size; - -int __init init_maps(unsigned long physmem, unsigned long iomem, - unsigned long highmem) -{ - struct page *p, *map; - unsigned long phys_len, phys_pages, highmem_len, highmem_pages; - unsigned long iomem_len, iomem_pages, total_len, total_pages; - int i; - - phys_pages = physmem >> PAGE_SHIFT; - phys_len = phys_pages * sizeof(struct page); - - iomem_pages = iomem >> PAGE_SHIFT; - iomem_len = iomem_pages * sizeof(struct page); - - highmem_pages = highmem >> PAGE_SHIFT; - highmem_len = highmem_pages * sizeof(struct page); - - total_pages = phys_pages + iomem_pages + highmem_pages; - total_len = phys_len + iomem_len + highmem_len; - - map = alloc_bootmem_low_pages(total_len); - if (map == NULL) - return -ENOMEM; - - for (i = 0; i < total_pages; i++) { - p = &map[i]; - memset(p, 0, sizeof(struct page)); - SetPageReserved(p); - INIT_LIST_HEAD(&p->lru); - } - - max_mapnr = total_pages; - return 0; -} - void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x) { @@ -75,25 +40,46 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, } } -extern int __syscall_stub_start; - +/** + * setup_physmem() - Setup physical memory for UML + * @start: Start address of the physical kernel memory, + * i.e start address of the executable image. + * @reserve_end: end address of the physical kernel memory. + * @len: Length of total physical memory that should be mapped/made + * available, in bytes. + * + * Creates an unlinked temporary file of size (len) and memory maps + * it on the last executable image address (uml_reserved). + * + * The offset is needed as the length of the total physical memory + * (len) includes the size of the memory used be the executable image, + * but the mapped-to address is the last address of the executable image + * (uml_reserved == end address of executable image). + * + * The memory mapped memory of the temporary file is used as backing memory + * of all user space processes/kernel tasks. + */ void __init setup_physmem(unsigned long start, unsigned long reserve_end, - unsigned long len, unsigned long long highmem) + unsigned long len) { unsigned long reserve = reserve_end - start; - int pfn = PFN_UP(__pa(reserve_end)); - int delta = (len - reserve) >> PAGE_SHIFT; - int err, offset, bootmap_size; + unsigned long map_size = len - reserve; + int err; + + if (len <= reserve) { + os_warn("Too few physical memory! Needed=%lu, given=%lu\n", + reserve, len); + exit(1); + } - physmem_fd = create_mem_file(len + highmem); + physmem_fd = create_mem_file(len); - offset = uml_reserved - uml_physmem; - err = os_map_memory((void *) uml_reserved, physmem_fd, offset, - len - offset, 1, 1, 1); + err = os_map_memory((void *) reserve_end, physmem_fd, reserve, + map_size, 1, 1, 1); if (err < 0) { - printf("setup_physmem - mapping %ld bytes of memory at 0x%p " - "failed - errno = %d\n", len - offset, - (void *) uml_reserved, err); + os_warn("setup_physmem - mapping %lu bytes of memory at 0x%p " + "failed - errno = %d\n", map_size, + (void *) reserve_end, err); exit(1); } @@ -101,12 +87,14 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, * Special kludge - This page will be mapped in to userspace processes * from physmem_fd, so it needs to be written out there. 
*/ - os_seek_file(physmem_fd, __pa(&__syscall_stub_start)); - os_write_file(physmem_fd, &__syscall_stub_start, PAGE_SIZE); + os_seek_file(physmem_fd, __pa(__syscall_stub_start)); + os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); + + memblock_add(__pa(start), len); + memblock_reserve(__pa(start), reserve); - bootmap_size = init_bootmem(pfn, pfn + delta); - free_bootmem(__pa(reserve_end) + bootmap_size, - len - bootmap_size - reserve); + min_low_pfn = PFN_UP(__pa(reserve_end)); + max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT); } int phys_mapping(unsigned long phys, unsigned long long *offset_out) @@ -117,30 +105,16 @@ int phys_mapping(unsigned long phys, unsigned long long *offset_out) fd = physmem_fd; *offset_out = phys; } - else if (phys < __pa(end_iomem)) { - struct iomem_region *region = iomem_regions; - - while (region != NULL) { - if ((phys >= region->phys) && - (phys < region->phys + region->size)) { - fd = region->fd; - *offset_out = phys - region->phys; - break; - } - region = region->next; - } - } - else if (phys < __pa(end_iomem) + highmem) { - fd = physmem_fd; - *offset_out = phys - iomem_size; - } return fd; } +EXPORT_SYMBOL(phys_mapping); static int __init uml_mem_setup(char *line, int *add) { char *retptr; + + *add = 0; physmem_size = memparse(line,&retptr); return 0; } @@ -153,63 +127,3 @@ __uml_setup("mem=", uml_mem_setup, " be more, and the excess, if it's ever used, will just be swapped out.\n" " Example: mem=64M\n\n" ); - -extern int __init parse_iomem(char *str, int *add); - -__uml_setup("iomem=", parse_iomem, -"iomem=<name>,<file>\n" -" Configure <file> as an IO memory region named <name>.\n\n" -); - -/* - * This list is constructed in parse_iomem and addresses filled in in - * setup_iomem, both of which run during early boot. Afterwards, it's - * unchanged. - */ -struct iomem_region *iomem_regions; - -/* Initialized in parse_iomem and unchanged thereafter */ -int iomem_size; - -unsigned long find_iomem(char *driver, unsigned long *len_out) -{ - struct iomem_region *region = iomem_regions; - - while (region != NULL) { - if (!strcmp(region->driver, driver)) { - *len_out = region->size; - return region->virt; - } - - region = region->next; - } - - return 0; -} -EXPORT_SYMBOL(find_iomem); - -static int setup_iomem(void) -{ - struct iomem_region *region = iomem_regions; - unsigned long iomem_start = high_physmem + PAGE_SIZE; - int err; - - while (region != NULL) { - err = os_map_memory((void *) iomem_start, region->fd, 0, - region->size, 1, 1, 0); - if (err) - printk(KERN_ERR "Mapping iomem region for driver '%s' " - "failed, errno = %d\n", region->driver, -err); - else { - region->virt = iomem_start; - region->phys = __pa(region->virt); - } - - iomem_start += region->size + PAGE_SIZE; - region = region->next; - } - - return 0; -} - -__initcall(setup_iomem); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index bbcef522bcb1..63b38a3f73f7 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -1,7 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Copyright 2003 PathScale, Inc. 
- * Licensed under the GPL */ #include <linux/stddef.h> @@ -13,44 +15,38 @@ #include <linux/proc_fs.h> #include <linux/ptrace.h> #include <linux/random.h> +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/seq_file.h> #include <linux/tick.h> #include <linux/threads.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <asm/current.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> -#include <asm/uaccess.h> +#include <asm/switch_to.h> +#include <asm/exec.h> +#include <linux/uaccess.h> #include <as-layout.h> #include <kern_util.h> #include <os.h> #include <skas.h> +#include <registers.h> +#include <linux/time-internal.h> +#include <linux/elfcore.h> /* * This is a per-cpu array. A processor only modifies its entry and it only * cares about its entry, so it's OK if another processor is modifying its * entry. */ -struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } }; - -static inline int external_pid(void) -{ - /* FIXME: Need to look up userspace_pid by cpu */ - return userspace_pid[0]; -} - -int pid_to_processor_id(int pid) -{ - int i; - - for (i = 0; i < ncpus; i++) { - if (cpu_tasks[i].pid == pid) - return i; - } - return -1; -} +struct task_struct *cpu_tasks[NR_CPUS] = { + [0 ... NR_CPUS - 1] = &init_task, +}; +EXPORT_SYMBOL(cpu_tasks); void free_stack(unsigned long stack, int order) { @@ -71,46 +67,35 @@ unsigned long alloc_stack(int order, int atomic) static inline void set_current(struct task_struct *task) { - cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task) - { external_pid(), task }); + cpu_tasks[task_thread_info(task)->cpu] = task; } -extern void arch_switch_to(struct task_struct *to); - -void *__switch_to(struct task_struct *from, struct task_struct *to) +struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to) { to->thread.prev_sched = from; set_current(to); - do { - current->thread.saved_task = NULL; - - switch_threads(&from->thread.switch_buf, - &to->thread.switch_buf); - - arch_switch_to(current); - - if (current->thread.saved_task) - show_regs(&(current->thread.regs)); - to = current->thread.saved_task; - from = current; - } while (current->thread.saved_task); + switch_threads(&from->thread.switch_buf, &to->thread.switch_buf); + arch_switch_to(current); return current->thread.prev_sched; } void interrupt_end(void) { - if (need_resched()) - schedule(); - if (test_thread_flag(TIF_SIGPENDING)) - do_signal(); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) - tracehook_notify_resume(¤t->thread.regs); -} - -void exit_thread(void) -{ + struct pt_regs *regs = ¤t->thread.regs; + unsigned long thread_flags; + + thread_flags = read_thread_flags(); + while (thread_flags & _TIF_WORK_MASK) { + if (thread_flags & _TIF_NEED_RESCHED) + schedule(); + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + do_signal(regs); + if (thread_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); + thread_flags = read_thread_flags(); + } } int get_current_pid(void) @@ -124,28 +109,26 @@ int get_current_pid(void) */ void new_thread_handler(void) { - int (*fn)(void *), n; + int (*fn)(void *); void *arg; if (current->thread.prev_sched != NULL) schedule_tail(current->thread.prev_sched); current->thread.prev_sched = NULL; - fn = current->thread.request.u.thread.proc; - arg = current->thread.request.u.thread.arg; + fn = current->thread.request.thread.proc; + arg = 
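switch_threads() stashes the outgoing task's host context in its switch_buf and resumes the incoming one on its own kernel stack; the ucontext sketch below shows the same save-and-resume idea with a private stack, though the real switch buffers are jmp_buf based rather than swapcontext() based:

#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, thread_ctx;
static char thread_stack[64 * 1024];

static void thread_proc(void)
{
	printf("in thread, switching back\n");
	swapcontext(&thread_ctx, &main_ctx);	/* like switch_threads(&me, &prev) */
	printf("thread resumed\n");
}

int main(void)
{
	getcontext(&thread_ctx);
	thread_ctx.uc_stack.ss_sp = thread_stack;
	thread_ctx.uc_stack.ss_size = sizeof(thread_stack);
	thread_ctx.uc_link = &main_ctx;		/* where to go when thread_proc returns */
	makecontext(&thread_ctx, thread_proc, 0);

	swapcontext(&main_ctx, &thread_ctx);	/* first "context switch" */
	printf("back in main\n");
	swapcontext(&main_ctx, &thread_ctx);	/* resume the thread where it slept */
	printf("thread finished\n");
	return 0;
}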
current->thread.request.thread.arg; /* * callback returns only if the kernel thread execs a process */ - n = fn(arg); + fn(arg); userspace(¤t->thread.regs.regs); } /* Called magically, see new_thread_handler above */ -void fork_handler(void) +static void fork_handler(void) { - force_flush_all(); - schedule_tail(current->thread.prev_sched); /* @@ -160,16 +143,17 @@ void fork_handler(void) userspace(¤t->thread.regs.regs); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct * p) +int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) { + u64 clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; void (*handler)(void); - int kthread = current->flags & PF_KTHREAD; int ret = 0; p->thread = (struct thread_struct) INIT_THREAD; - if (!kthread) { + if (!args->fn) { memcpy(&p->thread.regs.regs, current_pt_regs(), sizeof(p->thread.regs.regs)); PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0); @@ -181,21 +165,21 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, arch_copy_thread(¤t->thread.arch, &p->thread.arch); } else { get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); - p->thread.request.u.thread.proc = (int (*)(void *))sp; - p->thread.request.u.thread.arg = (void *)arg; + p->thread.request.thread.proc = args->fn; + p->thread.request.thread.arg = args->fn_arg; handler = new_thread_handler; } new_thread(task_stack_page(p), &p->thread.switch_buf, handler); - if (!kthread) { + if (!args->fn) { clear_flushed_tls(p); /* * Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) - ret = arch_copy_tls(p); + ret = arch_set_tls(p, tls); } return ret; @@ -203,34 +187,50 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, void initial_thread_cb(void (*proc)(void *), void *arg) { - int save_kmalloc_ok = kmalloc_ok; - - kmalloc_ok = 0; initial_thread_cb_skas(proc, arg); - kmalloc_ok = save_kmalloc_ok; +} + +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + /* init_task is not dynamically sized (missing FPU state) */ + if (unlikely(src == &init_task)) { + memcpy(dst, src, sizeof(init_task)); + memset((void *)dst + sizeof(init_task), 0, + arch_task_struct_size - sizeof(init_task)); + } else { + memcpy(dst, src, arch_task_struct_size); + } + + return 0; +} + +void um_idle_sleep(void) +{ + if (time_travel_mode != TT_MODE_OFF) + time_travel_sleep(); + else + os_idle_sleep(); } void arch_cpu_idle(void) { - unsigned long long nsecs; + um_idle_sleep(); +} - cpu_tasks[current_thread_info()->cpu].pid = os_getpid(); - nsecs = disable_timer(); - idle_sleep(nsecs); - local_irq_enable(); +void arch_cpu_idle_prepare(void) +{ + os_idle_prepare(); } -int __cant_sleep(void) { +int __uml_cant_sleep(void) { return in_atomic() || irqs_disabled() || in_interrupt(); /* Is in_interrupt() really needed? 
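The !args->fn test separates user forks from kernel threads: anything started through the generic kthread machinery arrives here with args->fn set, gets recorded in thread.request.thread.proc/arg, and is finally invoked from new_thread_handler(). A hypothetical user of that path, written only against generic interfaces:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

/* somewhere in a hypothetical driver's init path */
static int start_worker(void)
{
	struct task_struct *tsk = kthread_run(my_worker, NULL, "my-worker");

	return PTR_ERR_OR_ZERO(tsk);
}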
*/ } -int user_context(unsigned long sp) +int uml_need_resched(void) { - unsigned long stack; - - stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); - return stack != (unsigned long) current_thread_info(); + return need_resched(); } extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; @@ -250,127 +250,20 @@ char *uml_strdup(const char *string) } EXPORT_SYMBOL(uml_strdup); -int copy_to_user_proc(void __user *to, void *from, int size) -{ - return copy_to_user(to, from, size); -} - int copy_from_user_proc(void *to, void __user *from, int size) { return copy_from_user(to, from, size); } -int clear_user_proc(void __user *buf, int size) -{ - return clear_user(buf, size); -} - -int strlen_user_proc(char __user *str) -{ - return strlen_user(str); -} - -int smp_sigio_handler(void) -{ -#ifdef CONFIG_SMP - int cpu = current_thread_info()->cpu; - IPI_handler(cpu); - if (cpu != 0) - return 1; -#endif - return 0; -} - -int cpu(void) +int singlestepping(void) { - return current_thread_info()->cpu; -} - -static atomic_t using_sysemu = ATOMIC_INIT(0); -int sysemu_supported; - -void set_using_sysemu(int value) -{ - if (value > sysemu_supported) - return; - atomic_set(&using_sysemu, value); -} - -int get_using_sysemu(void) -{ - return atomic_read(&using_sysemu); -} - -static int sysemu_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%d\n", get_using_sysemu()); - return 0; -} - -static int sysemu_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, sysemu_proc_show, NULL); -} - -static ssize_t sysemu_proc_write(struct file *file, const char __user *buf, - size_t count, loff_t *pos) -{ - char tmp[2]; - - if (copy_from_user(tmp, buf, 1)) - return -EFAULT; - - if (tmp[0] >= '0' && tmp[0] <= '2') - set_using_sysemu(tmp[0] - '0'); - /* We use the first char, but pretend to write everything */ - return count; -} - -static const struct file_operations sysemu_proc_fops = { - .owner = THIS_MODULE, - .open = sysemu_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = sysemu_proc_write, -}; - -int __init make_proc_sysemu(void) -{ - struct proc_dir_entry *ent; - if (!sysemu_supported) - return 0; - - ent = proc_create("sysemu", 0600, NULL, &sysemu_proc_fops); - - if (ent == NULL) - { - printk(KERN_WARNING "Failed to register /proc/sysemu\n"); - return 0; - } - - return 0; -} - -late_initcall(make_proc_sysemu); - -int singlestepping(void * t) -{ - struct task_struct *task = t ? t : current; - - if (!(task->ptrace & PT_DTRACE)) - return 0; - - if (task->thread.singlestep_syscall) - return 1; - - return 2; + return test_thread_flag(TIF_SINGLESTEP); } /* * Only x86 and x86_64 have an arch_align_stack(). * All other arches have "#define arch_align_stack(x) (x)" - * in their asm/system.h + * in their asm/exec.h * As this is included in UML from asm-um/system-generic.h, * we can use it to behave as the subarch does. 
*/ @@ -378,19 +271,16 @@ int singlestepping(void * t) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; + sp -= get_random_u32_below(8192); return sp & ~0xf; } #endif -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long stack_page, sp, ip; bool seen_sched = 0; - if ((p == NULL) || (p == current) || (p->state == TASK_RUNNING)) - return 0; - stack_page = (unsigned long) task_stack_page(p); /* Bail if the process has no kernel stack for some reason */ if (stack_page == 0) @@ -417,11 +307,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } - -int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu) -{ - int cpu = current_thread_info()->cpu; - - return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu); -} - diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 694d551c8899..fdbb37b5c399 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -1,21 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/audit.h> #include <linux/ptrace.h> #include <linux/sched.h> -#include <linux/tracehook.h> -#include <asm/uaccess.h> -#include <skas_ptrace.h> - +#include <linux/uaccess.h> +#include <asm/ptrace-abi.h> +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> void user_enable_single_step(struct task_struct *child) { - child->ptrace |= PT_DTRACE; - child->thread.singlestep_syscall = 0; + set_tsk_thread_flag(child, TIF_SINGLESTEP); #ifdef SUBARCH_SET_SINGLESTEPPING SUBARCH_SET_SINGLESTEPPING(child, 1); @@ -24,8 +23,7 @@ void user_enable_single_step(struct task_struct *child) void user_disable_single_step(struct task_struct *child) { - child->ptrace &= ~PT_DTRACE; - child->thread.singlestep_syscall = 0; + clear_tsk_thread_flag(child, TIF_SINGLESTEP); #ifdef SUBARCH_SET_SINGLESTEPPING SUBARCH_SET_SINGLESTEPPING(child, 0); @@ -40,9 +38,6 @@ void ptrace_disable(struct task_struct *child) user_disable_single_step(child); } -extern int peek_user(struct task_struct * child, long addr, long data); -extern int poke_user(struct task_struct * child, long addr, long data); - long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { @@ -68,7 +63,7 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef PTRACE_GETREGS case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, p, MAX_REG_OFFSET)) { + if (!access_ok(p, MAX_REG_OFFSET)) { ret = -EIO; break; } @@ -83,7 +78,7 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef PTRACE_SETREGS case PTRACE_SETREGS: { /* Set all gp regs in the child. */ unsigned long tmp = 0; - if (!access_ok(VERIFY_READ, p, MAX_REG_OFFSET)) { + if (!access_ok(p, MAX_REG_OFFSET)) { ret = -EIO; break; } @@ -104,35 +99,6 @@ long arch_ptrace(struct task_struct *child, long request, ret = ptrace_set_thread_area(child, addr, vp); break; - case PTRACE_FAULTINFO: { - /* - * Take the info from thread->arch->faultinfo, - * but transfer max. sizeof(struct ptrace_faultinfo). - * On i386, ptrace_faultinfo is smaller! - */ - ret = copy_to_user(p, &child->thread.arch.faultinfo, - sizeof(struct ptrace_faultinfo)) ? 
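get_random_u32_below(8192) moves the initial user stack pointer down by up to 8 KiB before re-imposing the 16-byte ABI alignment; the arithmetic in isolation, with rand() standing in for the kernel's RNG:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long sp = 0x7ffffffff000UL;	/* pretend initial stack top */
	unsigned long off = rand() % 8192;	/* stand-in for get_random_u32_below(8192) */

	sp -= off;
	sp &= ~0xfUL;		/* keep 16-byte alignment for the ABI */
	printf("randomized sp: %#lx (offset %lu)\n", sp, off);
	return 0;
}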
- -EIO : 0; - break; - } - -#ifdef PTRACE_LDT - case PTRACE_LDT: { - struct ptrace_ldt ldt; - - if (copy_from_user(&ldt, p, sizeof(ldt))) { - ret = -EIO; - break; - } - - /* - * This one is confusing, so just punt and return -EIO for - * now - */ - ret = -EIO; - break; - } -#endif default: ret = ptrace_request(child, request, addr, data); if (ret == -EIO) @@ -143,39 +109,33 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } -static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs, - int error_code) +static void send_sigtrap(struct uml_pt_regs *regs, int error_code) { - struct siginfo info; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; - - /* User-mode eip? */ - info.si_addr = UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL; - /* Send us the fake SIGTRAP */ - force_sig_info(SIGTRAP, &info, tsk); + force_sig_fault(SIGTRAP, TRAP_BRKPT, + /* User-mode eip? */ + UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL); } /* - * XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and + * XXX Check TIF_SINGLESTEP for singlestepping check and * PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check */ -void syscall_trace_enter(struct pt_regs *regs) +int syscall_trace_enter(struct pt_regs *regs) { - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(®s->regs), + audit_syscall_entry(UPT_SYSCALL_NR(®s->regs), UPT_SYSCALL_ARG1(®s->regs), UPT_SYSCALL_ARG2(®s->regs), UPT_SYSCALL_ARG3(®s->regs), UPT_SYSCALL_ARG4(®s->regs)); + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, UPT_SYSCALL_NR(®s->regs)); + if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; + return 0; - tracehook_report_syscall_entry(regs); + return ptrace_report_syscall_entry(regs); } void syscall_trace_leave(struct pt_regs *regs) @@ -185,13 +145,16 @@ void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(regs); /* Fake a debug trap */ - if (ptraced & PT_DTRACE) - send_sigtrap(current, ®s->regs, 0); + if (test_thread_flag(TIF_SINGLESTEP)) + send_sigtrap(®s->regs, 0); + + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_exit(regs, PT_REGS_SYSCALL_RET(regs)); if (!test_thread_flag(TIF_SYSCALL_TRACE)) return; - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); /* force do_signal() --> is_syscall() */ if (ptraced & PT_PTRACED) set_thread_flag(TIF_SIGPENDING); diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index ced8903921ae..680bce4bd8fa 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -1,42 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/oom.h> +#include <linux/reboot.h> #include <kern_util.h> #include <os.h> #include <skas.h> void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static void kill_off_processes(void) { - if (proc_mm) - /* - * FIXME: need to loop over userspace_pids - */ - os_kill_ptraced_process(userspace_pid[0], 1); - else { - struct task_struct *p; - int pid; + struct task_struct *p; + int pid; - read_lock(&tasklist_lock); - for_each_process(p) { - struct task_struct *t; + read_lock(&tasklist_lock); + for_each_process(p) { + struct task_struct *t; - t = find_lock_task_mm(p); - if (!t) - continue; - pid = 
t->mm->context.id.u.pid; - task_unlock(t); - os_kill_ptraced_process(pid, 1); - } - read_unlock(&tasklist_lock); + t = find_lock_task_mm(p); + if (!t) + continue; + pid = t->mm->context.id.pid; + task_unlock(t); + os_kill_ptraced_process(pid, 1); } + read_unlock(&tasklist_lock); } void uml_cleanup(void) @@ -62,3 +59,18 @@ void machine_halt(void) { machine_power_off(); } + +static int sys_power_off_handler(struct sys_off_data *data) +{ + machine_power_off(); + return 0; +} + +static int register_power_off(void) +{ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + sys_power_off_handler, NULL); + return 0; +} +__initcall(register_power_off); diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index b5e0cbb34382..4fc04742048a 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <linux/interrupt.h> @@ -8,42 +8,15 @@ #include <os.h> #include <sigio.h> -/* Protected by sigio_lock() called from write_sigio_workaround */ -static int sigio_irq_fd = -1; - -static irqreturn_t sigio_interrupt(int irq, void *data) -{ - char c; - - os_read_file(sigio_irq_fd, &c, sizeof(c)); - reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ); - return IRQ_HANDLED; -} - -int write_sigio_irq(int fd) -{ - int err; - - err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, - 0, "write sigio", NULL); - if (err) { - printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " - "err = %d\n", err); - return -1; - } - sigio_irq_fd = fd; - return 0; -} - /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ -static DEFINE_SPINLOCK(sigio_spinlock); +static DEFINE_MUTEX(sigio_mutex); void sigio_lock(void) { - spin_lock(&sigio_spinlock); + mutex_lock(&sigio_mutex); } void sigio_unlock(void) { - spin_unlock(&sigio_spinlock); + mutex_unlock(&sigio_mutex); } diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 3e831b3fd07b..a56b44522766 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -1,32 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched.h> +#include <linux/ftrace.h> #include <asm/siginfo.h> #include <asm/signal.h> #include <asm/unistd.h> #include <frame_kern.h> #include <kern_util.h> +#include <os.h> EXPORT_SYMBOL(block_signals); EXPORT_SYMBOL(unblock_signals); +void block_signals_trace(void) +{ + block_signals(); + if (current_thread_info()) + trace_hardirqs_off(); +} + +void unblock_signals_trace(void) +{ + if (current_thread_info()) + trace_hardirqs_on(); + unblock_signals(); +} + +void um_trace_signals_on(void) +{ + if (current_thread_info()) + trace_hardirqs_on(); +} + +void um_trace_signals_off(void) +{ + if (current_thread_info()) + trace_hardirqs_off(); +} + /* * OK, we're invoking a handler */ -static void handle_signal(struct pt_regs *regs, unsigned long signr, - struct k_sigaction *ka, siginfo_t *info) +static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); int singlestep = 0; unsigned long sp; int err; - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + if (test_thread_flag(TIF_SINGLESTEP) && (current->ptrace & PT_PTRACED)) singlestep = 1; /* Did we come from a system call? 
*/ @@ -39,11 +66,11 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, break; case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { PT_REGS_SYSCALL_RET(regs) = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: PT_REGS_RESTART_SYSCALL(regs); PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs); @@ -52,32 +79,28 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, } sp = PT_REGS_SP(regs); - if ((ka->sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0)) + if ((ksig->ka.sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0)) sp = current->sas_ss_sp + current->sas_ss_size; #ifdef CONFIG_ARCH_HAS_SC_SIGNALS - if (!(ka->sa.sa_flags & SA_SIGINFO)) - err = setup_signal_stack_sc(sp, signr, ka, regs, oldset); + if (!(ksig->ka.sa.sa_flags & SA_SIGINFO)) + err = setup_signal_stack_sc(sp, ksig, regs, oldset); else #endif - err = setup_signal_stack_si(sp, signr, ka, regs, info, oldset); + err = setup_signal_stack_si(sp, ksig, regs, oldset); - if (err) - force_sigsegv(signr, current); - else - signal_delivered(signr, info, ka, regs, singlestep); + signal_setup_done(err, ksig, singlestep); } -static int kern_do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { - struct k_sigaction ka_copy; - siginfo_t info; - int sig, handled_sig = 0; + struct ksignal ksig; + int handled_sig = 0; - while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) { + while (get_signal(&ksig)) { handled_sig = 1; /* Whee! Actually deliver the signal. */ - handle_signal(regs, sig, &ka_copy, &info); + handle_signal(&ksig, regs); } /* Did we come from a system call? */ @@ -98,27 +121,9 @@ static int kern_do_signal(struct pt_regs *regs) } /* - * This closes a way to execute a system call on the host. If - * you set a breakpoint on a system call instruction and singlestep - * from it, the tracing thread used to PTRACE_SINGLESTEP the process - * rather than PTRACE_SYSCALL it, allowing the system call to execute - * on the host. The tracing thread will check this flag and - * PTRACE_SYSCALL if necessary. 
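The sas_ss_sp/sas_ss_size fields consulted here are exactly what a process configures with sigaltstack(); a runnable host-side illustration of SA_ONSTACK delivering a handler on that alternate stack:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void *handler_frame;

static void handler(int sig)
{
	char probe;

	(void)sig;
	handler_frame = &probe;		/* roughly where the handler's frame lives */
}

int main(void)
{
	stack_t ss = { .ss_size = SIGSTKSZ };
	struct sigaction sa = { .sa_handler = handler, .sa_flags = SA_ONSTACK };

	ss.ss_sp = malloc(ss.ss_size);
	sigaltstack(&ss, NULL);		/* fills in sas_ss_sp/sas_ss_size on the kernel side */
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	raise(SIGUSR1);
	printf("alt stack %p..%p, handler frame at %p\n",
	       ss.ss_sp, (void *)((char *)ss.ss_sp + ss.ss_size), handler_frame);
	free(ss.ss_sp);
	return 0;
}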
- */ - if (current->ptrace & PT_DTRACE) - current->thread.singlestep_syscall = - is_syscall(PT_REGS_IP(¤t->thread.regs)); - - /* * if there's no signal to deliver, we just put the saved sigmask * back */ if (!handled_sig) restore_saved_sigmask(); - return handled_sig; -} - -int do_signal(void) -{ - return kern_do_signal(¤t->thread.regs); } diff --git a/arch/um/kernel/skas/.gitignore b/arch/um/kernel/skas/.gitignore new file mode 100644 index 000000000000..c3409ced0f38 --- /dev/null +++ b/arch/um/kernel/skas/.gitignore @@ -0,0 +1,2 @@ +stub_exe +stub_exe.dbg diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index 0b76d8869c94..3384be42691f 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -1,15 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) -# Licensed under the GPL # -obj-y := clone.o mmu.o process.o syscall.o uaccess.o +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ + stub_exe_embed.o -# clone.o is in the stub, so it can't be built with profiling +# Stub executable + +stub_exe_objs-y := stub_exe.o + +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) + +# Object file containing the ELF executable +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe + +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE + $(call if_changed,stub_exe) + +$(obj)/stub_exe: OBJCOPYFLAGS := -S +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE + $(call if_changed,objcopy) + +quiet_cmd_stub_exe = STUB_EXE $@ + cmd_stub_exe = $(CC) -nostdlib -o $@ \ + $(filter-out $(UM_GPROF_OPT) $(UM_GCOV_OPT),$(KBUILD_CFLAGS)) $(STUB_EXE_LDFLAGS) \ + $(filter %.o,$^) + +STUB_EXE_LDFLAGS = -Wl,-n -static + +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) + +# end + +# stub.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it -CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := clone.o +CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) + +# Clang will call memset() from __builtin_alloca() when stack variable +# initialization is enabled, which is used in stub_exe.c. +CFLAGS_stub_exe.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) + +UNPROFILE_OBJS := stub.o stub_exe.o +KCOV_INSTRUMENT := n -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c deleted file mode 100644 index 289771dadf81..000000000000 --- a/arch/um/kernel/skas/clone.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <signal.h> -#include <sched.h> -#include <asm/unistd.h> -#include <sys/time.h> -#include <as-layout.h> -#include <ptrace_user.h> -#include <stub-data.h> -#include <sysdep/stub.h> - -/* - * This is in a separate file because it needs to be compiled with any - * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled - * - * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize - * on some systems. 
- */ - -void __attribute__ ((__section__ (".__syscall_stub"))) -stub_clone_handler(void) -{ - struct stub_data *data = (struct stub_data *) STUB_DATA; - long err; - - err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, - STUB_DATA + UM_KERN_PAGE_SIZE / 2 - sizeof(void *)); - if (err != 0) - goto out; - - err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - if (err) - goto out; - - err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL, - (long) &data->timer, 0); - if (err) - goto out; - - remap_stack(data->fd, data->offset); - goto done; - - out: - /* - * save current result. - * Parent: pid; - * child: retcode of mmap already saved and it jumps around this - * assignment - */ - data->err = err; - done: - trap_myself(); -} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index ff03067a3b14..00957788591b 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -1,178 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/mm.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> + +#include <shared/irq_kern.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <os.h> #include <skas.h> +#include <stub-data.h> -extern int __syscall_stub_start; +/* Ensure the stub_data struct covers the allocated area */ +static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); -static int init_stub_pte(struct mm_struct *mm, unsigned long proc, - unsigned long kernel) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = pgd_offset(mm, proc); - pud = pud_alloc(mm, pgd, proc); - if (!pud) - goto out; +static spinlock_t mm_list_lock; +static struct list_head mm_list; - pmd = pmd_alloc(mm, pud, proc); - if (!pmd) - goto out_pmd; +void enter_turnstile(struct mm_id *mm_id) __acquires(turnstile) +{ + struct mm_context *ctx = container_of(mm_id, struct mm_context, id); - pte = pte_alloc_map(mm, NULL, pmd, proc); - if (!pte) - goto out_pte; + mutex_lock(&ctx->turnstile); +} - *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); - *pte = pte_mkread(*pte); - return 0; +void exit_turnstile(struct mm_id *mm_id) __releases(turnstile) +{ + struct mm_context *ctx = container_of(mm_id, struct mm_context, id); - out_pte: - pmd_free(mm, pmd); - out_pmd: - pud_free(mm, pud); - out: - return -ENOMEM; + mutex_unlock(&ctx->turnstile); } int init_new_context(struct task_struct *task, struct mm_struct *mm) { - struct mm_context *from_mm = NULL; - struct mm_context *to_mm = &mm->context; + struct mm_id *new_id = &mm->context.id; unsigned long stack = 0; int ret = -ENOMEM; - if (skas_needs_stub) { - stack = get_zeroed_page(GFP_KERNEL); - if (stack == 0) - goto out; - } + mutex_init(&mm->context.turnstile); + spin_lock_init(&mm->context.sync_tlb_lock); - to_mm->id.stack = stack; - if (current->mm != NULL && current->mm != &init_mm) - from_mm = ¤t->mm->context; + stack = __get_free_pages(GFP_KERNEL | __GFP_ZERO, ilog2(STUB_DATA_PAGES)); + if (stack == 0) + goto out; - if (proc_mm) { - ret = new_mm(stack); - if (ret < 0) { - printk(KERN_ERR "init_new_context_skas - " - "new_mm failed, errno = %d\n", ret); - goto out_free; - } - to_mm->id.u.mm_fd = ret; - } - else { - if (from_mm) - to_mm->id.u.pid = copy_context_skas0(stack, - from_mm->id.u.pid); - else 
to_mm->id.u.pid = start_userspace(stack); - - if (to_mm->id.u.pid < 0) { - ret = to_mm->id.u.pid; - goto out_free; - } + new_id->stack = stack; + new_id->syscall_data_len = 0; + new_id->syscall_fd_num = 0; + + scoped_guard(spinlock_irqsave, &mm_list_lock) { + /* Insert into list, used for lookups when the child dies */ + list_add(&mm->context.list, &mm_list); } - ret = init_new_ldt(to_mm, from_mm); - if (ret < 0) { - printk(KERN_ERR "init_new_context_skas - init_ldt" - " failed, errno = %d\n", ret); + ret = start_userspace(new_id); + if (ret < 0) goto out_free; - } + + /* Ensure the new MM is clean and nothing unwanted is mapped */ + unmap(new_id, 0, STUB_START); return 0; out_free: - if (to_mm->id.stack != 0) - free_page(to_mm->id.stack); + free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); out: return ret; } -void uml_setup_stubs(struct mm_struct *mm) +void destroy_context(struct mm_struct *mm) { - int err, ret; + struct mm_context *mmu = &mm->context; - if (!skas_needs_stub) + /* + * If init_new_context wasn't called, this will be + * zero, resulting in a kill(0), which will result in the + * whole UML suddenly dying. Also, cover negative and + * 1 cases, since they shouldn't happen either. + * + * Negative cases happen if the child died unexpectedly. + */ + if (mmu->id.pid >= 0 && mmu->id.pid < 2) { + printk(KERN_ERR "corrupt mm_context - pid = %d\n", + mmu->id.pid); return; + } - ret = init_stub_pte(mm, STUB_CODE, - (unsigned long) &__syscall_stub_start); - if (ret) - goto out; - - ret = init_stub_pte(mm, STUB_DATA, mm->context.id.stack); - if (ret) - goto out; - - mm->context.stub_pages[0] = virt_to_page(&__syscall_stub_start); - mm->context.stub_pages[1] = virt_to_page(mm->context.id.stack); + scoped_guard(spinlock_irqsave, &mm_list_lock) + list_del(&mm->context.list); - /* dup_mmap already holds mmap_sem */ - err = install_special_mapping(mm, STUB_START, STUB_END - STUB_START, - VM_READ | VM_MAYREAD | VM_EXEC | - VM_MAYEXEC | VM_DONTCOPY, - mm->context.stub_pages); - if (err) { - printk(KERN_ERR "install_special_mapping returned %d\n", err); - goto out; + if (mmu->id.pid > 0) { + os_kill_ptraced_process(mmu->id.pid, 1); + mmu->id.pid = -1; } - return; -out: - force_sigsegv(SIGSEGV, current); + if (using_seccomp && mmu->id.sock) + os_close_file(mmu->id.sock); + + free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); } -void arch_exit_mmap(struct mm_struct *mm) +static irqreturn_t mm_sigchld_irq(int irq, void* dev) { - pte_t *pte; + struct mm_context *mm_context; + pid_t pid; - pte = virt_to_pte(mm, STUB_CODE); - if (pte != NULL) - pte_clear(mm, STUB_CODE, pte); + guard(spinlock)(&mm_list_lock); - pte = virt_to_pte(mm, STUB_DATA); - if (pte == NULL) - return; + while ((pid = os_reap_child()) > 0) { + /* + * A child died, check if we have an MM with the PID. This is + * only relevant in SECCOMP mode (as ptrace will fail anyway). + * + * See wait_stub_done_seccomp for more details. + */ + list_for_each_entry(mm_context, &mm_list, list) { + if (mm_context->id.pid == pid) { + struct stub_data *stub_data; + printk("Unexpectedly lost MM child! Affected tasks will segfault."); + + /* Marks the MM as dead */ + mm_context->id.pid = -1; + + stub_data = (void *)mm_context->id.stack; + stub_data->futex = FUTEX_IN_KERN; +#if IS_ENABLED(CONFIG_SMP) + os_futex_wake(&stub_data->futex); +#endif + + /* + * NOTE: Currently executing syscalls by + * affected tasks may finish normally. 
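guard(spinlock)(...) and scoped_guard(spinlock_irqsave, ...) come from <linux/cleanup.h> and drop the lock automatically when the scope ends, including on early return. The mechanism underneath is the compiler's cleanup attribute, which the userspace analogue below applies to a pthread mutex; GUARD and unlock_cleanup are made-up names for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* rough analogue of guard(...)(...) from <linux/cleanup.h> */
#define GUARD(m) \
	pthread_mutex_t *scope_guard __attribute__((cleanup(unlock_cleanup), unused)) = \
		(pthread_mutex_lock(m), (m))

static int list_length;

static void add_entry(void)
{
	GUARD(&list_lock);	/* unlocked automatically at every exit below */

	list_length++;
	if (list_length > 100)
		return;		/* early return still unlocks */
	printf("entries: %d\n", list_length);
}

int main(void)
{
	add_entry();
	return 0;
}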
+ */ + break; + } + } + } - pte_clear(mm, STUB_DATA, pte); + return IRQ_HANDLED; } -void destroy_context(struct mm_struct *mm) +static int __init init_child_tracking(void) { - struct mm_context *mmu = &mm->context; + int err; - if (proc_mm) - os_close_file(mmu->id.u.mm_fd); - else { - /* - * If init_new_context wasn't called, this will be - * zero, resulting in a kill(0), which will result in the - * whole UML suddenly dying. Also, cover negative and - * 1 cases, since they shouldn't happen either. - */ - if (mmu->id.u.pid < 2) { - printk(KERN_ERR "corrupt mm_context - pid = %d\n", - mmu->id.u.pid); - return; - } - os_kill_ptraced_process(mmu->id.u.pid, 1); - } + spin_lock_init(&mm_list_lock); + INIT_LIST_HEAD(&mm_list); - if (skas_needs_stub) - free_page(mmu->id.stack); + err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL); + if (err < 0) + panic("Failed to register SIGCHLD IRQ: %d", err); - free_ldt(mmu); + return 0; } +early_initcall(init_child_tracking) diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 4da11b3c8ddb..4a7673b0261a 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -1,73 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/init.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/task.h> +#include <linux/smp-internal.h> + +#include <asm/tlbflush.h> + #include <as-layout.h> #include <kern.h> #include <os.h> #include <skas.h> - -int new_mm(unsigned long stack) -{ - int fd, err; - - fd = os_open_file("/proc/mm", of_cloexec(of_write(OPENFLAGS())), 0); - if (fd < 0) - return fd; - - if (skas_needs_stub) { - err = map_stub_pages(fd, STUB_CODE, STUB_DATA, stack); - if (err) { - os_close_file(fd); - return err; - } - } - - return fd; -} +#include <kern_util.h> extern void start_kernel(void); static int __init start_kernel_proc(void *unused) { - int pid; - - block_signals(); - pid = os_getpid(); + block_signals_trace(); - cpu_tasks[0].pid = pid; - cpu_tasks[0].task = current; -#ifdef CONFIG_SMP - init_cpu_online(get_cpu_mask(0)); -#endif start_kernel(); return 0; } -extern int userspace_pid[]; - -extern char cpu0_irqstack[]; +char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE); int __init start_uml(void) { - stack_protections((unsigned long) &cpu0_irqstack); - set_sigstack(cpu0_irqstack, THREAD_SIZE); - if (proc_mm) { - userspace_pid[0] = start_userspace(0); - if (userspace_pid[0] < 0) { - printf("start_uml - start_userspace returned %d\n", - userspace_pid[0]); - exit(1); - } - } + stack_protections((unsigned long) &cpu_irqstacks[0]); + set_sigstack(cpu_irqstacks[0], THREAD_SIZE); init_new_thread_signals(); - init_task.thread.request.u.thread.proc = start_kernel_proc; - init_task.thread.request.u.thread.arg = NULL; + init_task.thread.request.thread.proc = start_kernel_proc; + init_task.thread.request.thread.arg = NULL; return start_idle_thread(task_stack_page(&init_task), &init_task.thread.switch_buf); } @@ -79,3 +49,31 @@ unsigned long current_stub_stack(void) return current->mm->context.id.stack; } + +struct mm_id *current_mm_id(void) +{ + if (current->mm == NULL) + return NULL; + + return ¤t->mm->context.id; +} + +void current_mm_sync(void) +{ + if (current->mm == NULL) + return; + + um_tlb_sync(current->mm); +} + +static DEFINE_SPINLOCK(initial_jmpbuf_spinlock); + +void initial_jmpbuf_lock(void) +{ + 
spin_lock_irq(&initial_jmpbuf_spinlock); +} + +void initial_jmpbuf_unlock(void) +{ + spin_unlock_irq(&initial_jmpbuf_spinlock); +} diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c new file mode 100644 index 000000000000..67cab46a602c --- /dev/null +++ b/arch/um/kernel/skas/stub.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> + */ + +#include <sysdep/stub.h> + +#include <linux/futex.h> +#include <sys/socket.h> +#include <errno.h> + +/* + * Known security issues + * + * Userspace can jump to this address to execute *any* syscall that is + * permitted by the stub. As we will return afterwards, it can do + * whatever it likes, including: + * - Tricking the kernel into handing out the memory FD + * - Using this memory FD to read/write all physical memory + * - Running in parallel to the kernel processing a syscall + * (possibly creating data races?) + * - Blocking e.g. SIGALRM to avoid time based scheduling + * + * To avoid this, the permitted location for each syscall needs to be + * checked for in the SECCOMP filter (which is reasonably simple). Also, + * more care will need to go into considerations how the code might be + * tricked by using a prepared stack (or even modifying the stack from + * another thread in case SMP support is added). + * + * As for the SIGALRM, the best counter measure will be to check in the + * kernel that the process is reporting back the SIGALRM in a timely + * fashion. + */ +static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS]) +{ + struct stub_data *d = get_stub_data(); + int i; + unsigned long res; + int fd; + + for (i = 0; i < d->syscall_data_len; i++) { + struct stub_syscall *sc = &d->syscall_data[i]; + + switch (sc->syscall) { + case STUB_SYSCALL_MMAP: + if (fd_map) + fd = fd_map[sc->mem.fd]; + else + fd = sc->mem.fd; + + res = stub_syscall6(STUB_MMAP_NR, + sc->mem.addr, sc->mem.length, + sc->mem.prot, + MAP_SHARED | MAP_FIXED, + fd, sc->mem.offset); + if (res != sc->mem.addr) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + case STUB_SYSCALL_MUNMAP: + res = stub_syscall2(__NR_munmap, + sc->mem.addr, sc->mem.length); + if (res) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + default: + d->err = -95; /* EOPNOTSUPP */ + d->syscall_data_len = i; + return -1; + } + } + + d->err = 0; + d->syscall_data_len = 0; + + return 0; +} + +void __section(".__syscall_stub") +stub_syscall_handler(void) +{ + syscall_handler(NULL); + + trap_myself(); +} + +void __section(".__syscall_stub") +stub_signal_interrupt(int sig, siginfo_t *info, void *p) +{ + struct stub_data *d = get_stub_data(); + char rcv_data; + union { + char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)]; + struct cmsghdr align; + } ctrl = {}; + struct iovec iov = { + .iov_base = &rcv_data, + .iov_len = 1, + }; + struct msghdr msghdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &ctrl, + .msg_controllen = sizeof(ctrl), + }; + ucontext_t *uc = p; + struct cmsghdr *fd_msg; + int *fd_map; + int num_fds; + long res; + + d->signal = sig; + d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0]; + d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0]; + +restart_wait: + d->futex = FUTEX_IN_KERN; + do { + res = stub_syscall3(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAKE, 1); + } while (res == -EINTR); + + do { + res = stub_syscall4(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAIT, FUTEX_IN_KERN, 0); + 
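The futex field is the rendezvous point between stub and UML kernel: the stub marks itself FUTEX_IN_KERN, wakes the waiting kernel, then sleeps until the state is flipped back. The same wait/wake handshake between two ordinary threads, using the raw futex syscall; the IN_STUB/IN_KERN values are illustrative, and the real stub additionally copes with restart and spurious wakeups:

#include <linux/futex.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IN_STUB	0
#define IN_KERN	1

static int state = IN_STUB;

static long futex(int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *kernel_side(void *arg)
{
	while (__atomic_load_n(&state, __ATOMIC_ACQUIRE) != IN_KERN)
		futex(&state, FUTEX_WAIT, IN_STUB);	/* sleep while still IN_STUB */
	printf("kernel side: woken up\n");

	__atomic_store_n(&state, IN_STUB, __ATOMIC_RELEASE);
	futex(&state, FUTEX_WAKE, 1);			/* hand control back */
	return arg;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, kernel_side, NULL);

	__atomic_store_n(&state, IN_KERN, __ATOMIC_RELEASE);
	futex(&state, FUTEX_WAKE, 1);			/* wake the "kernel" */

	while (__atomic_load_n(&state, __ATOMIC_ACQUIRE) != IN_STUB)
		futex(&state, FUTEX_WAIT, IN_KERN);
	printf("stub side: control returned\n");

	pthread_join(t, NULL);
	return 0;
}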
} while (res == -EINTR || d->futex == FUTEX_IN_KERN); + + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + if (d->syscall_data_len) { + /* Read passed FDs (if any) */ + do { + res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0); + } while (res == -EINTR); + + /* We should never have a receive error (other than -EAGAIN) */ + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + /* Receive the FDs */ + num_fds = 0; + fd_msg = msghdr.msg_control; + fd_map = (void *)&CMSG_DATA(fd_msg); + if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr)) + num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + /* Try running queued syscalls. */ + res = syscall_handler(fd_map); + + while (num_fds) + stub_syscall2(__NR_close, fd_map[--num_fds], 0); + } else { + res = 0; + } + + if (res < 0 || d->restart_wait) { + /* Report SIGSYS if we restart. */ + d->signal = SIGSYS; + d->restart_wait = 0; + + goto restart_wait; + } + + /* Restore arch dependent state that is not part of the mcontext */ + stub_seccomp_restore_state(&d->arch_data); + + /* Return so that the host modified mcontext is restored. */ +} + +void __section(".__syscall_stub") +stub_signal_restorer(void) +{ + /* We must not have anything on the stack when doing rt_sigreturn */ + stub_syscall0(__NR_rt_sigreturn); +} diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c new file mode 100644 index 000000000000..cbafaa684e66 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe.c @@ -0,0 +1,230 @@ +#include <sys/ptrace.h> +#include <sys/prctl.h> +#include <sys/fcntl.h> +#include <asm/unistd.h> +#include <sysdep/stub.h> +#include <stub-data.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <generated/asm-offsets.h> + +void _start(void); + +noinline static void real_init(void) +{ + struct stub_init_data init_data; + unsigned long res; + struct { + void *ss_sp; + int ss_flags; + size_t ss_size; + } stack = { + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + }; + struct { + void *sa_handler_; + unsigned long sa_flags; + void *sa_restorer; + unsigned long long sa_mask; + } sa = { + /* Need to set SA_RESTORER (but the handler never returns) */ + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, + }; + + /* set a nice name */ + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); + + /* Make sure this process dies if the kernel dies */ + stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + + /* Needed in SECCOMP mode (and safe to do anyway) */ + stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* read information from STDIN and close it */ + res = stub_syscall3(__NR_read, 0, + (unsigned long)&init_data, sizeof(init_data)); + if (res != sizeof(init_data)) + stub_syscall1(__NR_exit, 10); + + /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */ + if (!init_data.seccomp) + stub_syscall1(__NR_close, 0); + else + stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK); + + /* map stub code + data */ + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED, + init_data.stub_code_fd, init_data.stub_code_offset); + if (res != init_data.stub_start) + stub_syscall1(__NR_exit, 11); + + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start + UM_KERN_PAGE_SIZE, + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, + init_data.stub_data_fd, init_data.stub_data_offset); + if (res != 
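The recvmsg()/CMSG parsing above is the receiving half of SCM_RIGHTS descriptor passing; the sending half, done by the UML kernel over the stub's socket and not visible in this file, follows the usual pattern. A sketch with an assumed send_fd() helper:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* send one byte of data plus one file descriptor over a unix socket */
static int send_fd(int sock, int fd_to_pass)
{
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl = { 0 };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = ctrl.buf,
		.msg_controllen = sizeof(ctrl.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}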
init_data.stub_start + UM_KERN_PAGE_SIZE) + stub_syscall1(__NR_exit, 12); + + /* In SECCOMP mode, we only need the signalling FD from now on */ + if (init_data.seccomp) { + res = stub_syscall3(__NR_close_range, 1, ~0U, 0); + if (res != 0) + stub_syscall1(__NR_exit, 13); + } + + /* setup signal stack inside stub data */ + stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); + + /* register signal handlers */ + sa.sa_handler_ = (void *) init_data.signal_handler; + sa.sa_restorer = (void *) init_data.signal_restorer; + if (!init_data.seccomp) { + /* In ptrace mode, the SIGSEGV handler never returns */ + sa.sa_mask = 0; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 14); + } else { + /* SECCOMP mode uses rt_sigreturn, need to mask all signals */ + sa.sa_mask = ~0ULL; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 15); + + res = stub_syscall4(__NR_rt_sigaction, SIGSYS, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 16); + + res = stub_syscall4(__NR_rt_sigaction, SIGALRM, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 17); + + res = stub_syscall4(__NR_rt_sigaction, SIGTRAP, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 18); + + res = stub_syscall4(__NR_rt_sigaction, SIGILL, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 19); + + res = stub_syscall4(__NR_rt_sigaction, SIGFPE, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 20); + } + + /* + * If in seccomp mode, install the SECCOMP filter and trigger a syscall. + * Otherwise set PTRACE_TRACEME and do a SIGSTOP. 
+ */ + if (init_data.seccomp) { + struct sock_filter filter[] = { +#if __BITS_PER_LONG > 32 + /* [0] Load upper 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer) + 4)), + + /* [1] Jump forward 3 instructions if the upper address is not identical */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3), +#endif + /* [2] Load lower 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer))), + + /* [3] Mask out lower bits */ + BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000), + + /* [4] Jump to [6] if the lower bits are not on the expected page */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0), + + /* [5] Trap call, allow */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + + /* [6,7] Check architecture */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, arch)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, + UM_SECCOMP_ARCH_NATIVE, 1, 0), + + /* [8] Kill (for architecture check) */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [9] Load syscall number */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + + /* [10-16] Check against permitted syscalls */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, + 7, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg, + 6, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close, + 5, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR, + 4, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap, + 3, 0), +#ifdef __i386__ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area, + 2, 0), +#else + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl, + 2, 0), +#endif + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, + 1, 0), + + /* [17] Not one of the permitted syscalls */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [18] Permitted call for the stub */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = sizeof(filter) / sizeof(filter[0]), + .filter = filter, + }; + + if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, + (unsigned long)&prog) != 0) + stub_syscall1(__NR_exit, 21); + + /* Fall through, the exit syscall will cause SIGSYS */ + } else { + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + } + + stub_syscall1(__NR_exit, 30); + + __builtin_unreachable(); +} + +__attribute__((naked)) void _start(void) +{ + /* + * Since the stack after exec() starts at the top-most address, + * but that's exactly where we also want to map the stub data + * and code, this must: + * - push the stack by 1 code and STUB_DATA_PAGES data pages + * - call real_init() + * This way, real_init() can use the stack normally, while the + * original stack further down (higher address) will become + * inaccessible after the mmap() calls above. 
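For comparison, a plain syscall allow-list like this is normally written with libseccomp, as sketched below; what libseccomp cannot express is the instruction-pointer match above, and that match is the part that confines the allowed syscalls to the stub page (the stub is also built nostdlib, so it could not link the library anyway):

#include <seccomp.h>

static int install_allow_list(void)
{
	scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_KILL_PROCESS);

	if (!ctx)
		return -1;

	seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(futex), 0);
	seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(recvmsg), 0);
	seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(close), 0);
	seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(mmap), 0);
	seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(munmap), 0);
	seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(rt_sigreturn), 0);

	if (seccomp_load(ctx) < 0) {
		seccomp_release(ctx);
		return -1;
	}
	seccomp_release(ctx);
	return 0;
}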
+ */ + stub_start(real_init); +} diff --git a/arch/um/kernel/skas/stub_exe_embed.S b/arch/um/kernel/skas/stub_exe_embed.S new file mode 100644 index 000000000000..6d8914fbe8f1 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe_embed.S @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> + +__INITDATA + +SYM_DATA_START(stub_exe_start) + .incbin "arch/um/kernel/skas/stub_exe" +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) + +__FINIT diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index c0681e097432..ba7494f9bfe4 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -1,40 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/kernel.h> #include <linux/ptrace.h> +#include <linux/seccomp.h> #include <kern_util.h> #include <sysdep/ptrace.h> -#include <sysdep/syscalls.h> - -extern int syscall_table_size; -#define NR_SYSCALLS (syscall_table_size / sizeof(void *)) +#include <sysdep/ptrace_user.h> +#include <linux/time-internal.h> +#include <asm/syscall.h> +#include <asm/unistd.h> +#include <asm/delay.h> void handle_syscall(struct uml_pt_regs *r) { struct pt_regs *regs = container_of(r, struct pt_regs, regs); - long result; int syscall; - syscall_trace_enter(regs); + /* Initialize the syscall number and default return value. */ + UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); + PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); + + if (syscall_trace_enter(regs)) + goto out; + + /* Do the seccomp check after ptrace; failures should be fast. */ + if (secure_computing() == -1) + goto out; + + syscall = UPT_SYSCALL_NR(r); /* - * This should go in the declaration of syscall, but when I do that, - * strace -f -c bash -c 'ls ; ls' breaks, sometimes not tracing - * children at all, sometimes hanging when bash doesn't see the first - * ls exit. - * The assembly looks functionally the same to me. This is - * gcc version 4.0.1 20050727 (Red Hat 4.0.1-5) - * in case it's a compiler bug. + * If no time passes, then sched_yield may not actually yield, causing + * broken spinlock implementations in userspace (ASAN) to hang for long + * periods of time. */ - syscall = UPT_SYSCALL_NR(r); - if ((syscall >= NR_SYSCALLS) || (syscall < 0)) - result = -ENOSYS; - else result = EXECUTE_SYSCALL(syscall, regs); + if ((time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) && + syscall == __NR_sched_yield) + tt_extra_sched_jiffies += 1; + + if (syscall >= 0 && syscall < __NR_syscalls) { + unsigned long ret; + + ret = (*sys_call_table[syscall])(UPT_SYSCALL_ARG1(&regs->regs), + UPT_SYSCALL_ARG2(&regs->regs), + UPT_SYSCALL_ARG3(&regs->regs), + UPT_SYSCALL_ARG4(&regs->regs), + UPT_SYSCALL_ARG5(&regs->regs), + UPT_SYSCALL_ARG6(&regs->regs)); + + PT_REGS_SET_SYSCALL_RETURN(regs, ret); - PT_REGS_SET_SYSCALL_RETURN(regs, result); + /* + * An error value here can be some form of -ERESTARTSYS + * and then we'd just loop. Make any error syscalls take + * some time, so that it won't just loop if something is + * not ready, and hopefully other things will make some + * progress. 
+ */ + if (IS_ERR_VALUE(ret) && + (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL)) { + um_udelay(1); + schedule(); + } + } +out: syscall_trace_leave(regs); } diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index 1d3e0c17340b..198269e384c4 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/err.h> @@ -10,13 +10,14 @@ #include <linux/sched.h> #include <asm/current.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <kern_util.h> +#include <asm/futex.h> #include <os.h> pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -27,7 +28,11 @@ pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr) if (!pgd_present(*pgd)) return NULL; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); if (!pud_present(*pud)) return NULL; @@ -59,38 +64,38 @@ static pte_t *maybe_map(unsigned long virt, int is_write) static int do_op_one_page(unsigned long addr, int len, int is_write, int (*op)(unsigned long addr, int len, void *arg), void *arg) { - jmp_buf buf; struct page *page; pte_t *pte; - int n, faulted; + int n; pte = maybe_map(addr, is_write); if (pte == NULL) return -1; page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + addr = (unsigned long) page_address(page) + + (addr & ~PAGE_MASK); +#else addr = (unsigned long) kmap_atomic(page) + (addr & ~PAGE_MASK); +#endif + n = (*op)(addr, len, arg); - current->thread.fault_catcher = &buf; - - faulted = UML_SETJMP(&buf); - if (faulted == 0) - n = (*op)(addr, len, arg); - else - n = -1; - - current->thread.fault_catcher = NULL; - +#ifdef CONFIG_64BIT + pagefault_enable(); +#else kunmap_atomic((void *)addr); +#endif return n; } -static int buffer_op(unsigned long addr, int len, int is_write, - int (*op)(unsigned long, int, void *), void *arg) +static long buffer_op(unsigned long addr, int len, int is_write, + int (*op)(unsigned long, int, void *), void *arg) { - int size, remain, n; + long size, remain, n; size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len); remain = len; @@ -139,18 +144,11 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg) return 0; } -int copy_from_user(void *to, const void __user *from, int n) +unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (segment_eq(get_fs(), KERNEL_DS)) { - memcpy(to, (__force void*)from, n); - return 0; - } - - return access_ok(VERIFY_READ, from, n) ? - buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to): - n; + return buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to); } -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(raw_copy_from_user); static int copy_chunk_to_user(unsigned long to, int len, void *arg) { @@ -161,18 +159,11 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg) return 0; } -int copy_to_user(void __user *to, const void *from, int n) +unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (segment_eq(get_fs(), KERNEL_DS)) { - memcpy((__force void *) to, from, n); - return 0; - } - - return access_ok(VERIFY_WRITE, to, n) ? 
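buffer_op() walks the user range one page at a time: first the partial chunk up to the next page boundary, then whole pages, then the tail. The same split computed standalone for a range that starts just short of a boundary:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x1ff8;	/* 8 bytes before a page boundary */
	unsigned long len = 6000;
	unsigned long first, remain;

	first = PAGE_ALIGN(addr) - addr;
	if (first > len)
		first = len;
	remain = len - first;

	/* prints: first chunk 8, then 1 full page(s), tail 1896 */
	printf("first chunk %lu, then %lu full page(s), tail %lu\n",
	       first, remain / PAGE_SIZE, remain % PAGE_SIZE);
	return 0;
}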
- buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from) : - n; + return buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from); } -EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(raw_copy_to_user); static int strncpy_chunk_from_user(unsigned long from, int len, void *arg) { @@ -188,19 +179,13 @@ static int strncpy_chunk_from_user(unsigned long from, int len, void *arg) return 0; } -int strncpy_from_user(char *dst, const char __user *src, int count) +long strncpy_from_user(char *dst, const char __user *src, long count) { - int n; + long n; char *ptr = dst; - if (segment_eq(get_fs(), KERNEL_DS)) { - strncpy(dst, (__force void *) src, count); - return strnlen(dst, count); - } - - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return -EFAULT; - n = buffer_op((unsigned long) src, count, 0, strncpy_chunk_from_user, &ptr); if (n != 0) @@ -215,22 +200,11 @@ static int clear_chunk(unsigned long addr, int len, void *unused) return 0; } -int __clear_user(void __user *mem, int len) +unsigned long __clear_user(void __user *mem, unsigned long len) { return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL); } - -int clear_user(void __user *mem, int len) -{ - if (segment_eq(get_fs(), KERNEL_DS)) { - memset((__force void*)mem, 0, len); - return 0; - } - - return access_ok(VERIFY_WRITE, mem, len) ? - buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL) : len; -} -EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__clear_user); static int strnlen_chunk(unsigned long str, int len, void *arg) { @@ -244,16 +218,151 @@ static int strnlen_chunk(unsigned long str, int len, void *arg) return 0; } -int strnlen_user(const void __user *str, int len) +long strnlen_user(const char __user *str, long len) { int count = 0, n; - if (segment_eq(get_fs(), KERNEL_DS)) - return strnlen((__force char*)str, len) + 1; - + if (!access_ok(str, 1)) + return -EFAULT; n = buffer_op((unsigned long) str, len, 0, strnlen_chunk, &count); if (n == 0) return count + 1; - return -EFAULT; + return 0; } EXPORT_SYMBOL(strnlen_user); + +/** + * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant + * argument and comparison of the previous + * futex value with another constant. 
+ * + * @op: operation to execute + * @oparg: argument to operation + * @oval: old value at uaddr + * @uaddr: pointer to user space address + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + * -ENOSYS - Operation not supported + */ + +int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) +{ + int oldval, ret; + struct page *page; + unsigned long addr = (unsigned long) uaddr; + pte_t *pte; + + ret = -EFAULT; + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + preempt_disable(); + pte = maybe_map(addr, 1); + if (pte == NULL) + goto out_inuser; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + addr = (unsigned long) page_address(page) + + (((unsigned long) addr) & ~PAGE_MASK); +#else + addr = (unsigned long) kmap_atomic(page) + + ((unsigned long) addr & ~PAGE_MASK); +#endif + uaddr = (u32 *) addr; + oldval = *uaddr; + + ret = 0; + + switch (op) { + case FUTEX_OP_SET: + *uaddr = oparg; + break; + case FUTEX_OP_ADD: + *uaddr += oparg; + break; + case FUTEX_OP_OR: + *uaddr |= oparg; + break; + case FUTEX_OP_ANDN: + *uaddr &= ~oparg; + break; + case FUTEX_OP_XOR: + *uaddr ^= oparg; + break; + default: + ret = -ENOSYS; + } +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic((void *)addr); +#endif + +out_inuser: + preempt_enable(); + + if (ret == 0) + *oval = oldval; + + return ret; +} +EXPORT_SYMBOL(arch_futex_atomic_op_inuser); + +/** + * futex_atomic_cmpxchg_inatomic() - Compare and exchange the content of the + * uaddr with newval if the current value is + * oldval. + * @uval: pointer to store content of @uaddr + * @uaddr: pointer to user space address + * @oldval: old value + * @newval: new value to store to @uaddr + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + */ + +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + struct page *page; + pte_t *pte; + int ret = -EFAULT; + + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + + preempt_disable(); + pte = maybe_map((unsigned long) uaddr, 1); + if (pte == NULL) + goto out_inatomic; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + uaddr = page_address(page) + (((unsigned long) uaddr) & ~PAGE_MASK); +#else + uaddr = kmap_atomic(page) + ((unsigned long) uaddr & ~PAGE_MASK); +#endif + + *uval = *uaddr; + + ret = cmpxchg(uaddr, oldval, newval); + +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic(uaddr); +#endif + ret = 0; + +out_inatomic: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(futex_atomic_cmpxchg_inatomic); diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 5c8c3ea7db7b..f1e52b7348fb 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -1,238 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + * + * Based on the previous implementation in TT mode * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ -#include <linux/percpu.h> -#include <asm/pgalloc.h> -#include <asm/tlb.h> - -#ifdef CONFIG_SMP - #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/module.h> +#include <linux/processor.h> #include <linux/threads.h> -#include <linux/interrupt.h> -#include 
<linux/err.h> +#include <linux/cpu.h> #include <linux/hardirq.h> -#include <asm/smp.h> -#include <asm/processor.h> -#include <asm/spinlock.h> +#include <linux/smp.h> +#include <linux/smp-internal.h> +#include <init.h> #include <kern.h> -#include <irq_user.h> #include <os.h> +#include <smp.h> -/* Per CPU bogomips and other parameters - * The only piece used here is the ipi pipe, which is set before SMP is - * started and never changed. - */ -struct cpuinfo_um cpu_data[NR_CPUS]; +enum { + UML_IPI_RES = 0, + UML_IPI_CALL_SINGLE, + UML_IPI_CALL, + UML_IPI_STOP, +}; -/* A statistic, can be a little off */ -int num_reschedules_sent = 0; +void arch_smp_send_reschedule(int cpu) +{ + os_send_ipi(cpu, UML_IPI_RES); +} -/* Not changed after boot */ -struct task_struct *idle_threads[NR_CPUS]; +void arch_send_call_function_single_ipi(int cpu) +{ + os_send_ipi(cpu, UML_IPI_CALL_SINGLE); +} -void smp_send_reschedule(int cpu) +void arch_send_call_function_ipi_mask(const struct cpumask *mask) { - os_write_file(cpu_data[cpu].ipi_pipe[1], "R", 1); - num_reschedules_sent++; + int cpu; + + for_each_cpu(cpu, mask) + os_send_ipi(cpu, UML_IPI_CALL); } void smp_send_stop(void) { - int i; + int cpu, me = smp_processor_id(); - printk(KERN_INFO "Stopping all CPUs..."); - for (i = 0; i < num_online_cpus(); i++) { - if (i == current_thread->cpu) + for_each_online_cpu(cpu) { + if (cpu == me) continue; - os_write_file(cpu_data[i].ipi_pipe[1], "S", 1); + os_send_ipi(cpu, UML_IPI_STOP); } - printk(KERN_CONT "done\n"); } -static cpumask_t smp_commenced_mask = CPU_MASK_NONE; -static cpumask_t cpu_callin_map = CPU_MASK_NONE; +static void ipi_handler(int vector, struct uml_pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs((struct pt_regs *)regs); + int cpu = raw_smp_processor_id(); + + irq_enter(); + + if (current->mm) + os_alarm_process(current->mm->context.id.pid); + + switch (vector) { + case UML_IPI_RES: + inc_irq_stat(irq_resched_count); + scheduler_ipi(); + break; + + case UML_IPI_CALL_SINGLE: + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); + break; + + case UML_IPI_CALL: + inc_irq_stat(irq_call_count); + generic_smp_call_function_interrupt(); + break; + + case UML_IPI_STOP: + set_cpu_online(cpu, false); + while (1) + pause(); + break; + + default: + pr_err("CPU#%d received unknown IPI (vector=%d)!\n", cpu, vector); + break; + } + + irq_exit(); + set_irq_regs(old_regs); +} -static int idle_proc(void *cpup) +void uml_ipi_handler(int vector) { - int cpu = (int) cpup, err; + struct uml_pt_regs r = { .is_user = 0 }; - err = os_pipe(cpu_data[cpu].ipi_pipe, 1, 1); - if (err < 0) - panic("CPU#%d failed to create IPI pipe, err = %d", cpu, -err); + preempt_disable(); + ipi_handler(vector, &r); + preempt_enable(); +} - os_set_fd_async(cpu_data[cpu].ipi_pipe[0]); +/* AP states used only during CPU startup */ +enum { + UML_CPU_PAUSED = 0, + UML_CPU_RUNNING, +}; - wmb(); - if (cpu_test_and_set(cpu, cpu_callin_map)) { - printk(KERN_ERR "huh, CPU#%d already present??\n", cpu); - BUG(); - } +static int cpu_states[NR_CPUS]; - while (!cpu_isset(cpu, smp_commenced_mask)) - cpu_relax(); +static int start_secondary(void *unused) +{ + int err, cpu = raw_smp_processor_id(); notify_cpu_starting(cpu); set_cpu_online(cpu, true); - default_idle(); - return 0; -} -static struct task_struct *idle_thread(int cpu) -{ - struct task_struct *new_task; - - current->thread.request.u.thread.proc = idle_proc; - current->thread.request.u.thread.arg = (void *) cpu; - new_task = fork_idle(cpu); - if (IS_ERR(new_task)) - 
panic("copy_process failed in idle_thread, error = %ld", - PTR_ERR(new_task)); - - cpu_tasks[cpu] = ((struct cpu_task) - { .pid = new_task->thread.mode.tt.extern_pid, - .task = new_task } ); - idle_threads[cpu] = new_task; - panic("skas mode doesn't support SMP"); - return new_task; -} + err = um_setup_timer(); + if (err) + panic("CPU#%d failed to setup timer, err = %d", cpu, err); -void smp_prepare_cpus(unsigned int maxcpus) -{ - struct task_struct *idle; - unsigned long waittime; - int err, cpu, me = smp_processor_id(); - int i; + local_irq_enable(); - for (i = 0; i < ncpus; ++i) - set_cpu_possible(i, true); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); - set_cpu_online(me, true); - cpu_set(me, cpu_callin_map); + return 0; +} - err = os_pipe(cpu_data[me].ipi_pipe, 1, 1); - if (err < 0) - panic("CPU#0 failed to create IPI pipe, errno = %d", -err); +void uml_start_secondary(void *opaque) +{ + int cpu = raw_smp_processor_id(); + struct mm_struct *mm = &init_mm; + struct task_struct *idle; - os_set_fd_async(cpu_data[me].ipi_pipe[0]); + stack_protections((unsigned long) &cpu_irqstacks[cpu]); + set_sigstack(&cpu_irqstacks[cpu], THREAD_SIZE); - for (cpu = 1; cpu < ncpus; cpu++) { - printk(KERN_INFO "Booting processor %d...\n", cpu); + set_cpu_present(cpu, true); + os_futex_wait(&cpu_states[cpu], UML_CPU_PAUSED); - idle = idle_thread(cpu); + smp_rmb(); /* paired with smp_wmb() in __cpu_up() */ - init_idle(idle, cpu); + idle = cpu_tasks[cpu]; + idle->thread_info.cpu = cpu; - waittime = 200000000; - while (waittime-- && !cpu_isset(cpu, cpu_callin_map)) - cpu_relax(); + mmgrab(mm); + idle->active_mm = mm; - printk(KERN_INFO "%s\n", - cpu_isset(cpu, cpu_calling_map) ? "done" : "failed"); - } -} + idle->thread.request.thread.proc = start_secondary; + idle->thread.request.thread.arg = NULL; -void smp_prepare_boot_cpu(void) -{ - set_cpu_online(smp_processor_id(), true); + new_thread(task_stack_page(idle), &idle->thread.switch_buf, + new_thread_handler); + os_start_secondary(opaque, &idle->thread.switch_buf); } -int __cpu_up(unsigned int cpu, struct task_struct *tidle) +void __init smp_prepare_cpus(unsigned int max_cpus) { - cpu_set(cpu, smp_commenced_mask); - while (!cpu_online(cpu)) - mb(); - return 0; -} + int err, cpu, me = smp_processor_id(); + unsigned long deadline; -int setup_profiling_timer(unsigned int multiplier) -{ - printk(KERN_INFO "setup_profiling_timer\n"); - return 0; -} + os_init_smp(); -void smp_call_function_slave(int cpu); + for_each_possible_cpu(cpu) { + if (cpu == me) + continue; -void IPI_handler(int cpu) -{ - unsigned char c; - int fd; - - fd = cpu_data[cpu].ipi_pipe[0]; - while (os_read_file(fd, &c, 1) == 1) { - switch (c) { - case 'C': - smp_call_function_slave(cpu); - break; - - case 'R': - scheduler_ipi(); - break; - - case 'S': - printk(KERN_INFO "CPU#%d stopping\n", cpu); - while (1) - pause(); - break; - - default: - printk(KERN_ERR "CPU#%d received unknown IPI [%c]!\n", - cpu, c); - break; + pr_debug("Booting processor %d...\n", cpu); + err = os_start_cpu_thread(cpu); + if (err) { + pr_crit("CPU#%d failed to start cpu thread, err = %d", + cpu, err); + continue; } + + deadline = jiffies + msecs_to_jiffies(1000); + spin_until_cond(cpu_present(cpu) || + time_is_before_jiffies(deadline)); + + if (!cpu_present(cpu)) + pr_crit("CPU#%d failed to boot\n", cpu); } } -int hard_smp_processor_id(void) +int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - return pid_to_processor_id(os_getpid()); -} + cpu_tasks[cpu] = tidle; + smp_wmb(); /* paired with smp_rmb() in 
uml_start_secondary() */ + cpu_states[cpu] = UML_CPU_RUNNING; + os_futex_wake(&cpu_states[cpu]); + spin_until_cond(cpu_online(cpu)); -static DEFINE_SPINLOCK(call_lock); -static atomic_t scf_started; -static atomic_t scf_finished; -static void (*func)(void *info); -static void *info; - -void smp_call_function_slave(int cpu) -{ - atomic_inc(&scf_started); - (*func)(info); - atomic_inc(&scf_finished); + return 0; }
-int smp_call_function(void (*_func)(void *info), void *_info, int wait) +void __init smp_cpus_done(unsigned int max_cpus) { - int cpus = num_online_cpus() - 1; - int i; +} - if (!cpus) - return 0; +/* Set in uml_ncpus_setup */ +int uml_ncpus = 1; - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); +void __init prefill_possible_map(void) +{ + int cpu; - spin_lock_bh(&call_lock); - atomic_set(&scf_started, 0); - atomic_set(&scf_finished, 0); - func = _func; - info = _info; + for (cpu = 0; cpu < uml_ncpus; cpu++) + set_cpu_possible(cpu, true); + for (; cpu < NR_CPUS; cpu++) + set_cpu_possible(cpu, false); +}
- for_each_online_cpu(i) - os_write_file(cpu_data[i].ipi_pipe[1], "C", 1); +static int __init uml_ncpus_setup(char *line, int *add) +{ + *add = 0; - while (atomic_read(&scf_started) != cpus) - barrier(); + if (kstrtoint(line, 10, &uml_ncpus)) { + os_warn("%s: Couldn't parse '%s'\n", __func__, line); + return -1; + } - if (wait) - while (atomic_read(&scf_finished) != cpus) - barrier(); + uml_ncpus = clamp(uml_ncpus, 1, NR_CPUS); - spin_unlock_bh(&call_lock); return 0; } -#endif
+__uml_setup("ncpus=", uml_ncpus_setup, +"ncpus=<# of desired CPUs>\n" +" This tells UML how many virtual processors to start. The maximum\n" +" number of supported virtual processors can be obtained by querying\n" +" the CONFIG_NR_CPUS option using --showconfig.\n\n" +); + +EXPORT_SYMBOL(uml_curr_cpu);
diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c new file mode 100644 index 000000000000..fd3b61b3d4d2 --- /dev/null +++ b/arch/um/kernel/stacktrace.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2013 Richard Weinberger <richard@nod.at> + * Copyright (C) 2014 Google Inc., Author: Daniel Walter <dwalter@google.com> + */ + +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/stacktrace.h>
+ +void dump_trace(struct task_struct *tsk, + const struct stacktrace_ops *ops, + void *data) +{ + int reliable = 0; + unsigned long *sp, bp, addr; + struct pt_regs *segv_regs = tsk->thread.segv_regs; + struct stack_frame *frame; + + bp = get_frame_pointer(tsk, segv_regs); + sp = get_stack_pointer(tsk, segv_regs); + + frame = (struct stack_frame *)bp; + while (((long) sp & (THREAD_SIZE-1)) != 0) { + addr = READ_ONCE_NOCHECK(*sp); + if (__kernel_text_address(addr)) { + reliable = 0; + if ((unsigned long) sp == bp + sizeof(long)) { + frame = frame ?
frame->next_frame : NULL; + bp = (unsigned long)frame; + reliable = 1; + } + ops->address(data, addr, reliable); + } + sp++; + } +} + +static void save_addr(void *data, unsigned long address, int reliable) +{ + struct stack_trace *trace = data; + + if (!reliable) + return; + if (trace->nr_entries >= trace->max_entries) + return; + + trace->entries[trace->nr_entries++] = address; +} + +static const struct stacktrace_ops dump_ops = { + .address = save_addr +}; + +static void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace) +{ + dump_trace(tsk, &dump_ops, trace); +} + +void save_stack_trace(struct stack_trace *trace) +{ + __save_stack_trace(current, trace); +} +EXPORT_SYMBOL_GPL(save_stack_trace); + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + __save_stack_trace(tsk, trace); +} +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c deleted file mode 100644 index c1d0ae069b53..000000000000 --- a/arch/um/kernel/syscall.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/utsname.h> -#include <linux/syscalls.h> -#include <asm/current.h> -#include <asm/mman.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> - -long old_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset) -{ - long err = -EINVAL; - if (offset & ~PAGE_MASK) - goto out; - - err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); - out: - return err; -} diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 0dc4d1c6f98a..13ee5666668d 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -1,66 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL + * Copyright (C) 2013 Richard Weinberger <richrd@nod.at> */ #include <linux/kallsyms.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> -#include <asm/sysrq.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> -/* Catch non-i386 SUBARCH's. */ -#if !defined(CONFIG_UML_X86) || defined(CONFIG_64BIT) -void show_trace(struct task_struct *task, unsigned long * stack) -{ - unsigned long addr; +#include <asm/stacktrace.h> +#include <os.h> - if (!stack) { - stack = (unsigned long*) &stack; - WARN_ON(1); - } +static void _print_addr(void *data, unsigned long address, int reliable) +{ + const char *loglvl = data; - printk(KERN_INFO "Call Trace: \n"); - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack; - if (__kernel_text_address(addr)) { - printk(KERN_INFO "%08lx: [<%08lx>]", - (unsigned long) stack, addr); - print_symbol(KERN_CONT " %s", addr); - printk(KERN_CONT "\n"); - } - stack++; - } - printk(KERN_INFO "\n"); + printk("%s [<%08lx>] %s%pS\n", loglvl, address, reliable ? "" : "? 
", + (void *)address); } -#endif -/*Stolen from arch/i386/kernel/traps.c */ -static const int kstack_depth_to_print = 24; +static const struct stacktrace_ops stackops = { + .address = _print_addr +}; -/* This recently started being used in arch-independent code too, as in - * kernel/sched/core.c.*/ -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { - unsigned long *stack; + struct pt_regs *segv_regs = current->thread.segv_regs; int i; - if (esp == NULL) { - if (task != current && task != NULL) { - esp = (unsigned long *) KSTK_ESP(task); - } else { - esp = (unsigned long *) &esp; - } - } + if (!stack) + stack = get_stack_pointer(task, segv_regs); - stack = esp; - for (i = 0; i < kstack_depth_to_print; i++) { + printk("%sStack:\n", loglvl); + for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) { if (kstack_end(stack)) break; - if (i && ((i % 8) == 0)) - printk(KERN_INFO " "); - printk(KERN_CONT "%08lx ", *stack++); + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + pr_cont("\n"); + pr_cont(" %08lx", READ_ONCE_NOCHECK(*stack)); + stack++; } - show_trace(task, esp); + printk("%sCall Trace:\n", loglvl); + dump_trace(task ?: current, &stackops, (void *)loglvl); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 117568d4f64a..b344a36b44eb 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -1,115 +1,1092 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2012-2014 Cisco Systems * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL + * Copyright (C) 2019 Intel Corporation */ #include <linux/clockchips.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/jiffies.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/spinlock.h> #include <linux/threads.h> #include <asm/irq.h> #include <asm/param.h> #include <kern_util.h> #include <os.h> +#include <linux/delay.h> +#include <linux/time-internal.h> +#include <linux/um_timetravel.h> +#include <shared/init.h> + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#include <linux/sched/clock.h> + +enum time_travel_mode time_travel_mode; +EXPORT_SYMBOL_GPL(time_travel_mode); + +static bool time_travel_start_set; +static unsigned long long time_travel_start; +static unsigned long long time_travel_time; +static unsigned long long time_travel_shm_offset; +static LIST_HEAD(time_travel_events); +static LIST_HEAD(time_travel_irqs); +static unsigned long long time_travel_timer_interval; +static unsigned long long time_travel_next_event; +static struct time_travel_event time_travel_timer_event; +static int time_travel_ext_fd = -1; +static unsigned int time_travel_ext_waiting; +static bool time_travel_ext_prev_request_valid; +static unsigned long long time_travel_ext_prev_request; +static unsigned long long *time_travel_ext_free_until; +static unsigned long long _time_travel_ext_free_until; +static u16 time_travel_shm_id; +static struct um_timetravel_schedshm *time_travel_shm; +static union um_timetravel_schedshm_client *time_travel_shm_client; + +unsigned long tt_extra_sched_jiffies; + +notrace unsigned long long sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES + + tt_extra_sched_jiffies) + * (NSEC_PER_SEC / HZ); +} + +static void time_travel_set_time(unsigned long long ns) +{ + if (unlikely(ns < time_travel_time)) 
+ panic("time-travel: time goes backwards %lld -> %lld\n", + time_travel_time, ns); + else if (unlikely(ns >= S64_MAX)) + panic("The system was going to sleep forever, aborting"); + + time_travel_time = ns; +} + +enum time_travel_message_handling { + TTMH_IDLE, + TTMH_POLL, + TTMH_READ, + TTMH_READ_START_ACK, +}; + +static u64 bc_message; +int time_travel_should_print_bc_msg; + +void _time_travel_print_bc_msg(void) +{ + time_travel_should_print_bc_msg = 0; + printk(KERN_INFO "time-travel: received broadcast 0x%llx\n", bc_message); +} + +static void time_travel_setup_shm(int fd, u16 id) +{ + u32 len; + + time_travel_shm = os_mmap_rw_shared(fd, sizeof(*time_travel_shm)); + + if (!time_travel_shm) + goto out; + + len = time_travel_shm->len; + + if (time_travel_shm->version != UM_TIMETRAVEL_SCHEDSHM_VERSION || + len < struct_size(time_travel_shm, clients, id + 1)) { + os_unmap_memory(time_travel_shm, sizeof(*time_travel_shm)); + time_travel_shm = NULL; + goto out; + } + + time_travel_shm = os_mremap_rw_shared(time_travel_shm, + sizeof(*time_travel_shm), + len); + if (!time_travel_shm) + goto out; + + time_travel_shm_offset = time_travel_shm->current_time; + time_travel_shm_client = &time_travel_shm->clients[id]; + time_travel_shm_client->capa |= UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE; + time_travel_shm_id = id; + /* always look at that free_until from now on */ + time_travel_ext_free_until = &time_travel_shm->free_until; +out: + os_close_file(fd); +} + +static void time_travel_handle_message(struct um_timetravel_msg *msg, + enum time_travel_message_handling mode) +{ + struct um_timetravel_msg resp = { + .op = UM_TIMETRAVEL_ACK, + }; + int ret; + + /* + * We can't unlock here, but interrupt signals with a timetravel_handler + * (see um_request_irq_tt) get to the timetravel_handler anyway. 
+ */ + if (mode != TTMH_READ) { + BUG_ON(mode == TTMH_IDLE && !irqs_disabled()); + + while (os_poll(1, &time_travel_ext_fd) != 0) { + /* nothing */ + } + } + + if (unlikely(mode == TTMH_READ_START_ACK)) { + int fd[UM_TIMETRAVEL_SHARED_MAX_FDS]; + + ret = os_rcv_fd_msg(time_travel_ext_fd, fd, + ARRAY_SIZE(fd), msg, sizeof(*msg)); + if (ret == sizeof(*msg)) { + time_travel_setup_shm(fd[UM_TIMETRAVEL_SHARED_MEMFD], + msg->time & UM_TIMETRAVEL_START_ACK_ID); + /* we don't use the logging for now */ + os_close_file(fd[UM_TIMETRAVEL_SHARED_LOGFD]); + } + } else { + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + } + + if (ret == 0) + panic("time-travel external link is broken\n"); + if (ret != sizeof(*msg)) + panic("invalid time-travel message - %d bytes\n", ret); + + switch (msg->op) { + default: + WARN_ONCE(1, "time-travel: unexpected message %lld\n", + (unsigned long long)msg->op); + break; + case UM_TIMETRAVEL_ACK: + return; + case UM_TIMETRAVEL_RUN: + time_travel_set_time(msg->time); + if (time_travel_shm) { + /* no request right now since we're running */ + time_travel_shm_client->flags &= + ~UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + /* no ack for shared memory RUN */ + return; + } + break; + case UM_TIMETRAVEL_FREE_UNTIL: + /* not supposed to get this with shm, but ignore it */ + if (time_travel_shm) + break; + time_travel_ext_free_until = &_time_travel_ext_free_until; + _time_travel_ext_free_until = msg->time; + break; + case UM_TIMETRAVEL_BROADCAST: + bc_message = msg->time; + time_travel_should_print_bc_msg = 1; + break; + } + + resp.seq = msg->seq; + os_write_file(time_travel_ext_fd, &resp, sizeof(resp)); +} + +static u64 time_travel_ext_req(u32 op, u64 time) +{ + static int seq; + int mseq = ++seq; + struct um_timetravel_msg msg = { + .op = op, + .time = time, + .seq = mseq, + }; + + /* + * We need to block even the timetravel handlers of SIGIO here and + * only restore their use when we got the ACK - otherwise we may + * (will) get interrupted by that, try to queue the IRQ for future + * processing and thus send another request while we're still waiting + * for an ACK, but the peer doesn't know we got interrupted and will + * send the ACKs in the same order as the message, but we'd need to + * see them in the opposite order ... + * + * This wouldn't matter *too* much, but some ACKs carry the + * current time (for UM_TIMETRAVEL_GET) and getting another + * ACK without a time would confuse us a lot! + * + * The sequence number assignment that happens here lets us + * debug such message handling issues more easily. + */ + block_signals_hard(); + os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + + /* no ACK expected for WAIT in shared memory mode */ + if (msg.op == UM_TIMETRAVEL_WAIT && time_travel_shm) + goto done; + + while (msg.op != UM_TIMETRAVEL_ACK) + time_travel_handle_message(&msg, + op == UM_TIMETRAVEL_START ? + TTMH_READ_START_ACK : + TTMH_READ); + + if (msg.seq != mseq) + panic("time-travel: ACK message has different seqno! 
op=%d, seq=%d != %d time=%lld\n", + msg.op, msg.seq, mseq, msg.time); + + if (op == UM_TIMETRAVEL_GET) + time_travel_set_time(msg.time); +done: + unblock_signals_hard(); + + return msg.time; +} + +void __time_travel_wait_readable(int fd) +{ + int fds[2] = { fd, time_travel_ext_fd }; + int ret; + + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + while ((ret = os_poll(2, fds))) { + struct um_timetravel_msg msg; + + if (ret == 1) + time_travel_handle_message(&msg, TTMH_READ); + } +} +EXPORT_SYMBOL_GPL(__time_travel_wait_readable); + +static void time_travel_ext_update_request(unsigned long long time) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + /* asked for exactly this time previously */ + if (time_travel_ext_prev_request_valid && + time == time_travel_ext_prev_request) + return; + + /* + * if we're running and are allowed to run past the request + * then we don't need to update it either + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. + */ + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) + return; + + time_travel_ext_prev_request = time; + time_travel_ext_prev_request_valid = true; + + if (time_travel_shm) { + union um_timetravel_schedshm_client *running; + + running = &time_travel_shm->clients[time_travel_shm->running_id]; + + if (running->capa & UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE) { + time_travel_shm_client->flags |= + UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + time += time_travel_shm_offset; + time_travel_shm_client->req_time = time; + if (time < time_travel_shm->free_until) + time_travel_shm->free_until = time; + return; + } + } + + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); +} + +void __time_travel_propagate_time(void) +{ + static unsigned long long last_propagated; + + if (time_travel_shm) { + if (time_travel_shm->running_id != time_travel_shm_id) + panic("time-travel: setting time while not running\n"); + time_travel_shm->current_time = time_travel_time + + time_travel_shm_offset; + return; + } + + if (last_propagated == time_travel_time) + return; + + time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time); + last_propagated = time_travel_time; +} +EXPORT_SYMBOL_GPL(__time_travel_propagate_time); + +/* returns true if we must do a wait to the simtime device */ +static bool time_travel_ext_request(unsigned long long time) +{ + /* + * If we received an external sync point ("free until") then we + * don't have to request/wait for anything until then, unless + * we're already waiting. + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. 
+ */ + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) + return false; + + time_travel_ext_update_request(time); + return true; +} + +static void time_travel_ext_wait(bool idle) +{ + struct um_timetravel_msg msg = { + .op = UM_TIMETRAVEL_ACK, + }; + + time_travel_ext_prev_request_valid = false; + if (!time_travel_shm) + time_travel_ext_free_until = NULL; + time_travel_ext_waiting++; + + time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); + + /* + * Here we are deep in the idle loop, so we have to break out of the + * kernel abstraction in a sense and implement this in terms of the + * UML system waiting on the VQ interrupt while sleeping, when we get + * the signal it'll call time_travel_ext_vq_notify_done() completing the + * call. + */ + while (msg.op != UM_TIMETRAVEL_RUN) + time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL); + + time_travel_ext_waiting--; + + /* we might request more stuff while polling - reset when we run */ + time_travel_ext_prev_request_valid = false; +} + +static void time_travel_ext_get_time(void) +{ + if (time_travel_shm) + time_travel_set_time(time_travel_shm->current_time - + time_travel_shm_offset); + else + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); +} + +static void __time_travel_update_time(unsigned long long ns, bool idle) +{ + if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns)) + time_travel_ext_wait(idle); + else + time_travel_set_time(ns); +} + +static struct time_travel_event *time_travel_first_event(void) +{ + return list_first_entry_or_null(&time_travel_events, + struct time_travel_event, + list); +} + +static void __time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + struct time_travel_event *tmp; + bool inserted = false; + unsigned long flags; + + if (e->pending) + return; + + e->pending = true; + e->time = time; + + local_irq_save(flags); + list_for_each_entry(tmp, &time_travel_events, list) { + /* + * Add the new entry before one with higher time, + * or if they're equal and both on stack, because + * in that case we need to unwind the stack in the + * right order, and the later event (timer sleep + * or such) must be dequeued first. + */ + if ((tmp->time > e->time) || + (tmp->time == e->time && tmp->onstack && e->onstack)) { + list_add_tail(&e->list, &tmp->list); + inserted = true; + break; + } + } + + if (!inserted) + list_add_tail(&e->list, &time_travel_events); + + tmp = time_travel_first_event(); + time_travel_ext_update_request(tmp->time); + time_travel_next_event = tmp->time; + local_irq_restore(flags); +} + +static void time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + if (WARN_ON(!e->fn)) + return; + + __time_travel_add_event(e, time); +} + +void time_travel_add_event_rel(struct time_travel_event *e, + unsigned long long delay_ns) +{ + time_travel_add_event(e, time_travel_time + delay_ns); +} + +static void time_travel_periodic_timer(struct time_travel_event *e) +{ + time_travel_add_event(&time_travel_timer_event, + time_travel_time + time_travel_timer_interval); + + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + + deliver_alarm(); +} + +void deliver_time_travel_irqs(void) +{ + struct time_travel_event *e; + unsigned long flags; + + /* + * Don't do anything for most cases. 
Note that because here we have + * to disable IRQs (and re-enable later) we'll actually recurse at + * the end of the function, so this is strictly necessary. + */ + if (likely(list_empty(&time_travel_irqs))) + return; + + local_irq_save(flags); + irq_enter(); + while ((e = list_first_entry_or_null(&time_travel_irqs, + struct time_travel_event, + list))) { + list_del(&e->list); + e->pending = false; + e->fn(e); + } + irq_exit(); + local_irq_restore(flags); +} + +static void time_travel_deliver_event(struct time_travel_event *e) +{ + if (e == &time_travel_timer_event) { + /* + * deliver_alarm() does the irq_enter/irq_exit + * by itself, so must handle it specially here + */ + e->fn(e); + } else if (irqs_disabled()) { + list_add_tail(&e->list, &time_travel_irqs); + /* + * set pending again, it was set to false when the + * event was deleted from the original list, but + * now it's still pending until we deliver the IRQ. + */ + e->pending = true; + } else { + unsigned long flags; + + local_irq_save(flags); + irq_enter(); + e->fn(e); + irq_exit(); + local_irq_restore(flags); + } +} + +bool time_travel_del_event(struct time_travel_event *e) +{ + unsigned long flags; + + if (!e->pending) + return false; + local_irq_save(flags); + list_del(&e->list); + e->pending = false; + local_irq_restore(flags); + return true; +} + +static void time_travel_update_time(unsigned long long next, bool idle) +{ + struct time_travel_event ne = { + .onstack = true, + }; + struct time_travel_event *e; + bool finished = idle; + + /* add it without a handler - we deal with that specifically below */ + __time_travel_add_event(&ne, next); + + do { + e = time_travel_first_event(); + + BUG_ON(!e); + __time_travel_update_time(e->time, idle); + + /* new events may have been inserted while we were waiting */ + if (e == time_travel_first_event()) { + BUG_ON(!time_travel_del_event(e)); + BUG_ON(time_travel_time != e->time); + + if (e == &ne) { + finished = true; + } else { + if (e->onstack) + panic("On-stack event dequeued outside of the stack! time=%lld, event time=%lld, event=%pS\n", + time_travel_time, e->time, e); + time_travel_deliver_event(e); + } + } + + e = time_travel_first_event(); + if (e) + time_travel_ext_update_request(e->time); + } while (ne.pending && !finished); + + time_travel_del_event(&ne); +} + +static void time_travel_update_time_rel(unsigned long long offs) +{ + unsigned long flags; + + /* + * Disable interrupts before calculating the new time so + * that a real timer interrupt (signal) can't happen at + * a bad time e.g. after we read time_travel_time but + * before we've completed updating the time. + */ + local_irq_save(flags); + time_travel_update_time(time_travel_time + offs, false); + local_irq_restore(flags); +} + +void time_travel_ndelay(unsigned long nsec) +{ + /* + * Not strictly needed to use _rel() version since this is + * only used in INFCPU/EXT modes, but it doesn't hurt and + * is more readable too. + */ + time_travel_update_time_rel(nsec); +} +EXPORT_SYMBOL(time_travel_ndelay); + +void time_travel_add_irq_event(struct time_travel_event *e) +{ + BUG_ON(time_travel_mode != TT_MODE_EXTERNAL); + + time_travel_ext_get_time(); + /* + * We could model interrupt latency here, for now just + * don't have any latency at all and request the exact + * same time (again) to run the interrupt... 
+ */ + time_travel_add_event(e, time_travel_time); +} +EXPORT_SYMBOL_GPL(time_travel_add_irq_event); + +static void time_travel_oneshot_timer(struct time_travel_event *e) +{ + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + + deliver_alarm(); +} + +void time_travel_sleep(void) +{ + /* + * Wait "forever" (using S64_MAX because there are some potential + * wrapping issues, especially with the current TT_MODE_EXTERNAL + * controller application. + */ + unsigned long long next = S64_MAX; + int cpu = raw_smp_processor_id(); + + if (time_travel_mode == TT_MODE_BASIC) + os_timer_disable(cpu); + + time_travel_update_time(next, true); + + if (time_travel_mode == TT_MODE_BASIC && + time_travel_timer_event.pending) { + if (time_travel_timer_event.fn == time_travel_periodic_timer) { + /* + * This is somewhat wrong - we should get the first + * one sooner like the os_timer_one_shot() below... + */ + os_timer_set_interval(cpu, time_travel_timer_interval); + } else { + os_timer_one_shot(cpu, time_travel_timer_event.time - next); + } + } +} + +static void time_travel_handle_real_alarm(void) +{ + time_travel_set_time(time_travel_next_event); + + time_travel_del_event(&time_travel_timer_event); + + if (time_travel_timer_event.fn == time_travel_periodic_timer) + time_travel_add_event(&time_travel_timer_event, + time_travel_time + + time_travel_timer_interval); +} + +static void time_travel_set_interval(unsigned long long interval) +{ + time_travel_timer_interval = interval; +} + +static int time_travel_connect_external(const char *socket) +{ + const char *sep; + unsigned long long id = (unsigned long long)-1; + int rc; + + if ((sep = strchr(socket, ':'))) { + char buf[25] = {}; + if (sep - socket > sizeof(buf) - 1) + goto invalid_number; + + memcpy(buf, socket, sep - socket); + if (kstrtoull(buf, 0, &id)) { +invalid_number: + panic("time-travel: invalid external ID in string '%s'\n", + socket); + return -EINVAL; + } + + socket = sep + 1; + } + + rc = os_connect_socket(socket); + if (rc < 0) { + panic("time-travel: failed to connect to external socket %s\n", + socket); + return rc; + } + + time_travel_ext_fd = rc; + + time_travel_ext_req(UM_TIMETRAVEL_START, id); + + return 1; +} + +static void time_travel_set_start(void) +{ + if (time_travel_start_set) + return; + + switch (time_travel_mode) { + case TT_MODE_EXTERNAL: + time_travel_start = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1); + /* controller gave us the *current* time, so adjust by that */ + time_travel_ext_get_time(); + time_travel_start -= time_travel_time; + break; + case TT_MODE_INFCPU: + case TT_MODE_BASIC: + if (!time_travel_start_set) + time_travel_start = os_persistent_clock_emulation(); + break; + case TT_MODE_OFF: + /* we just read the host clock with os_persistent_clock_emulation() */ + break; + } + + time_travel_start_set = true; +} +#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ +#define time_travel_start_set 0 +#define time_travel_start 0 +#define time_travel_time 0 +#define time_travel_ext_waiting 0 + +static inline void time_travel_update_time(unsigned long long ns, bool idle) +{ +} + +static inline void time_travel_update_time_rel(unsigned long long offs) +{ +} + +static inline void time_travel_handle_real_alarm(void) +{ +} + +static void time_travel_set_interval(unsigned long long interval) +{ +} + +static inline void time_travel_set_start(void) +{ +} + +/* fail link if this actually gets used */ +extern u64 time_travel_ext_req(u32 op, u64 
time); + +/* these are empty macros so the struct/fn need not exist */ +#define time_travel_add_event(e, time) do { } while (0) +/* externally not usable - redefine here so we can */ +#undef time_travel_del_event +#define time_travel_del_event(e) do { } while (0) +#endif + +static struct clock_event_device timer_clockevent[NR_CPUS]; void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { unsigned long flags; + /* + * In basic time-travel mode we still get real interrupts + * (signals) but since we don't read time from the OS, we + * must update the simulated time here to the expiry when + * we get a signal. + * This is not the case in inf-cpu mode, since there we + * never get any real signals from the OS. + */ + if (time_travel_mode == TT_MODE_BASIC) + time_travel_handle_real_alarm(); + local_irq_save(flags); do_IRQ(TIMER_IRQ, regs); local_irq_restore(flags); } -static void itimer_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int itimer_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - set_interval(); - break; + int cpu = evt - &timer_clockevent[0]; - case CLOCK_EVT_MODE_SHUTDOWN: - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_ONESHOT: - disable_timer(); - break; + if (time_travel_mode != TT_MODE_OFF) + time_travel_del_event(&time_travel_timer_event); - case CLOCK_EVT_MODE_RESUME: - break; + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + os_timer_disable(cpu); + + return 0; +} + +static int itimer_set_periodic(struct clock_event_device *evt) +{ + unsigned long long interval = NSEC_PER_SEC / HZ; + int cpu = evt - &timer_clockevent[0]; + + if (time_travel_mode != TT_MODE_OFF) { + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_periodic_timer); + time_travel_set_interval(interval); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + interval); } + + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + os_timer_set_interval(cpu, interval); + + return 0; } static int itimer_next_event(unsigned long delta, struct clock_event_device *evt) { - return timer_one_shot(delta + 1); + delta += 1; + + if (time_travel_mode != TT_MODE_OFF) { + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_oneshot_timer); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + delta); + } + + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + return os_timer_one_shot(raw_smp_processor_id(), delta); + + return 0; } -static struct clock_event_device itimer_clockevent = { - .name = "itimer", - .rating = 250, - .cpumask = cpu_all_mask, - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = itimer_set_mode, - .set_next_event = itimer_next_event, - .shift = 32, - .irq = 0, +static int itimer_one_shot(struct clock_event_device *evt) +{ + return itimer_next_event(0, evt); +} + +static struct clock_event_device _timer_clockevent = { + .name = "posix-timer", + .rating = 250, + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_shutdown = itimer_shutdown, + .set_state_periodic = itimer_set_periodic, + .set_state_oneshot = itimer_one_shot, + .set_next_event = itimer_next_event, + .shift = 0, + .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, + .min_delta_ns = TIMER_MIN_DELTA, + .min_delta_ticks = 
TIMER_MIN_DELTA, // microsecond resolution should be enough for anyone, same as 640K RAM + .irq = 0, + .mult = 1, }; static irqreturn_t um_timer(int irq, void *dev) { - (*itimer_clockevent.event_handler)(&itimer_clockevent); + int cpu = raw_smp_processor_id(); + struct clock_event_device *evt = &timer_clockevent[cpu]; + + /* + * Interrupt the (possibly) running userspace process, technically this + * should only happen if userspace is currently executing. + * With infinite CPU time-travel, we can only get here when userspace + * is not executing. Do not notify there and avoid spurious scheduling. + */ + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL && + get_current()->mm) + os_alarm_process(get_current()->mm->context.id.pid); + + evt->event_handler(evt); return IRQ_HANDLED; } -static cycle_t itimer_read(struct clocksource *cs) +static u64 timer_read(struct clocksource *cs) { - return os_nsecs() / 1000; + if (time_travel_mode != TT_MODE_OFF) { + /* + * We make reading the timer cost a bit so that we don't get + * stuck in loops that expect time to move more than the + * exact requested sleep amount, e.g. python's socket server, + * see https://bugs.python.org/issue37026. + * + * However, don't do that when we're in interrupt or such as + * then we might recurse into our own processing, and get to + * even more waiting, and that's not good - it messes up the + * "what do I do next" and onstack event we use to know when + * to return from time_travel_update_time(). + */ + if (!irqs_disabled() && !in_interrupt() && !in_softirq() && + !time_travel_ext_waiting) + time_travel_update_time_rel(TIMER_MULTIPLIER); + return time_travel_time / TIMER_MULTIPLIER; + } + + return os_nsecs() / TIMER_MULTIPLIER; } -static struct clocksource itimer_clocksource = { - .name = "itimer", +static struct clocksource timer_clocksource = { + .name = "timer", .rating = 300, - .read = itimer_read, + .read = timer_read, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void __init setup_itimer(void) +int um_setup_timer(void) { + int cpu = raw_smp_processor_id(); + struct clock_event_device *evt = &timer_clockevent[cpu]; int err; - err = request_irq(TIMER_IRQ, um_timer, 0, "timer", NULL); + err = os_timer_create(); + if (err) + return err; + + memcpy(evt, &_timer_clockevent, sizeof(*evt)); + evt->cpumask = cpumask_of(cpu); + clockevents_register_device(evt); + + return 0; +} + +static void __init um_timer_init(void) +{ + int err; + + err = request_irq(TIMER_IRQ, um_timer, IRQF_TIMER, "hr timer", NULL); if (err != 0) printk(KERN_ERR "register_timer : request_irq failed - " "errno = %d\n", -err); - itimer_clockevent.mult = div_sc(HZ, NSEC_PER_SEC, 32); - itimer_clockevent.max_delta_ns = - clockevent_delta2ns(60 * HZ, &itimer_clockevent); - itimer_clockevent.min_delta_ns = - clockevent_delta2ns(1, &itimer_clockevent); - err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC); + err = um_setup_timer(); + if (err) { + printk(KERN_ERR "creation of timer failed - errno = %d\n", -err); + return; + } + + err = clocksource_register_hz(&timer_clocksource, NSEC_PER_SEC/TIMER_MULTIPLIER); if (err) { printk(KERN_ERR "clocksource_register_hz returned %d\n", err); return; } - clockevents_register_device(&itimer_clockevent); } -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { - long long nsecs = os_nsecs(); + long long nsecs; + + time_travel_set_start(); - set_normalized_timespec(ts, nsecs / NSEC_PER_SEC, - nsecs 
% NSEC_PER_SEC); + if (time_travel_mode != TT_MODE_OFF) + nsecs = time_travel_start + time_travel_time; + else + nsecs = os_persistent_clock_emulation(); + + set_normalized_timespec64(ts, nsecs / NSEC_PER_SEC, + nsecs % NSEC_PER_SEC); } void __init time_init(void) { - timer_init(); - late_time_init = setup_itimer; + timer_set_signal_handler(); + late_time_init = um_timer_init; +} + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +unsigned long calibrate_delay_is_known(void) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) + return 1; + return 0; +} + +static int setup_time_travel(char *str) +{ + if (strcmp(str, "=inf-cpu") == 0) { + time_travel_mode = TT_MODE_INFCPU; + _timer_clockevent.name = "time-travel-timer-infcpu"; + timer_clocksource.name = "time-travel-clock"; + return 1; + } + + if (strncmp(str, "=ext:", 5) == 0) { + time_travel_mode = TT_MODE_EXTERNAL; + _timer_clockevent.name = "time-travel-timer-external"; + timer_clocksource.name = "time-travel-clock-external"; + return time_travel_connect_external(str + 5); + } + + if (!*str) { + time_travel_mode = TT_MODE_BASIC; + _timer_clockevent.name = "time-travel-timer"; + timer_clocksource.name = "time-travel-clock"; + return 1; + } + + return -EINVAL; +} + +__setup("time-travel", setup_time_travel); +__uml_help(setup_time_travel, +"time-travel\n" +" This option just enables basic time travel mode, in which the clock/timers\n" +" inside the UML instance skip forward when there's nothing to do, rather than\n" +" waiting for real time to elapse. However, instance CPU speed is limited by\n" +" the real CPU speed, so e.g. a 10ms timer will always fire after ~10ms wall\n" +" clock (but quicker when there's nothing to do).\n" +"\n" +"time-travel=inf-cpu\n" +" This enables time travel mode with infinite processing power, in which there\n" +" are no wall clock timers, and any CPU processing happens - as seen from the\n" +" guest - instantly. This can be useful for accurate simulation regardless of\n" +" debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n" +" easily lead to getting stuck (e.g. if anything in the system busy loops).\n" +"\n" +"time-travel=ext:[ID:]/path/to/socket\n" +" This enables time travel mode similar to =inf-cpu, except the system will\n" +" use the given socket to coordinate with a central scheduler, in order to\n" +" have more than one system simultaneously be on simulated time. 
The virtio\n" +" driver code in UML knows about this so you can also simulate networks and\n" +" devices using it, assuming the device has the right capabilities.\n" +" The optional ID is a 64-bit integer that's sent to the central scheduler.\n\n"); + +static int setup_time_travel_start(char *str) +{ + int err; + + err = kstrtoull(str, 0, &time_travel_start); + if (err) + return err; + + time_travel_start_set = 1; + return 1; +} + +__setup("time-travel-start=", setup_time_travel_start); +__uml_help(setup_time_travel_start, +"time-travel-start=<nanoseconds>\n" +" Configure the UML instance's wall clock to start at this value rather than\n" +" the host's wall clock at the time of UML boot.\n\n"); + +static struct kobject *bc_time_kobject; + +static ssize_t bc_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%llx", bc_message); +} + +static ssize_t bc_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) +{ + int ret; + u64 user_bc_message; + + ret = kstrtou64(buf, 0, &user_bc_message); + if (ret) + return ret; + + bc_message = user_bc_message; + + time_travel_ext_req(UM_TIMETRAVEL_BROADCAST, bc_message); + pr_info("um: time: sent broadcast message: 0x%llx\n", bc_message); + return count; +} + +static struct kobj_attribute bc_attribute = __ATTR(bc-message, 0660, bc_show, bc_store); + +static int __init um_bc_start(void) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return 0; + + bc_time_kobject = kobject_create_and_add("um-ext-time", kernel_kobj); + if (!bc_time_kobject) + return 0; + + if (sysfs_create_file(bc_time_kobject, &bc_attribute.attr)) + pr_debug("failed to create the bc file in /sys/kernel/um_time"); + + return 0; } +late_initcall(um_bc_start); +#endif diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 9472079471bb..39608cccf2c6 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -1,212 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/mm.h> #include <linux/module.h> -#include <linux/sched.h> -#include <asm/pgtable.h> +#include <linux/sched/signal.h> + #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <mem_user.h> #include <os.h> #include <skas.h> +#include <kern_util.h> -struct host_vm_change { - struct host_vm_op { - enum { NONE, MMAP, MUNMAP, MPROTECT } type; - union { - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - int fd; - __u64 offset; - } mmap; - struct { - unsigned long addr; - unsigned long len; - } munmap; - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - } mprotect; - } u; - } ops[1]; - int index; - struct mm_id *id; - void *data; - int force; -}; - -#define INIT_HVC(mm, force) \ - ((struct host_vm_change) \ - { .ops = { { .type = NONE } }, \ - .id = &mm->context.id, \ - .data = NULL, \ - .index = 0, \ - .force = force }) - -static int do_ops(struct host_vm_change *hvc, int end, - int finished) -{ - struct host_vm_op *op; - int i, ret = 0; - - for (i = 0; i < end && !ret; i++) { - op = &hvc->ops[i]; - switch (op->type) { - case MMAP: - ret = map(hvc->id, op->u.mmap.addr, op->u.mmap.len, - op->u.mmap.prot, op->u.mmap.fd, - op->u.mmap.offset, finished, &hvc->data); - break; - case MUNMAP: - ret = unmap(hvc->id, op->u.munmap.addr, - op->u.munmap.len, finished, &hvc->data); - break; - case MPROTECT: - ret = protect(hvc->id, op->u.mprotect.addr, - 
op->u.mprotect.len, op->u.mprotect.prot, - finished, &hvc->data); - break; - default: - printk(KERN_ERR "Unknown op type %d in do_ops\n", - op->type); - BUG(); - break; - } - } +struct vm_ops { + struct mm_id *mm_idp; - return ret; -} + int (*mmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset); + int (*unmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len); +}; -static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) +static int kern_map(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - __u64 offset; - struct host_vm_op *last; - int fd, ret = 0; - - fd = phys_mapping(phys, &offset); - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MMAP) && - (last->u.mmap.addr + last->u.mmap.len == virt) && - (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) && - (last->u.mmap.offset + last->u.mmap.len == offset)) { - last->u.mmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MMAP, - .u = { .mmap = { .addr = virt, - .len = len, - .prot = prot, - .fd = fd, - .offset = offset } - } }); - return ret; + /* TODO: Why is executable needed to be always set in the kernel? */ + return os_map_memory((void *)virt, phys_fd, offset, len, + prot & UM_PROT_READ, prot & UM_PROT_WRITE, + 1); } -static int add_munmap(unsigned long addr, unsigned long len, - struct host_vm_change *hvc) +static int kern_unmap(struct mm_id *mm_idp, + unsigned long virt, unsigned long len) { - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MUNMAP) && - (last->u.munmap.addr + last->u.mmap.len == addr)) { - last->u.munmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MUNMAP, - .u = { .munmap = { .addr = addr, - .len = len } } }); - return ret; + return os_unmap_memory((void *)virt, len); } -static int add_mprotect(unsigned long addr, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) +void report_enomem(void) { - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MPROTECT) && - (last->u.mprotect.addr + last->u.mprotect.len == addr) && - (last->u.mprotect.prot == prot)) { - last->u.mprotect.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MPROTECT, - .u = { .mprotect = { .addr = addr, - .len = len, - .prot = prot } } }); - return ret; + printk(KERN_ERR "UML ran out of memory on the host side! 
" + "This can happen due to a memory limitation or " + "vm.max_map_count has been reached.\n"); } -#define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1)) - static inline int update_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pte_t *pte; - int r, w, x, prot, ret = 0; + int ret = 0; pte = pte_offset_kernel(pmd, addr); do { - if ((addr >= STUB_START) && (addr < STUB_END)) + if (!pte_needsync(*pte)) continue; - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) - w = 0; - - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? UM_PROT_EXEC : 0)); - if (hvc->force || pte_newpage(*pte)) { - if (pte_present(*pte)) - ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, prot, hvc); - else - ret = add_munmap(addr, PAGE_SIZE, hvc); - } else if (pte_newprot(*pte)) - ret = add_mprotect(addr, PAGE_SIZE, prot, hvc); + if (pte_present(*pte)) { + __u64 offset; + unsigned long phys = pte_val(*pte) & PAGE_MASK; + int fd = phys_mapping(phys, &offset); + int r, w, x, prot; + + r = pte_read(*pte); + w = pte_write(*pte); + x = pte_exec(*pte); + if (!pte_young(*pte)) { + r = 0; + w = 0; + } else if (!pte_dirty(*pte)) + w = 0; + + prot = (r ? UM_PROT_READ : 0) | + (w ? UM_PROT_WRITE : 0) | + (x ? UM_PROT_EXEC : 0); + + ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, + prot, fd, offset); + } else + ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); + *pte = pte_mkuptodate(*pte); } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; @@ -214,7 +91,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, static inline int update_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pmd_t *pmd; unsigned long next; @@ -224,314 +101,125 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { - if (hvc->force || pmd_newpage(*pmd)) { - ret = add_munmap(addr, next - addr, hvc); + if (pmd_needsync(*pmd)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pmd_mkuptodate(*pmd); } } - else ret = update_pte_range(pmd, addr, next, hvc); + else ret = update_pte_range(pmd, addr, next, ops); } while (pmd++, addr = next, ((addr < end) && !ret)); return ret; } -static inline int update_pud_range(pgd_t *pgd, unsigned long addr, +static inline int update_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pud_t *pud; unsigned long next; int ret = 0; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (!pud_present(*pud)) { - if (hvc->force || pud_newpage(*pud)) { - ret = add_munmap(addr, next - addr, hvc); + if (pud_needsync(*pud)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pud_mkuptodate(*pud); } } - else ret = update_pmd_range(pud, addr, next, hvc); + else ret = update_pmd_range(pud, addr, next, ops); } while (pud++, addr = next, ((addr < end) && !ret)); return ret; } -void fix_range_common(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) +static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct vm_ops *ops) { - pgd_t *pgd; - struct host_vm_change hvc; - unsigned long addr = start_addr, next; + p4d_t *p4d; + unsigned long next; int ret = 0; - hvc = INIT_HVC(mm, force); - pgd = 
pgd_offset(mm, addr); + p4d = p4d_offset(pgd, addr); do { - next = pgd_addr_end(addr, end_addr); - if (!pgd_present(*pgd)) { - if (force || pgd_newpage(*pgd)) { - ret = add_munmap(addr, next - addr, &hvc); - pgd_mkuptodate(*pgd); + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + if (p4d_needsync(*p4d)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); + p4d_mkuptodate(*p4d); } - } - else ret = update_pud_range(pgd, addr, next, &hvc); - } while (pgd++, addr = next, ((addr < end_addr) && !ret)); - - if (!ret) - ret = do_ops(&hvc, hvc.index, 1); - - /* This is not an else because ret is modified above */ - if (ret) { - printk(KERN_ERR "fix_range_common: failed, killing current " - "process\n"); - force_sig(SIGKILL, current); - } + } else + ret = update_pud_range(p4d, addr, next, ops); + } while (p4d++, addr = next, ((addr < end) && !ret)); + return ret; } -static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) +int um_tlb_sync(struct mm_struct *mm) { - struct mm_struct *mm; pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long addr, last; - int updated = 0, err; - - mm = &init_mm; - for (addr = start; addr < end;) { - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) { - last = ADD_ROUND(addr, PGDIR_SIZE); - if (last > end) - last = end; - if (pgd_newpage(*pgd)) { - updated = 1; - err = os_unmap_memory((void *) addr, - last - addr); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - last = ADD_ROUND(addr, PUD_SIZE); - if (last > end) - last = end; - if (pud_newpage(*pud)) { - updated = 1; - err = os_unmap_memory((void *) addr, - last - addr); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - last = ADD_ROUND(addr, PMD_SIZE); - if (last > end) - last = end; - if (pmd_newpage(*pmd)) { - updated = 1; - err = os_unmap_memory((void *) addr, - last - addr); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pte = pte_offset_kernel(pmd, addr); - if (!pte_present(*pte) || pte_newpage(*pte)) { - updated = 1; - err = os_unmap_memory((void *) addr, - PAGE_SIZE); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - if (pte_present(*pte)) - map_memory(addr, - pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, 1, 1, 1); - } - else if (pte_newprot(*pte)) { - updated = 1; - os_protect_memory((void *) addr, PAGE_SIZE, 1, 1, 1); - } - addr += PAGE_SIZE; - } - return updated; -} + struct vm_ops ops; + unsigned long addr, next; + int ret = 0; -void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - struct mm_struct *mm = vma->vm_mm; - void *flush = NULL; - int r, w, x, prot, err = 0; - struct mm_id *mm_id; - - address &= PAGE_MASK; - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto kill; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto kill; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - goto kill; - - pte = pte_offset_kernel(pmd, address); - - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) { - w = 0; - } + guard(spinlock_irqsave)(&mm->context.sync_tlb_lock); - mm_id = &mm->context.id; - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? 
UM_PROT_EXEC : 0)); - if (pte_newpage(*pte)) { - if (pte_present(*pte)) { - unsigned long long offset; - int fd; + if (mm->context.sync_tlb_range_to == 0) + return 0; - fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); - err = map(mm_id, address, PAGE_SIZE, prot, fd, offset, - 1, &flush); - } - else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush); + ops.mm_idp = &mm->context.id; + if (mm == &init_mm) { + ops.mmap = kern_map; + ops.unmap = kern_unmap; + } else { + ops.mmap = map; + ops.unmap = unmap; } - else if (pte_newprot(*pte)) - err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush); - - if (err) - goto kill; - *pte = pte_mkuptodate(*pte); - - return; - -kill: - printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); - force_sig(SIGKILL, current); -} - -pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address) -{ - return pgd_offset(mm, address); -} - -pud_t *pud_offset_proc(pgd_t *pgd, unsigned long address) -{ - return pud_offset(pgd, address); -} - -pmd_t *pmd_offset_proc(pud_t *pud, unsigned long address) -{ - return pmd_offset(pud, address); -} + addr = mm->context.sync_tlb_range_from; + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); + if (!pgd_present(*pgd)) { + if (pgd_needsync(*pgd)) { + ret = ops.unmap(ops.mm_idp, addr, + next - addr); + pgd_mkuptodate(*pgd); + } + } else + ret = update_p4d_range(pgd, addr, next, &ops); + } while (pgd++, addr = next, + ((addr < mm->context.sync_tlb_range_to) && !ret)); -pte_t *pte_offset_proc(pmd_t *pmd, unsigned long address) -{ - return pte_offset_kernel(pmd, address); -} + if (ret == -ENOMEM) + report_enomem(); -pte_t *addr_pte(struct task_struct *task, unsigned long addr) -{ - pgd_t *pgd = pgd_offset(task->mm, addr); - pud_t *pud = pud_offset(pgd, addr); - pmd_t *pmd = pmd_offset(pud, addr); + mm->context.sync_tlb_range_from = 0; + mm->context.sync_tlb_range_to = 0; - return pte_offset_map(pmd, addr); + return ret; } void flush_tlb_all(void) { - flush_tlb_mm(current->mm); -} - -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_tlb_kernel_range_common(start, end); -} - -void flush_tlb_kernel_vm(void) -{ - flush_tlb_kernel_range_common(start_vm, end_vm); -} - -void __flush_tlb_one(unsigned long addr) -{ - flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE); -} - -static void fix_range(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) -{ - fix_range_common(mm, start_addr, end_addr, force); -} - -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_mm == NULL) - flush_tlb_kernel_range_common(start, end); - else fix_range(vma->vm_mm, start, end, 0); -} -EXPORT_SYMBOL(flush_tlb_range); - -void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ /* * Don't bother flushing if this address space is about to be * destroyed. 
*/ - if (atomic_read(&mm->mm_users) == 0) + if (atomic_read(&current->mm->mm_users) == 0) return; - fix_range(mm, start, end, 0); + flush_tlb_mm(current->mm); } void flush_tlb_mm(struct mm_struct *mm) { - struct vm_area_struct *vma = mm->mmap; - - while (vma != NULL) { - fix_range(mm, vma->vm_start, vma->vm_end, 0); - vma = vma->vm_next; - } -} + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); -void force_flush_all(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = mm->mmap; - - while (vma != NULL) { - fix_range(mm, vma->vm_start, vma->vm_end, 1); - vma = vma->vm_next; - } + for_each_vma(vmi, vma) + um_tlb_mark_sync(mm, vma->vm_start, vma->vm_end); } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 089f3987e273..177615820a4c 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -1,14 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/mm.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/hardirq.h> #include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/sched/debug.h> #include <asm/current.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <arch.h> #include <as-layout.h> @@ -17,7 +18,123 @@ #include <skas.h> /* - * Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by + * NOTE: UML does not have exception tables. As such, this is almost a copy + * of the code in mm/memory.c, only adjusting the logic to simply check whether + * we are coming from the kernel instead of doing an additional lookup in the + * exception table. + * We can do this simplification because we never get here if the exception was + * fixable. + */ +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (!is_user) + return false; + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + mmap_read_unlock(mm); + if (!is_user) + return false; + + return !mmap_write_lock_killable(mm); } + +/* + * Helper for page fault handling. + * + * This is kind of equivalent to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. 
+ */ +static struct vm_area_struct * +um_lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, bool is_user) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} + +/* + * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by * segv(). */ int handle_page_fault(unsigned long address, unsigned long ip, @@ -25,79 +142,68 @@ int handle_page_fault(unsigned long address, unsigned long ip, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; int err = -EFAULT; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (is_write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_DEFAULT; *code_out = SEGV_MAPERR; /* - * If the fault was during atomic operation, don't take the fault, just + * If the fault was with pagefaults disabled, don't take the fault, just * fail. */ - if (in_atomic()) + if (faulthandler_disabled()) goto out_nosemaphore; + if (is_user) + flags |= FAULT_FLAG_USER; retry: - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); + vma = um_lock_mm_and_find_vma(mm, address, is_user); if (!vma) - goto out; - else if (vma->vm_start <= address) - goto good_area; - else if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - else if (is_user && !ARCH_IS_STACKGROW(address)) - goto out; - else if (expand_stack(vma, address)) - goto out; + goto out_nosemaphore; -good_area: *code_out = SEGV_ACCERR; - if (is_write && !(vma->vm_flags & VM_WRITE)) - goto out; - - /* Don't require VM_READ|VM_EXEC for write faults! */ - if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) - goto out; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out; + flags |= FAULT_FLAG_WRITE; + } else { + /* Don't require VM_READ|VM_EXEC for write faults! 
*/ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out; + } do { - int fault; + vm_fault_t fault; - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; + } else if (fault & VM_FAULT_SIGSEGV) { + goto out; } else if (fault & VM_FAULT_SIGBUS) { err = -EACCES; goto out; } BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - - goto retry; - } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + + goto retry; } - pgd = pgd_offset(mm, address); - pud = pud_offset(pgd, address); - pmd = pmd_offset(pud, address); + pmd = pmd_off(mm, address); pte = pte_offset_kernel(pmd, address); } while (!pte_present(*pte)); err = 0; @@ -112,9 +218,9 @@ good_area: #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif - flush_tlb_page(vma, address); + out: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out_nosemaphore: return err; @@ -123,11 +229,12 @@ out_of_memory: * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); + if (!is_user) + goto out_nosemaphore; pagefault_out_of_memory(); return 0; } -EXPORT_SYMBOL(handle_page_fault); static void show_segv_info(struct uml_pt_regs *regs) { @@ -140,7 +247,7 @@ static void show_segv_info(struct uml_pt_regs *regs) if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi), (void *)UPT_IP(regs), (void *)UPT_SP(regs), @@ -152,19 +259,14 @@ static void show_segv_info(struct uml_pt_regs *regs) static void bad_segv(struct faultinfo fi, unsigned long ip) { - struct siginfo si; - - si.si_signo = SIGSEGV; - si.si_code = SEGV_ACCERR; - si.si_addr = (void __user *) FAULT_ADDRESS(fi); current->thread.arch.faultinfo = fi; - force_sig_info(SIGSEGV, &si, current); + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi)); } void fatal_sigsegv(void) { - force_sigsegv(SIGSEGV, current); - do_signal(); + force_fatal_sig(SIGSEGV); + do_signal(&current->thread.regs); /* * This is to tell gcc that we're not returning - do_signal * can, in general, return, but in this case, it's not, since @@ -173,7 +275,19 @@ void fatal_sigsegv(void) os_dump_core(); } -void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +/** + * segv_handler() - the SIGSEGV handler + * @sig: the signal number + * @unused_si: the signal info struct; unused in this handler + * @regs: the ptrace register information + * @mc: the mcontext of the signal + * + * The handler first extracts the faultinfo from the UML ptrace regs struct. + * If the userfault did not happen in an UML userspace process, bad_segv is called. + * Otherwise the signal did happen in a cloned userspace process, handle it. 
+ */ +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -182,7 +296,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) bad_segv(*fi, UPT_IP(regs)); return; } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); } /* @@ -192,26 +306,55 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * give us bad data! */ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, void *mc) { - struct siginfo si; - jmp_buf *catcher; + int si_code; int err; int is_write = FAULT_WRITE(fi); unsigned long address = FAULT_ADDRESS(fi); - if (!is_user && (address >= start_vm) && (address < end_vm)) { - flush_tlb_kernel_vm(); - return 0; + if (!is_user && regs) + current->thread.segv_regs = container_of(regs, struct pt_regs, regs); + + if (!is_user && address >= start_vm && address < end_vm) { + /* + * Kernel has pending updates from set_ptes that were not + * flushed yet. Syncing them should fix the pagefault (if not + * we'll get here again and panic). + */ + err = um_tlb_sync(&init_mm); + if (err == -ENOMEM) + report_enomem(); + if (err) + panic("Failed to sync kernel TLBs: %d", err); + goto out; + } + else if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); + } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; } else if (current->mm == NULL) { show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } + else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx", + address, ip); + } - if (SEGV_IS_FIXABLE(&fi) || SEGV_MAYBE_FIXABLE(&fi)) + if (SEGV_IS_FIXABLE(&fi)) err = handle_page_fault(address, ip, is_write, is_user, - &si.si_code); + &si_code); else { err = -EFAULT; /* @@ -222,17 +365,10 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, address = 0; } - catcher = current->thread.fault_catcher; if (!err) - return 0; - else if (catcher != NULL) { - current->thread.fault_addr = (void *) address; - UML_LONGJMP(catcher, 1); - } - else if (current->thread.fault_addr != NULL) - panic("fault_addr set but no fault catcher"); + goto out; else if (!is_user && arch_fixup(ip, regs)) - return 0; + goto out; if (!is_user) { show_regs(container_of(regs, struct pt_regs, regs)); @@ -243,27 +379,25 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, show_segv_info(regs); if (err == -EACCES) { - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_ADRERR; - si.si_addr = (void __user *)address; current->thread.arch.faultinfo = fi; - force_sig_info(SIGBUS, &si, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } else { BUG_ON(err != -EFAULT); - si.si_signo = SIGSEGV; - si.si_addr = (void __user *) address; current->thread.arch.faultinfo = fi; - force_sig_info(SIGSEGV, &si, current); + force_sig_fault(SIGSEGV, si_code, (void __user *) address); } + +out: + if (regs) + current->thread.segv_regs = NULL; + return 0; } -void 
relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) { - struct faultinfo *fi; - struct siginfo clean_si; - + int code, err; if (!UPT_IS_USER(regs)) { if (sig == SIGBUS) printk(KERN_ERR "Bus error - the host /dev/shm or /tmp " @@ -273,44 +407,24 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) arch_examine_signal(sig, regs); - memset(&clean_si, 0, sizeof(clean_si)); - clean_si.si_signo = si->si_signo; - clean_si.si_errno = si->si_errno; - clean_si.si_code = si->si_code; - switch (sig) { - case SIGILL: - case SIGFPE: - case SIGSEGV: - case SIGBUS: - case SIGTRAP: - fi = UPT_FAULTINFO(regs); - clean_si.si_addr = (void __user *) FAULT_ADDRESS(*fi); + /* Is the signal layout for the signal known? + * Signal data must be scrubbed to prevent information leaks. + */ + code = si->si_code; + err = si->si_errno; + if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) { + struct faultinfo *fi = UPT_FAULTINFO(regs); current->thread.arch.faultinfo = *fi; -#ifdef __ARCH_SI_TRAPNO - clean_si.si_trapno = si->si_trapno; -#endif - break; - default: - printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d)\n", - sig, si->si_code); + force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi)); + } else { + printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n", + sig, code, err); + force_sig(sig); } - - force_sig_info(sig, &clean_si, current); -} - -void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs) -{ - if (current->thread.fault_catcher != NULL) - UML_LONGJMP(current->thread.fault_catcher, 1); - else - relay_signal(sig, si, regs); } -void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { do_IRQ(WINCH_IRQ, regs); } - -void trap_init(void) -{ -} diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 87df5e3acc26..e2b24e1ecfa6 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -1,19 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ +#include <linux/cpu.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/ctype.h> #include <linux/module.h> +#include <linux/panic_notifier.h> #include <linux/seq_file.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/utsname.h> #include <linux/sched.h> -#include <asm/pgtable.h> +#include <linux/sched/task.h> +#include <linux/kmsg_dump.h> +#include <linux/suspend.h> +#include <linux/random.h> +#include <linux/smp-internal.h> + #include <asm/processor.h> +#include <asm/cpufeature.h> +#include <asm/sections.h> #include <asm/setup.h> +#include <asm/text-patching.h> #include <as-layout.h> #include <arch.h> #include <init.h> @@ -22,7 +34,10 @@ #include <mem_user.h> #include <os.h> -#define DEFAULT_COMMAND_LINE "root=98:0" +#include "um_arch.h" + +#define DEFAULT_COMMAND_LINE_ROOT "root=98:0" +#define DEFAULT_COMMAND_LINE_CONSOLE "console=tty0" /* Changed in add_arg and setup_arch, which run before SMP is started */ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 }; @@ -30,7 +45,7 @@ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 }; static void __init add_arg(char *arg) { if (strlen(command_line) + strlen(arg) + 1 > COMMAND_LINE_SIZE) { - printf("add_arg: Too 
many command line arguments!\n"); + os_warn("add_arg: Too many command line arguments!\n"); exit(1); } if (strlen(command_line) > 0) @@ -40,43 +55,42 @@ static void __init add_arg(char *arg) /* * These fields are initialized at boot time and not changed. - * XXX This structure is used only in the non-SMP case. Maybe this - * should be moved to smp.c. */ struct cpuinfo_um boot_cpu_data = { .loops_per_jiffy = 0, - .ipi_pipe = { -1, -1 } + .cache_alignment = L1_CACHE_BYTES, + .x86_capability = { 0 } }; -union thread_union cpu0_irqstack - __attribute__((__section__(".data..init_irqstack"))) = - { INIT_THREAD_INFO(init_task) }; +EXPORT_SYMBOL(boot_cpu_data); -unsigned long thread_saved_pc(struct task_struct *task) -{ - /* FIXME: Need to look up userspace_pid by cpu */ - return os_process_pc(userspace_pid[0]); -} /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; static int show_cpuinfo(struct seq_file *m, void *v) { - int index = 0; + int i = 0; -#ifdef CONFIG_SMP - index = (struct cpuinfo_um *) v - cpu_data; - if (!cpu_online(index)) +#if IS_ENABLED(CONFIG_SMP) + i = (uintptr_t) v - 1; + if (!cpu_online(i)) return 0; #endif - seq_printf(m, "processor\t: %d\n", index); + seq_printf(m, "processor\t: %d\n", i); seq_printf(m, "vendor_id\t: User Mode Linux\n"); seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "bogomips\t: %lu.%02lu\n\n", + seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU))); + seq_printf(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\n"); + seq_printf(m, "cache_alignment\t: %d\n", boot_cpu_data.cache_alignment); + seq_printf(m, "bogomips\t: %lu.%02lu\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100); @@ -85,7 +99,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < NR_CPUS ? 
cpu_data + *pos : NULL; + if (*pos < nr_cpu_ids) + return (void *)(uintptr_t)(*pos + 1); + return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) @@ -113,14 +129,13 @@ unsigned long uml_reserved; /* Also modified in mem_init */ unsigned long start_vm; unsigned long end_vm; -/* Set in uml_ncpus_setup */ -int ncpus = 1; - /* Set in early boot */ -static int have_root __initdata = 0; +static int have_root __initdata; +static int have_console __initdata; /* Set in uml_mem_setup and modified in linux_main */ -long long physmem_size = 32 * 1024 * 1024; +unsigned long long physmem_size = 64 * 1024 * 1024; +EXPORT_SYMBOL(physmem_size); static const char *usage_string = "User Mode Linux v%s\n" @@ -128,6 +143,7 @@ static const char *usage_string = static int __init uml_version_setup(char *line, int *add) { + /* Explicitly use printf() to show version in stdout */ printf("%s\n", init_utsname()->release); exit(0); @@ -154,42 +170,24 @@ __uml_setup("root=", uml_root_setup, " root=/dev/ubd5\n\n" ); -static int __init no_skas_debug_setup(char *line, int *add) +static int __init uml_console_setup(char *line, int *add) { - printf("'debug' is not necessary to gdb UML in skas mode - run \n"); - printf("'gdb linux'\n"); - + have_console = 1; return 0; } -__uml_setup("debug", no_skas_debug_setup, -"debug\n" -" this flag is not needed to run gdb on UML in skas mode\n\n" +__uml_setup("console=", uml_console_setup, +"console=<preferred console>\n" +" Specify the preferred console output driver\n\n" ); -#ifdef CONFIG_SMP -static int __init uml_ncpus_setup(char *line, int *add) -{ - if (!sscanf(line, "%d", &ncpus)) { - printf("Couldn't parse [%s]\n", line); - return -1; - } - - return 0; -} - -__uml_setup("ncpus=", uml_ncpus_setup, -"ncpus=<# of desired CPUs>\n" -" This tells an SMP kernel how many virtual processors to start.\n\n" -); -#endif - static int __init Usage(char *line, int *add) { const char **p; printf(usage_string, init_utsname()->release); p = &__uml_help_start; + /* Explicitly use printf() to show help in stdout */ while (p < &__uml_help_end) { printf("%s", *p); p++; @@ -233,42 +231,86 @@ static void __init uml_postsetup(void) static int panic_exit(struct notifier_block *self, unsigned long unused1, void *unused2) { + kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(1); - show_regs(&(current->thread.regs)); bust_spinlocks(0); uml_exitcode = 1; os_dump_core(); - return 0; + + return NOTIFY_DONE; } static struct notifier_block panic_exit_notifier = { - .notifier_call = panic_exit, - .next = NULL, - .priority = 0 + .notifier_call = panic_exit, + .priority = INT_MAX - 1, /* run as 2nd notifier, won't return */ }; +void uml_finishsetup(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &panic_exit_notifier); + + uml_postsetup(); + + new_thread_handler(); +} + /* Set during early boot */ +unsigned long stub_start; unsigned long task_size; EXPORT_SYMBOL(task_size); -unsigned long host_task_size; - unsigned long brk_start; -unsigned long end_iomem; -EXPORT_SYMBOL(end_iomem); #define MIN_VMALLOC (32 * 1024 * 1024) -extern char __binary_start; +static void __init parse_host_cpu_flags(char *line) +{ + int i; + for (i = 0; i < 32*NCAPINTS; i++) { + if ((x86_cap_flags[i] != NULL) && strstr(line, x86_cap_flags[i])) + set_cpu_cap(&boot_cpu_data, i); + } +} + +static void __init parse_cache_line(char *line) +{ + long res; + char *to_parse = strstr(line, ":"); + if (to_parse) { + to_parse++; + while (*to_parse != 0 && isspace(*to_parse)) { + to_parse++; + } + if (kstrtoul(to_parse, 10, 
&res) == 0 && is_power_of_2(res)) + boot_cpu_data.cache_alignment = res; + else + boot_cpu_data.cache_alignment = L1_CACHE_BYTES; + } +} -int __init linux_main(int argc, char **argv) +static unsigned long __init get_top_address(char **envp) +{ + unsigned long top_addr = (unsigned long) &top_addr; + int i; + + /* The earliest variable should be after the program name in ELF */ + for (i = 0; envp[i]; i++) { + if ((unsigned long) envp[i] > top_addr) + top_addr = (unsigned long) envp[i]; + } + + return PAGE_ALIGN(top_addr + 1); +} + +int __init linux_main(int argc, char **argv, char **envp) { unsigned long avail, diff; unsigned long virtmem_size, max_physmem; + unsigned long host_task_size; unsigned long stack; unsigned int i; int add; - char * mode; for (i = 1; i < argc; i++) { if ((i == 1) && (argv[i][0] == ' ')) @@ -279,26 +321,31 @@ int __init linux_main(int argc, char **argv) add_arg(argv[i]); } if (have_root == 0) - add_arg(DEFAULT_COMMAND_LINE); + add_arg(DEFAULT_COMMAND_LINE_ROOT); + + if (have_console == 0) + add_arg(DEFAULT_COMMAND_LINE_CONSOLE); + + host_task_size = get_top_address(envp); + /* reserve a few pages for the stubs */ + stub_start = host_task_size - STUB_SIZE; + host_task_size = stub_start; + + /* Limit TASK_SIZE to what is addressable by the page table */ + task_size = host_task_size; + if (task_size > (unsigned long long) PTRS_PER_PGD * PGDIR_SIZE) + task_size = PTRS_PER_PGD * PGDIR_SIZE; - host_task_size = os_get_top_address(); /* * TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps * out */ - task_size = host_task_size & PGDIR_MASK; + task_size = task_size & PGDIR_MASK; /* OS sanity checks that need to happen before the kernel runs */ os_early_checks(); - can_do_skas(); - - if (proc_mm && ptrace_faultinfo) - mode = "SKAS3"; - else - mode = "SKAS0"; - - printf("UML running in %s mode\n", mode); + get_host_cpu_features(parse_host_cpu_flags, parse_cache_line); brk_start = (unsigned long) sbrk(0); @@ -307,54 +354,32 @@ int __init linux_main(int argc, char **argv) * so they actually get what they asked for. 
This should * add zero for non-exec shield users */ - - diff = UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end); + diff = PAGE_ALIGN(brk_start) - PAGE_ALIGN((unsigned long) &_end); if (diff > 1024 * 1024) { - printf("Adding %ld bytes to physical memory to account for " - "exec-shield gap\n", diff); - physmem_size += UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end); + os_info("Adding %ld bytes to physical memory to account for " + "exec-shield gap\n", diff); + physmem_size += diff; } - uml_physmem = (unsigned long) &__binary_start & PAGE_MASK; + uml_physmem = (unsigned long) __binary_start & PAGE_MASK; /* Reserve up to 4M after the current brk */ uml_reserved = ROUND_4M(brk_start) + (1 << 22); setup_machinename(init_utsname()->machine); - highmem = 0; - iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; - max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; - - /* - * Zones have to begin on a 1 << MAX_ORDER page boundary, - * so this makes sure that's true for highmem - */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1); - if (physmem_size + iomem_size > max_physmem) { - highmem = physmem_size + iomem_size - max_physmem; - physmem_size -= highmem; -#ifndef CONFIG_HIGHMEM - highmem = 0; - printf("CONFIG_HIGHMEM not enabled - physical memory shrunk " - "to %Lu bytes\n", physmem_size); -#endif + physmem_size = PAGE_ALIGN(physmem_size); + max_physmem = TASK_SIZE - uml_physmem - MIN_VMALLOC; + if (physmem_size > max_physmem) { + physmem_size = max_physmem; + os_info("Physical memory size shrunk to %llu bytes\n", + physmem_size); } high_physmem = uml_physmem + physmem_size; - end_iomem = high_physmem + iomem_size; - high_memory = (void *) end_iomem; start_vm = VMALLOC_START; - setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); - if (init_maps(physmem_size, iomem_size, highmem)) { - printf("Failed to allocate mem_map for %Lu bytes of physical " - "memory and %Lu bytes of highmem\n", physmem_size, - highmem); - exit(1); - } - virtmem_size = physmem_size; stack = (unsigned long) argv; stack &= ~(1024 * 1024 - 1); @@ -364,39 +389,70 @@ int __init linux_main(int argc, char **argv) end_vm = start_vm + virtmem_size; if (virtmem_size < physmem_size) - printf("Kernel virtual memory size shrunk to %lu bytes\n", - virtmem_size); + os_info("Kernel virtual memory size shrunk to %lu bytes\n", + virtmem_size); - atomic_notifier_chain_register(&panic_notifier_list, - &panic_exit_notifier); - - uml_postsetup(); + arch_task_struct_size = sizeof(struct task_struct) + host_fp_size; - stack_protections((unsigned long) &init_thread_info); os_flush_stdout(); return start_uml(); } +int __init __weak read_initrd(void) +{ + return 0; +} + void __init setup_arch(char **cmdline_p) { + u8 rng_seed[32]; + + stack_protections((unsigned long) init_task.stack); + setup_physmem(uml_physmem, uml_reserved, physmem_size); + uml_dtb_init(); + read_initrd(); + paging_init(); - strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; setup_hostinfo(host_info, sizeof host_info); + prefill_possible_map(); + + if (os_getrandom(rng_seed, sizeof(rng_seed), 0) == sizeof(rng_seed)) { + add_bootloader_randomness(rng_seed, sizeof(rng_seed)); + memzero_explicit(rng_seed, sizeof(rng_seed)); + } } -void __init check_bugs(void) +void __init arch_cpu_finalize_init(void) { arch_check_bugs(); os_check_bugs(); } +void apply_seal_endbr(s32 *start, s32 *end) +{ +} + +void apply_retpolines(s32 *start, s32 *end) +{ +} + +void 
apply_returns(s32 *start, s32 *end) +{ +} + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } -#ifdef CONFIG_SMP +#if IS_ENABLED(CONFIG_SMP) void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) @@ -407,3 +463,90 @@ void alternatives_smp_module_del(struct module *mod) { } #endif + +void *text_poke(void *addr, const void *opcode, size_t len) +{ + /* + * In UML, the only reference to this function is in + * apply_relocate_add(), which shouldn't ever actually call this + * because UML doesn't have live patching. + */ + WARN_ON(1); + + return memcpy(addr, opcode, len); +} + +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + return text_poke(addr, opcode, len); +} + +void smp_text_poke_sync_each_cpu(void) +{ +} + +void uml_pm_wake(void) +{ + pm_system_wakeup(); +} + +#ifdef CONFIG_PM_SLEEP +static int um_suspend_valid(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} + +static int um_suspend_prepare(void) +{ + um_irqs_suspend(); + return 0; +} + +static int um_suspend_enter(suspend_state_t state) +{ + if (WARN_ON(state != PM_SUSPEND_MEM)) + return -EINVAL; + + /* + * This is identical to the idle sleep, but we've just + * (during suspend) turned off all interrupt sources + * except for the ones we want, so now we can only wake + * up on something we actually want to wake up on. All + * timing has also been suspended. + */ + um_idle_sleep(); + return 0; +} + +static void um_suspend_finish(void) +{ + um_irqs_resume(); +} + +const struct platform_suspend_ops um_suspend_ops = { + .valid = um_suspend_valid, + .prepare = um_suspend_prepare, + .enter = um_suspend_enter, + .finish = um_suspend_finish, +}; + +static int init_pm_wake_signal(void) +{ + /* + * In external time-travel mode we can't use signals to wake up + * since that would mess with the scheduling. We'll have to do + * some additional work to support wakeup on virtio devices or + * similar, perhaps implementing a fake RTC controller that can + * trigger wakeup (and request the appropriate scheduling from + * the external scheduler when going to suspend.) 
+ */ + if (time_travel_mode != TT_MODE_EXTERNAL) + register_pm_wake_signal(); + + suspend_set_ops(&um_suspend_ops); + + return 0; +} + +late_initcall(init_pm_wake_signal); +#endif diff --git a/arch/um/kernel/um_arch.h b/arch/um/kernel/um_arch.h new file mode 100644 index 000000000000..46e731ab9dfc --- /dev/null +++ b/arch/um/kernel/um_arch.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __UML_ARCH_H__ +#define __UML_ARCH_H__ + +extern void * __init uml_load_file(const char *filename, unsigned long long *size); + +#ifdef CONFIG_OF +extern void __init uml_dtb_init(void); +#else +static inline void uml_dtb_init(void) { } +#endif + +extern int __init read_initrd(void); + +#endif diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c index f6cc3bd61781..72bc60ade347 100644 --- a/arch/um/kernel/umid.c +++ b/arch/um/kernel/umid.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <asm/errno.h> @@ -9,21 +9,21 @@ #include <os.h> /* Changed by set_umid_arg */ -static int umid_inited = 0; +static int umid_inited; static int __init set_umid_arg(char *name, int *add) { int err; if (umid_inited) { - printf("umid already set\n"); + os_warn("umid already set\n"); return 0; } *add = 0; err = set_umid(name); if (err == -EEXIST) - printf("umid '%s' already in use\n", name); + os_warn("umid '%s' already in use\n", name); else if (!err) umid_inited = 1; diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 6899195602b7..a409d4b66114 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -1,4 +1,5 @@ -#include <asm-generic/vmlinux.lds.h> +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/vmlinux.lds.h> #include <asm/page.h> OUTPUT_FORMAT(ELF_FORMAT) @@ -6,6 +7,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { /* This must contain the right address - not quite the default ELF one.*/ @@ -18,10 +25,10 @@ SECTIONS __binary_start = START; . = START + SIZEOF_HEADERS; + . = ALIGN(PAGE_SIZE); _text = .; INIT_TEXT_SECTION(0) - . = ALIGN(PAGE_SIZE); .text : { @@ -29,6 +36,8 @@ SECTIONS TEXT_TEXT SCHED_TEXT LOCK_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning) @@ -68,8 +77,6 @@ SECTIONS .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.gnu.linkonce.d*) CONSTRUCTORS @@ -85,6 +92,7 @@ SECTIONS } .got : { *(.got.plt) *(.got) } + .eh_frame : { KEEP (*(.eh_frame)) } .dynamic : { *(.dynamic) } .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } @@ -104,8 +112,8 @@ SECTIONS PROVIDE (end = .); STABS_DEBUG - DWARF_DEBUG + ELF_DETAILS DISCARDS } diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S index 16e49bfa2b42..53d719c04ba9 100644 --- a/arch/um/kernel/vmlinux.lds.S +++ b/arch/um/kernel/vmlinux.lds.S @@ -1,4 +1,4 @@ - +#define RUNTIME_DISCARD_EXIT KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER); #ifdef CONFIG_LD_SCRIPT_STATIC |
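
The tlb.c changes in this patch replace the old eager fix_range_common()/flush walk with two cooperating pieces: update_pte_range() derives the host mapping protection from each PTE's permission, accessed and dirty bits, and flush_tlb_mm()/um_tlb_sync() defer the host mmap/munmap work by accumulating a pending virtual-address range in mm->context and replaying only that range later. The two standalone user-space C sketches below model those two ideas only; every name here (toy_*, TOY_PROT_*) is invented for illustration and is not the kernel's API, and the second sketch assumes um_tlb_mark_sync() simply widens a single [from, to) range, which is not visible in this hunk.

First, the protection derivation: a never-accessed page is mapped with no access so the first touch faults and the kernel can mark it young; an accessed-but-clean page is mapped read-only so the first write faults and marks it dirty.

/*
 * Toy model of how update_pte_range() derives host protection from a PTE.
 * The flag names are local to this sketch, not the kernel's.
 */
#include <stdio.h>

#define TOY_PROT_READ  (1 << 0)
#define TOY_PROT_WRITE (1 << 1)
#define TOY_PROT_EXEC  (1 << 2)

struct toy_pte {
	int read, write, exec;	/* permissions recorded in the entry */
	int young, dirty;	/* accessed / dirty software bits */
};

static int toy_host_prot(const struct toy_pte *pte)
{
	int r = pte->read, w = pte->write, x = pte->exec;

	if (!pte->young) {		/* never accessed: fault on any access */
		r = 0;
		w = 0;
	} else if (!pte->dirty) {	/* accessed but clean: fault on write */
		w = 0;
	}

	return (r ? TOY_PROT_READ : 0) |
	       (w ? TOY_PROT_WRITE : 0) |
	       (x ? TOY_PROT_EXEC : 0);
}

int main(void)
{
	struct toy_pte clean     = { .read = 1, .write = 1, .young = 1, .dirty = 0 };
	struct toy_pte dirty     = { .read = 1, .write = 1, .young = 1, .dirty = 1 };
	struct toy_pte untouched = { .read = 1, .write = 1, .young = 0, .dirty = 0 };

	printf("clean     -> %d\n", toy_host_prot(&clean));     /* READ only */
	printf("dirty     -> %d\n", toy_host_prot(&dirty));     /* READ|WRITE */
	printf("untouched -> %d\n", toy_host_prot(&untouched)); /* 0 */
	return 0;
}

Second, the deferred range sync: writers only widen a pending range, and a single later pass replays it, which is the shape of the um_tlb_mark_sync()/um_tlb_sync() pairing above (the real code keeps the range in mm->context and walks page tables under a spinlock).

/*
 * Toy model of the deferred-sync idea: accumulate one pending [from, to)
 * range, then flush it once.  Names and types are illustrative only.
 */
#include <stdio.h>

struct toy_mm {
	unsigned long sync_from;
	unsigned long sync_to;	/* 0 means "nothing pending" */
};

static void toy_mark_sync(struct toy_mm *mm, unsigned long start, unsigned long end)
{
	if (mm->sync_to == 0) {		/* first range since the last sync */
		mm->sync_from = start;
		mm->sync_to = end;
		return;
	}
	if (start < mm->sync_from)
		mm->sync_from = start;
	if (end > mm->sync_to)
		mm->sync_to = end;
}

static int toy_sync(struct toy_mm *mm, int (*flush)(unsigned long, unsigned long))
{
	int ret = 0;

	if (mm->sync_to == 0)
		return 0;		/* nothing was touched */

	ret = flush(mm->sync_from, mm->sync_to);

	/* reset unconditionally; the real code may handle errors differently */
	mm->sync_from = 0;
	mm->sync_to = 0;
	return ret;
}

static int print_flush(unsigned long from, unsigned long to)
{
	printf("flushing [0x%lx, 0x%lx)\n", from, to);
	return 0;
}

int main(void)
{
	struct toy_mm mm = { 0, 0 };

	toy_mark_sync(&mm, 0x1000, 0x3000);
	toy_mark_sync(&mm, 0x2000, 0x5000);	/* widens the pending range */
	toy_sync(&mm, print_flush);		/* one pass over [0x1000, 0x5000) */
	toy_sync(&mm, print_flush);		/* nothing pending: no output */
	return 0;
}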
