diff options
Diffstat (limited to 'arch/um')
271 files changed, 17594 insertions, 13768 deletions
diff --git a/arch/um/.gitignore b/arch/um/.gitignore index a73d3a1cc746..d69ea5b562ce 100644 --- a/arch/um/.gitignore +++ b/arch/um/.gitignore @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only kernel/config.c kernel/config.tmp kernel/vmlinux.lds +kernel/capflags.c diff --git a/arch/um/Kbuild b/arch/um/Kbuild new file mode 100644 index 000000000000..6cf0c1e5927b --- /dev/null +++ b/arch/um/Kbuild @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-y += kernel/ drivers/ os-Linux/ diff --git a/arch/um/Kconfig b/arch/um/Kconfig new file mode 100644 index 000000000000..8415d39b0d43 --- /dev/null +++ b/arch/um/Kconfig @@ -0,0 +1,303 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "UML-specific options" + +config UML + bool + default y + select ARCH_DISABLE_KASAN_INLINE if STATIC_LINK + select ARCH_NEEDS_DEFER_KASAN if STATIC_LINK + select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV + select ARCH_HAS_STRNCPY_FROM_USER + select ARCH_HAS_STRNLEN_USER + select ARCH_HAS_STRICT_KERNEL_RWX + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_KASAN if X86_64 + select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN + select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ASM_MODVERSIONS + select HAVE_UID16 + select HAVE_DEBUG_KMEMLEAK + select HAVE_DEBUG_BUGVERBOSE + select HAVE_PAGE_SIZE_4KB + select NO_DMA if !UML_DMA_EMULATION + select OF_EARLY_FLATTREE if OF + select GENERIC_IRQ_SHOW + select GENERIC_CPU_DEVICES + select GENERIC_SMP_IDLE_THREAD + select HAVE_GCC_PLUGINS + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN + select TRACE_IRQFLAGS_SUPPORT + select TTY # Needed for line.c + select HAVE_ARCH_VMAP_STACK + select HAVE_RUST + select ARCH_HAS_UBSAN + select HAVE_ARCH_TRACEHOOK + select HAVE_SYSCALL_TRACEPOINTS + select THREAD_INFO_IN_TASK + select SPARSE_IRQ + +config MMU + bool + default y + +config UML_DMA_EMULATION + bool + +config NO_IOMEM + bool "disable IOMEM" if EXPERT + depends on !INDIRECT_IOMEM + default y + +config UML_IOMEM_EMULATION + bool + select INDIRECT_IOMEM + select GENERIC_PCI_IOMAP + +config ISA + bool + +config SBUS + bool + +config LOCKDEP_SUPPORT + bool + default y + +config STACKTRACE_SUPPORT + bool + default y + select STACKTRACE + +config GENERIC_CALIBRATE_DELAY + bool + default y + +config HZ + int + default 100 + +config UML_SUBARCH_SUPPORTS_SMP + bool + +config SMP + bool "Symmetric multi-processing support" + default n + depends on UML_SUBARCH_SUPPORTS_SMP + help + This option enables UML SMP support. + + With this enabled, users can tell UML to start multiple virtual + processors. Each virtual processor is represented as a separate + host thread. + + In UML, kthreads and normal threads (when running in kernel mode) + can be scheduled and executed simultaneously on different virtual + processors. However, the userspace code of normal threads still + runs within their respective single-threaded stubs. + + That is, SMP support is available both within the kernel and + across different processes, but remains limited within threads + of the same process in userspace. + +config NR_CPUS_RANGE_BEGIN + int + default 1 if !SMP + default 2 + +config NR_CPUS_RANGE_END + int + default 1 if !SMP + default 64 + +config NR_CPUS_DEFAULT + int + default 1 if !SMP + default 2 + +config NR_CPUS + int "Maximum number of CPUs" if SMP + range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END + default NR_CPUS_DEFAULT + +source "arch/$(HEADER_ARCH)/um/Kconfig" + +config MAY_HAVE_RUNTIME_DEPS + bool + +config STATIC_LINK + bool "Force a static link" + depends on !MAY_HAVE_RUNTIME_DEPS + help + This option gives you the ability to force a static link of UML. + Normally, UML is linked as a shared binary. This is inconvenient for + use in a chroot jail. So, if you intend to run UML inside a chroot, + you probably want to say Y here. + Additionally, this option enables using higher memory spaces (up to + 2.75G) for UML. + + NOTE: This option is incompatible with some networking features which + depend on features that require being dynamically loaded (like NSS). + +config LD_SCRIPT_STATIC + bool + default y + depends on STATIC_LINK + +config LD_SCRIPT_DYN + bool + default y + depends on !LD_SCRIPT_STATIC + +config LD_SCRIPT_DYN_RPATH + bool "set rpath in the binary" if EXPERT + default y + depends on LD_SCRIPT_DYN + help + Add /lib (and /lib64 for 64-bit) to the linux binary's rpath + explicitly. + + You may need to turn this off if compiling for nix systems + that have their libraries in random /nix directories and + might otherwise unexpected use libraries from /lib or /lib64 + instead of the desired ones. + +config HOSTFS + tristate "Host filesystem" + help + While the User-Mode Linux port uses its own root file system for + booting and normal file access, this module lets the UML user + access files stored on the host. It does not require any + network connection between the Host and UML. An example use of + this might be: + + mount none /tmp/fromhost -t hostfs -o /tmp/umlshare + + where /tmp/fromhost is an empty directory inside UML and + /tmp/umlshare is a directory on the host with files the UML user + wishes to access. + + For more information, see + <http://user-mode-linux.sourceforge.net/hostfs.html>. + + If you'd like to be able to work with files stored on the host, + say Y or M here; otherwise say N. + +config MCONSOLE + bool "Management console" + depends on PROC_FS + default y + help + The user mode linux management console is a low-level interface to + the kernel, somewhat like the i386 SysRq interface. Since there is + a full-blown operating system running under every user mode linux + instance, there is much greater flexibility possible than with the + SysRq mechanism. + + If you answer 'Y' to this option, to use this feature, you need the + mconsole client (called uml_mconsole) which is present in CVS in + 2.4.5-9um and later (path /tools/mconsole), and is also in the + distribution RPM package in 2.4.6 and later. + + It is safe to say 'Y' here. + +config MAGIC_SYSRQ + bool "Magic SysRq key" + depends on MCONSOLE + help + If you say Y here, you will have some control over the system even + if the system crashes for example during kernel debugging (e.g., you + will be able to flush the buffer cache to disk, reboot the system + immediately or dump some status information). A key for each of the + possible requests is provided. + + This is the feature normally accomplished by pressing a key + while holding SysRq (Alt+PrintScreen). + + On UML, this is accomplished by sending a "sysrq" command with + mconsole, followed by the letter for the requested command. + + The keys are documented in <file:Documentation/admin-guide/sysrq.rst>. Don't say Y + unless you really know what this hack does. + +config KERNEL_STACK_ORDER + int "Kernel stack size order" + default 2 if 64BIT + range 2 10 if 64BIT + default 1 if !64BIT + help + This option determines the size of UML kernel stacks. They will + be 1 << order pages. The default is OK unless you're running Valgrind + on UML, in which case, set this to 3. + It is possible to reduce the stack to 1 for 64BIT and 0 for 32BIT on + older (pre-2017) CPUs. It is not recommended on newer CPUs due to the + increase in the size of the state which needs to be saved when handling + signals. + +config PGTABLE_LEVELS + int + default 4 if 64BIT + default 2 if !64BIT + +config UML_TIME_TRAVEL_SUPPORT + bool + prompt "Support time-travel mode (e.g. for test execution)" + # inf-cpu mode is incompatible with the benchmarking + depends on !RAID6_PQ_BENCHMARK + depends on !SMP + help + Enable this option to support time travel inside the UML instance. + + After enabling this option, two modes are accessible at runtime + (selected by the kernel command line), see the kernel's command- + line help for more details. + + It is safe to say Y, but you probably don't need this. + +config UML_MAX_USERSPACE_ITERATIONS + int + prompt "Maximum number of unscheduled userspace iterations" + default 10000 + depends on UML_TIME_TRAVEL_SUPPORT + help + In UML inf-cpu and ext time-travel mode userspace can run without being + interrupted. This will eventually overwhelm the kernel and create OOM + situations (mainly RCU not running). This setting specifies the number + of kernel/userspace switches (minor/major page fault, signal or syscall) + for the same userspace thread before the sched_clock is advanced by a + jiffie to trigger scheduling. + + Setting it to zero disables the feature. + +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0x100000000000 + help + This is the offset at which the ~16TB of shadow memory is + mapped and used by KASAN for memory debugging. This can be any + address that has at least KASAN_SHADOW_SIZE (total address space divided + by 8) amount of space so that the KASAN shadow memory does not conflict + with anything. The default is 0x100000000000, which works even if mem is + set to a large value. On low-memory systems, try 0x7fff8000, as it fits + into the immediate of most instructions, improving performance. + +endmenu + +source "arch/um/drivers/Kconfig" + +config ARCH_SUSPEND_POSSIBLE + def_bool y + depends on !SMP + +menu "Power management options" + +source "kernel/power/Kconfig" + +endmenu diff --git a/arch/um/Kconfig.char b/arch/um/Kconfig.char deleted file mode 100644 index b9d7c4276682..000000000000 --- a/arch/um/Kconfig.char +++ /dev/null @@ -1,127 +0,0 @@ -menu "UML Character Devices" - -config STDERR_CONSOLE - bool "stderr console" - default y - help - console driver which dumps all printk messages to stderr. - -config STDIO_CONSOLE - bool - default y - -config SSL - bool "Virtual serial line" - help - The User-Mode Linux environment allows you to create virtual serial - lines on the UML that are usually made to show up on the host as - ttys or ptys. - - See <http://user-mode-linux.sourceforge.net/old/input.html> for more - information and command line examples of how to use this facility. - - Unless you have a specific reason for disabling this, say Y. - -config NULL_CHAN - bool "null channel support" - help - This option enables support for attaching UML consoles and serial - lines to a device similar to /dev/null. Data written to it disappears - and there is never any data to be read. - -config PORT_CHAN - bool "port channel support" - help - This option enables support for attaching UML consoles and serial - lines to host portals. They may be accessed with 'telnet <host> - <port number>'. Any number of consoles and serial lines may be - attached to a single portal, although what UML device you get when - you telnet to that portal will be unpredictable. - It is safe to say 'Y' here. - -config PTY_CHAN - bool "pty channel support" - help - This option enables support for attaching UML consoles and serial - lines to host pseudo-terminals. Access to both traditional - pseudo-terminals (/dev/pty*) and pts pseudo-terminals are controlled - with this option. The assignment of UML devices to host devices - will be announced in the kernel message log. - It is safe to say 'Y' here. - -config TTY_CHAN - bool "tty channel support" - help - This option enables support for attaching UML consoles and serial - lines to host terminals. Access to both virtual consoles - (/dev/tty*) and the slave side of pseudo-terminals (/dev/ttyp* and - /dev/pts/*) are controlled by this option. - It is safe to say 'Y' here. - -config XTERM_CHAN - bool "xterm channel support" - help - This option enables support for attaching UML consoles and serial - lines to xterms. Each UML device so assigned will be brought up in - its own xterm. - It is safe to say 'Y' here. - -config NOCONFIG_CHAN - bool - default !(XTERM_CHAN && TTY_CHAN && PTY_CHAN && PORT_CHAN && NULL_CHAN) - -config CON_ZERO_CHAN - string "Default main console channel initialization" - default "fd:0,fd:1" - help - This is the string describing the channel to which the main console - will be attached by default. This value can be overridden from the - command line. The default value is "fd:0,fd:1", which attaches the - main console to stdin and stdout. - It is safe to leave this unchanged. - -config CON_CHAN - string "Default console channel initialization" - default "xterm" - help - This is the string describing the channel to which all consoles - except the main console will be attached by default. This value can - be overridden from the command line. The default value is "xterm", - which brings them up in xterms. - It is safe to leave this unchanged, although you may wish to change - this if you expect the UML that you build to be run in environments - which don't have X or xterm available. - -config SSL_CHAN - string "Default serial line channel initialization" - default "pty" - help - This is the string describing the channel to which the serial lines - will be attached by default. This value can be overridden from the - command line. The default value is "pty", which attaches them to - traditional pseudo-terminals. - It is safe to leave this unchanged, although you may wish to change - this if you expect the UML that you build to be run in environments - which don't have a set of /dev/pty* devices. - -config UML_SOUND - tristate "Sound support" - help - This option enables UML sound support. If enabled, it will pull in - soundcore and the UML hostaudio relay, which acts as a intermediary - between the host's dsp and mixer devices and the UML sound system. - It is safe to say 'Y' here. - -config SOUND - tristate - default UML_SOUND - -config SOUND_OSS_CORE - bool - default UML_SOUND - -config HOSTAUDIO - tristate - default UML_SOUND - -endmenu diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common deleted file mode 100644 index bceee6623b00..000000000000 --- a/arch/um/Kconfig.common +++ /dev/null @@ -1,64 +0,0 @@ -config DEFCONFIG_LIST - string - option defconfig_list - default "arch/$ARCH/defconfig" - -config UML - bool - default y - select HAVE_GENERIC_HARDIRQS - select HAVE_UID16 - select GENERIC_IRQ_SHOW - select GENERIC_CPU_DEVICES - select GENERIC_IO - select GENERIC_CLOCKEVENTS - select TTY # Needed for line.c - -config MMU - bool - default y - -config NO_IOMEM - def_bool y - -config ISA - bool - -config SBUS - bool - -config PCI - bool - -config PCMCIA - bool - -# Yet to do! -config TRACE_IRQFLAGS_SUPPORT - bool - default n - -config LOCKDEP_SUPPORT - bool - default y - -config STACKTRACE_SUPPORT - bool - default n - -config GENERIC_CALIBRATE_DELAY - bool - default y - -config GENERIC_BUG - bool - default y - depends on BUG - -config HZ - int - default 100 - -config SUBARCH - string - option env="SUBARCH" diff --git a/arch/um/Kconfig.debug b/arch/um/Kconfig.debug index 68205fd3b08c..1dfb2959c73b 100644 --- a/arch/um/Kconfig.debug +++ b/arch/um/Kconfig.debug @@ -1,6 +1,4 @@ -menu "Kernel hacking" - -source "lib/Kconfig.debug" +# SPDX-License-Identifier: GPL-2.0 config GPROF bool "Enable gprof support" @@ -18,6 +16,8 @@ config GPROF config GCOV bool "Enable gcov support" depends on DEBUG_INFO + depends on !KCOV + depends on !MODULES help This option allows developers to retrieve coverage data from a UML session. @@ -31,10 +31,8 @@ config GCOV config EARLY_PRINTK bool "Early printk" default y - ---help--- + help Write kernel log output directly to stdout. This is useful for kernel debugging when your machine crashes very early before the console code is initialized. - -endmenu diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net deleted file mode 100644 index 820a56f00332..000000000000 --- a/arch/um/Kconfig.net +++ /dev/null @@ -1,202 +0,0 @@ - -menu "UML Network Devices" - depends on NET - -# UML virtual driver -config UML_NET - bool "Virtual network device" - help - While the User-Mode port cannot directly talk to any physical - hardware devices, this choice and the following transport options - provide one or more virtual network devices through which the UML - kernels can talk to each other, the host, and with the host's help, - machines on the outside world. - - For more information, including explanations of the networking and - sample configurations, see - <http://user-mode-linux.sourceforge.net/old/networking.html>. - - If you'd like to be able to enable networking in the User-Mode - linux environment, say Y; otherwise say N. Note that you must - enable at least one of the following transport options to actually - make use of UML networking. - -config UML_NET_ETHERTAP - bool "Ethertap transport" - depends on UML_NET - help - The Ethertap User-Mode Linux network transport allows a single - running UML to exchange packets with its host over one of the - host's Ethertap devices, such as /dev/tap0. Additional running - UMLs can use additional Ethertap devices, one per running UML. - While the UML believes it's on a (multi-device, broadcast) virtual - Ethernet network, it's in fact communicating over a point-to-point - link with the host. - - To use this, your host kernel must have support for Ethertap - devices. Also, if your host kernel is 2.4.x, it must have - CONFIG_NETLINK_DEV configured as Y or M. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable Ethertap - networking. - - If you'd like to set up an IP network with the host and/or the - outside world, say Y to this, the Daemon Transport and/or the - Slip Transport. You'll need at least one of them, but may choose - more than one without conflict. If you don't need UML networking, - say N. - -config UML_NET_TUNTAP - bool "TUN/TAP transport" - depends on UML_NET - help - The UML TUN/TAP network transport allows a UML instance to exchange - packets with the host over a TUN/TAP device. This option will only - work with a 2.4 host, unless you've applied the TUN/TAP patch to - your 2.2 host kernel. - - To use this transport, your host kernel must have support for TUN/TAP - devices, either built-in or as a module. - -config UML_NET_SLIP - bool "SLIP transport" - depends on UML_NET - help - The slip User-Mode Linux network transport allows a running UML to - network with its host over a point-to-point link. Unlike Ethertap, - which can carry any Ethernet frame (and hence even non-IP packets), - the slip transport can only carry IP packets. - - To use this, your host must support slip devices. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html>. - has examples of the UML command line to use to enable slip - networking, and details of a few quirks with it. - - The Ethertap Transport is preferred over slip because of its - limitations. If you prefer slip, however, say Y here. Otherwise - choose the Multicast transport (to network multiple UMLs on - multiple hosts), Ethertap (to network with the host and the - outside world), and/or the Daemon transport (to network multiple - UMLs on a single host). You may choose more than one without - conflict. If you don't need UML networking, say N. - -config UML_NET_DAEMON - bool "Daemon transport" - depends on UML_NET - help - This User-Mode Linux network transport allows one or more running - UMLs on a single host to communicate with each other, but not to - the host. - - To use this form of networking, you'll need to run the UML - networking daemon on the host. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable Daemon - networking. - - If you'd like to set up a network with other UMLs on a single host, - say Y. If you need a network between UMLs on multiple physical - hosts, choose the Multicast Transport. To set up a network with - the host and/or other IP machines, say Y to the Ethertap or Slip - transports. You'll need at least one of them, but may choose - more than one without conflict. If you don't need UML networking, - say N. - -config UML_NET_VDE - bool "VDE transport" - depends on UML_NET - help - This User-Mode Linux network transport allows one or more running - UMLs on a single host to communicate with each other and also - with the rest of the world using Virtual Distributed Ethernet, - an improved fork of uml_switch. - - You must have libvdeplug installed in order to build the vde - transport into UML. - - To use this form of networking, you will need to run vde_switch - on the host. - - For more information, see <http://wiki.virtualsquare.org/> - That site has a good overview of what VDE is and also examples - of the UML command line to use to enable VDE networking. - - If you need UML networking with VDE, - say Y. - -config UML_NET_MCAST - bool "Multicast transport" - depends on UML_NET - help - This Multicast User-Mode Linux network transport allows multiple - UMLs (even ones running on different host machines!) to talk to - each other over a virtual ethernet network. However, it requires - at least one UML with one of the other transports to act as a - bridge if any of them need to be able to talk to their hosts or any - other IP machines. - - To use this, your host kernel(s) must support IP Multicasting. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable Multicast - networking, and notes about the security of this approach. - - If you need UMLs on multiple physical hosts to communicate as if - they shared an Ethernet network, say Y. If you need to communicate - with other IP machines, make sure you select one of the other - transports (possibly in addition to Multicast; they're not - exclusive). If you don't need to network UMLs say N to each of - the transports. - -config UML_NET_PCAP - bool "pcap transport" - depends on UML_NET - help - The pcap transport makes a pcap packet stream on the host look - like an ethernet device inside UML. This is useful for making - UML act as a network monitor for the host. You must have libcap - installed in order to build the pcap transport into UML. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable this option. - - If you intend to use UML as a network monitor for the host, say - Y here. Otherwise, say N. - -config UML_NET_SLIRP - bool "SLiRP transport" - depends on UML_NET - help - The SLiRP User-Mode Linux network transport allows a running UML - to network by invoking a program that can handle SLIP encapsulated - packets. This is commonly (but not limited to) the application - known as SLiRP, a program that can re-socket IP packets back onto - the host on which it is run. Only IP packets are supported, - unlike other network transports that can handle all Ethernet - frames. In general, slirp allows the UML the same IP connectivity - to the outside world that the host user is permitted, and unlike - other transports, SLiRP works without the need of root level - privleges, setuid binaries, or SLIP devices on the host. This - also means not every type of connection is possible, but most - situations can be accommodated with carefully crafted slirp - commands that can be passed along as part of the network device's - setup string. The effect of this transport on the UML is similar - that of a host behind a firewall that masquerades all network - connections passing through it (but is less secure). - - To use this you should first have slirp compiled somewhere - accessible on the host, and have read its documentation. If you - don't need UML networking, say N. - - Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" - -endmenu - diff --git a/arch/um/Kconfig.rest b/arch/um/Kconfig.rest deleted file mode 100644 index 567eb5fc21df..000000000000 --- a/arch/um/Kconfig.rest +++ /dev/null @@ -1,21 +0,0 @@ -source "init/Kconfig" - -source "kernel/Kconfig.freezer" - -source "arch/um/Kconfig.char" - -source "drivers/Kconfig" - -source "net/Kconfig" - -source "arch/um/Kconfig.net" - -source "fs/Kconfig" - -source "security/Kconfig" - -source "crypto/Kconfig" - -source "lib/Kconfig" - -source "arch/um/Kconfig.debug" diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um deleted file mode 100644 index a7520c90f62d..000000000000 --- a/arch/um/Kconfig.um +++ /dev/null @@ -1,157 +0,0 @@ -config STATIC_LINK - bool "Force a static link" - default n - help - This option gives you the ability to force a static link of UML. - Normally, UML is linked as a shared binary. This is inconvenient for - use in a chroot jail. So, if you intend to run UML inside a chroot, - you probably want to say Y here. - Additionally, this option enables using higher memory spaces (up to - 2.75G) for UML. - -source "mm/Kconfig" - -config LD_SCRIPT_STATIC - bool - default y - depends on STATIC_LINK - -config LD_SCRIPT_DYN - bool - default y - depends on !LD_SCRIPT_STATIC - -source "fs/Kconfig.binfmt" - -config HOSTFS - tristate "Host filesystem" - help - While the User-Mode Linux port uses its own root file system for - booting and normal file access, this module lets the UML user - access files stored on the host. It does not require any - network connection between the Host and UML. An example use of - this might be: - - mount none /tmp/fromhost -t hostfs -o /tmp/umlshare - - where /tmp/fromhost is an empty directory inside UML and - /tmp/umlshare is a directory on the host with files the UML user - wishes to access. - - For more information, see - <http://user-mode-linux.sourceforge.net/hostfs.html>. - - If you'd like to be able to work with files stored on the host, - say Y or M here; otherwise say N. - -config HPPFS - tristate "HoneyPot ProcFS" - depends on PROC_FS - help - hppfs (HoneyPot ProcFS) is a filesystem which allows UML /proc - entries to be overridden, removed, or fabricated from the host. - Its purpose is to allow a UML to appear to be a physical machine - by removing or changing anything in /proc which gives away the - identity of a UML. - - See <http://user-mode-linux.sf.net/old/hppfs.html> for more information. - - You only need this if you are setting up a UML honeypot. Otherwise, - it is safe to say 'N' here. - -config MCONSOLE - bool "Management console" - default y - help - The user mode linux management console is a low-level interface to - the kernel, somewhat like the i386 SysRq interface. Since there is - a full-blown operating system running under every user mode linux - instance, there is much greater flexibility possible than with the - SysRq mechanism. - - If you answer 'Y' to this option, to use this feature, you need the - mconsole client (called uml_mconsole) which is present in CVS in - 2.4.5-9um and later (path /tools/mconsole), and is also in the - distribution RPM package in 2.4.6 and later. - - It is safe to say 'Y' here. - -config MAGIC_SYSRQ - bool "Magic SysRq key" - depends on MCONSOLE - help - If you say Y here, you will have some control over the system even - if the system crashes for example during kernel debugging (e.g., you - will be able to flush the buffer cache to disk, reboot the system - immediately or dump some status information). A key for each of the - possible requests is provided. - - This is the feature normally accomplished by pressing a key - while holding SysRq (Alt+PrintScreen). - - On UML, this is accomplished by sending a "sysrq" command with - mconsole, followed by the letter for the requested command. - - The keys are documented in <file:Documentation/sysrq.txt>. Don't say Y - unless you really know what this hack does. - -config SMP - bool "Symmetric multi-processing support" - default n - depends on BROKEN - help - This option enables UML SMP support. - It is NOT related to having a real SMP box. Not directly, at least. - - UML implements virtual SMP by allowing as many processes to run - simultaneously on the host as there are virtual processors configured. - - Obviously, if the host is a uniprocessor, those processes will - timeshare, but, inside UML, will appear to be running simultaneously. - If the host is a multiprocessor, then UML processes may run - simultaneously, depending on the host scheduler. - - This, however, is supported only in TT mode. So, if you use the SKAS - patch on your host, switching to TT mode and enabling SMP usually - gives you worse performances. - Also, since the support for SMP has been under-developed, there could - be some bugs being exposed by enabling SMP. - - If you don't know what to do, say N. - -config NR_CPUS - int "Maximum number of CPUs (2-32)" - range 2 32 - depends on SMP - default "32" - -config HIGHMEM - bool "Highmem support" - depends on !64BIT && BROKEN - default n - help - This was used to allow UML to run with big amounts of memory. - Currently it is unstable, so if unsure say N. - - To use big amounts of memory, it is recommended enable static - linking (i.e. CONFIG_STATIC_LINK) - this should allow the - guest to use up to 2.75G of memory. - -config KERNEL_STACK_ORDER - int "Kernel stack size order" - default 1 if 64BIT - range 1 10 if 64BIT - default 0 if !64BIT - help - This option determines the size of UML kernel stacks. They will - be 1 << order pages. The default is OK unless you're running Valgrind - on UML, in which case, set this to 3. - -config MMAPPER - tristate "iomem emulation driver" - help - This driver allows a host file to be used as emulated IO memory inside - UML. - -config NO_DMA - def_bool y diff --git a/arch/um/Makefile b/arch/um/Makefile index 133f7de2a13d..721b652ffb65 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -6,28 +6,31 @@ # Licensed under the GPL # +# select defconfig based on actual architecture +ifeq ($(SUBARCH),x86) + ifeq ($(shell uname -m),x86_64) + KBUILD_DEFCONFIG := x86_64_defconfig + else + KBUILD_DEFCONFIG := i386_defconfig + endif +else + KBUILD_DEFCONFIG := $(SUBARCH)_defconfig +endif + ARCH_DIR := arch/um -OS := $(shell uname -s) # We require bash because the vmlinux link and loader script cpp use bash # features. -SHELL := /bin/bash - -filechk_gen_header = $< - -core-y += $(ARCH_DIR)/kernel/ \ - $(ARCH_DIR)/drivers/ \ - $(ARCH_DIR)/os-$(OS)/ +SHELL := bash MODE_INCLUDE += -I$(srctree)/$(ARCH_DIR)/include/shared/skas HEADER_ARCH := $(SUBARCH) -# Additional ARCH settings for x86 -ifeq ($(SUBARCH),i386) - HEADER_ARCH := x86 +ifneq ($(filter $(SUBARCH),x86 x86_64 i386),) + HEADER_ARCH := x86 endif -ifeq ($(SUBARCH),x86_64) - HEADER_ARCH := x86 + +ifdef CONFIG_64BIT KBUILD_CFLAGS += -mcmodel=large endif @@ -43,32 +46,40 @@ ARCH_INCLUDE := -I$(srctree)/$(SHARED_HEADERS) ARCH_INCLUDE += -I$(srctree)/$(HOST_DIR)/um/shared KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/um -# -Dvmap=kernel_vmap prevents anything from referencing the libpcap.o symbol so -# named - it's a common symbol in libpcap, so we get a binary which crashes. +# -Dstrrchr=kernel_strrchr (as well as the various in6addr symbols) prevents +# anything from referencing +# libc symbols with the same name, which can cause a linker error. # -# Same things for in6addr_loopback and mktime - found in libc. For these two we -# only get link-time error, luckily. +# -Dlongjmp=kernel_longjmp prevents anything from referencing the libpthread.a +# embedded copy of longjmp, same thing for setjmp. # -# These apply to USER_CFLAGS to. +# These apply to USER_CFLAGS too. KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \ - $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \ + $(ARCH_INCLUDE) $(MODE_INCLUDE) \ + -Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \ -Din6addr_loopback=kernel_in6addr_loopback \ - -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr + -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \ + -D__close_range=kernel__close_range + +KBUILD_RUSTFLAGS += -Crelocation-model=pie KBUILD_AFLAGS += $(ARCH_INCLUDE) -USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -D__KERNEL__,,\ - $(patsubst -I%,,$(KBUILD_CFLAGS)))) $(ARCH_INCLUDE) $(MODE_INCLUDE) \ - $(filter -I%,$(CFLAGS)) -D_FILE_OFFSET_BITS=64 -idirafter include +USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \ + $(ARCH_INCLUDE) $(MODE_INCLUDE) $(filter -I%,$(CFLAGS)) \ + -D_FILE_OFFSET_BITS=64 -idirafter $(srctree)/include \ + -idirafter $(objtree)/include -D__KERNEL__ -D__UM_HOST__ \ + -include $(srctree)/include/linux/compiler-version.h \ + -include $(srctree)/include/linux/kconfig.h #This will adjust *FLAGS accordingly to the platform. -include $(srctree)/$(ARCH_DIR)/Makefile-os-$(OS) +include $(srctree)/$(ARCH_DIR)/Makefile-os-Linux KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/include \ -I$(srctree)/$(HOST_DIR)/include/uapi \ - -I$(HOST_DIR)/include/generated \ - -I$(HOST_DIR)/include/generated/uapi + -I$(objtree)/$(HOST_DIR)/include/generated \ + -I$(objtree)/$(HOST_DIR)/include/generated/uapi # -Derrno=kernel_errno - This turns all kernel references to errno into # kernel_errno to separate them from the libc errno. This allows -fno-common @@ -98,20 +109,18 @@ define archhelp echo ' find in the kernel root.' endef -KBUILD_KCONFIG := $(HOST_DIR)/um/Kconfig - archheaders: - $(Q)$(MAKE) -C '$(srctree)' KBUILD_SRC= \ - ARCH=$(HEADER_ARCH) O='$(objtree)' archheaders + $(Q)$(MAKE) -f $(srctree)/Makefile ARCH=$(HEADER_ARCH) asm-generic archheaders -archprepare: include/generated/user_constants.h +archprepare: + $(Q)$(MAKE) $(build)=$(HOST_DIR)/um include/generated/user_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib +LINK-$(CONFIG_LD_SCRIPT_DYN) += -no-pie +LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ - $(call cc-option, -fno-stack-protector,) \ - $(call cc-option, -fno-stack-protector-all,) + -fno-stack-protector $(call cc-option, -fno-stack-protector-all) # Options used by linker script export LDS_START := $(START) @@ -121,38 +130,26 @@ export LDS_ELF_FORMAT := $(ELF_FORMAT) # The wrappers will select whether using "malloc" or the kernel allocator. LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc -LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt)) +# Avoid binutils 2.39+ warnings by marking the stack non-executable and +# ignorning warnings for the kallsyms sections. +LDFLAGS_EXECSTACK = -z noexecstack +ifeq ($(CONFIG_LD_IS_BFD),y) +LDFLAGS_EXECSTACK += $(call ld-option,--no-warn-rwx-segments) +endif + +LD_FLAGS_CMDLINE = $(foreach opt,$(KBUILD_LDFLAGS) $(LDFLAGS_EXECSTACK),-Wl,$(opt)) # Used by link-vmlinux.sh which has special support for um link -export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) +export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) $(CC_FLAGS_LTO) # When cleaning we don't include .config, so we don't include # TT or skas makefiles and don't clean skas_ptregs.h. CLEAN_FILES += linux x.i gmon.out +MRPROPER_FILES += $(HOST_DIR)/include/generated archclean: @find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ -o -name '*.gcov' \) -type f -print | xargs rm -f + $(Q)$(MAKE) -f $(srctree)/Makefile ARCH=$(HEADER_ARCH) clean -# Generated files - -$(HOST_DIR)/um/user-offsets.s: __headers FORCE - $(Q)$(MAKE) $(build)=$(HOST_DIR)/um $@ - -define filechk_gen-asm-offsets - (set -e; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by arch/$(ARCH)/Makefile"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"; \ - echo ""; ) -endef - -include/generated/user_constants.h: $(HOST_DIR)/um/user-offsets.s - $(call filechk,gen-asm-offsets) - -export SUBARCH USER_CFLAGS CFLAGS_NO_HARDENING OS DEV_NULL_PATH +export HEADER_ARCH SUBARCH USER_CFLAGS CFLAGS_NO_HARDENING DEV_NULL_PATH diff --git a/arch/um/Makefile-ia64 b/arch/um/Makefile-ia64 deleted file mode 100644 index f84dc23b0f6e..000000000000 --- a/arch/um/Makefile-ia64 +++ /dev/null @@ -1 +0,0 @@ -START_ADDR = 0x1000000000000000 diff --git a/arch/um/Makefile-ppc b/arch/um/Makefile-ppc deleted file mode 100644 index 66fd2003e165..000000000000 --- a/arch/um/Makefile-ppc +++ /dev/null @@ -1,9 +0,0 @@ -ifeq ($(CONFIG_HOST_2G_2G), y) -START_ADDR = 0x80000000 -else -START_ADDR = 0xc0000000 -endif -ARCH_CFLAGS = -U__powerpc__ -D__UM_PPC__ - -# The arch is ppc, but the elf32 name is powerpc -ELF_SUBARCH = powerpc diff --git a/arch/um/Makefile-skas b/arch/um/Makefile-skas index ac35de5316a6..1a27e65bcb9c 100644 --- a/arch/um/Makefile-skas +++ b/arch/um/Makefile-skas @@ -3,10 +3,15 @@ # Licensed under the GPL # -GPROF_OPT += -pg -GCOV_OPT += -fprofile-arcs -ftest-coverage +export UM_GPROF_OPT += -pg -CFLAGS-$(CONFIG_GCOV) += $(GCOV_OPT) -CFLAGS-$(CONFIG_GPROF) += $(GPROF_OPT) -LINK-$(CONFIG_GCOV) += $(GCOV_OPT) -LINK-$(CONFIG_GPROF) += $(GPROF_OPT) +ifdef CONFIG_CC_IS_CLANG +export UM_GCOV_OPT += -fprofile-instr-generate -fcoverage-mapping +else +export UM_GCOV_OPT += -fprofile-arcs -ftest-coverage +endif + +CFLAGS-$(CONFIG_GCOV) += $(UM_GCOV_OPT) +CFLAGS-$(CONFIG_GPROF) += $(UM_GPROF_OPT) +LINK-$(CONFIG_GCOV) += $(UM_GCOV_OPT) +LINK-$(CONFIG_GPROF) += $(UM_GPROF_OPT) diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig new file mode 100644 index 000000000000..29d9666eceae --- /dev/null +++ b/arch/um/configs/i386_defconfig @@ -0,0 +1,64 @@ +# CONFIG_COMPACTION is not set +CONFIG_BINFMT_MISC=m +CONFIG_HOSTFS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_KERNEL_STACK_ORDER=1 +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUPS=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_BLK_CGROUP=y +# CONFIG_PID_NS is not set +CONFIG_SYSFS_DEPRECATED=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_IOSCHED_BFQ=m +CONFIG_SSL=y +CONFIG_NULL_CHAN=y +CONFIG_PORT_CHAN=y +CONFIG_PTY_CHAN=y +CONFIG_TTY_CHAN=y +CONFIG_XTERM_CHAN=y +CONFIG_CON_CHAN="pts" +CONFIG_SSL_CHAN="pts" +CONFIG_SOUND=m +CONFIG_UML_SOUND=m +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_BLK_DEV_UBD=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_DUMMY=m +CONFIG_TUN=m +CONFIG_PPP=m +CONFIG_SLIP=m +CONFIG_LEGACY_PTY_COUNT=32 +# CONFIG_HW_RANDOM is not set +CONFIG_UML_RANDOM=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +# CONFIG_IPV6 is not set +CONFIG_EXT4_FS=y +CONFIG_QUOTA=y +CONFIG_AUTOFS_FS=m +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_NLS=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +CONFIG_DEBUG_KERNEL=y diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig new file mode 100644 index 000000000000..cf309c5406a2 --- /dev/null +++ b/arch/um/configs/x86_64_defconfig @@ -0,0 +1,64 @@ +# CONFIG_COMPACTION is not set +CONFIG_BINFMT_MISC=m +CONFIG_HOSTFS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUPS=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_BLK_CGROUP=y +# CONFIG_PID_NS is not set +CONFIG_SYSFS_DEPRECATED=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_IOSCHED_BFQ=m +CONFIG_SSL=y +CONFIG_NULL_CHAN=y +CONFIG_PORT_CHAN=y +CONFIG_PTY_CHAN=y +CONFIG_TTY_CHAN=y +CONFIG_XTERM_CHAN=y +CONFIG_CON_CHAN="pts" +CONFIG_SSL_CHAN="pts" +CONFIG_SOUND=m +CONFIG_UML_SOUND=m +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_BLK_DEV_UBD=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_DUMMY=m +CONFIG_TUN=m +CONFIG_PPP=m +CONFIG_SLIP=m +CONFIG_LEGACY_PTY_COUNT=32 +# CONFIG_HW_RANDOM is not set +CONFIG_UML_RANDOM=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +# CONFIG_IPV6 is not set +CONFIG_EXT4_FS=y +CONFIG_QUOTA=y +CONFIG_AUTOFS_FS=m +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_NLS=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +CONFIG_FRAME_WARN=1024 +CONFIG_DEBUG_KERNEL=y diff --git a/arch/um/defconfig b/arch/um/defconfig deleted file mode 100644 index 08107a795062..000000000000 --- a/arch/um/defconfig +++ /dev/null @@ -1,901 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. -# User Mode Linux/i386 3.3.0 Kernel Configuration -# -CONFIG_DEFCONFIG_LIST="arch/$ARCH/defconfig" -CONFIG_UML=y -CONFIG_MMU=y -CONFIG_NO_IOMEM=y -# CONFIG_TRACE_IRQFLAGS_SUPPORT is not set -CONFIG_LOCKDEP_SUPPORT=y -# CONFIG_STACKTRACE_SUPPORT is not set -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_CLOCKEVENTS=y -CONFIG_HZ=100 - -# -# UML-specific options -# - -# -# Host processor type and features -# -# CONFIG_M486 is not set -# CONFIG_M586 is not set -# CONFIG_M586TSC is not set -# CONFIG_M586MMX is not set -CONFIG_M686=y -# CONFIG_MPENTIUMII is not set -# CONFIG_MPENTIUMIII is not set -# CONFIG_MPENTIUMM is not set -# CONFIG_MPENTIUM4 is not set -# CONFIG_MK6 is not set -# CONFIG_MK7 is not set -# CONFIG_MK8 is not set -# CONFIG_MCRUSOE is not set -# CONFIG_MEFFICEON is not set -# CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP3D is not set -# CONFIG_MELAN is not set -# CONFIG_MGEODEGX1 is not set -# CONFIG_MGEODE_LX is not set -# CONFIG_MCYRIXIII is not set -# CONFIG_MVIAC3_2 is not set -# CONFIG_MVIAC7 is not set -# CONFIG_MCORE2 is not set -# CONFIG_MATOM is not set -# CONFIG_X86_GENERIC is not set -CONFIG_X86_INTERNODE_CACHE_SHIFT=5 -CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=5 -CONFIG_X86_XADD=y -CONFIG_X86_PPRO_FENCE=y -CONFIG_X86_WP_WORKS_OK=y -CONFIG_X86_INVLPG=y -CONFIG_X86_BSWAP=y -CONFIG_X86_POPAD_OK=y -CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_TSC=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=5 -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_CYRIX_32=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_CPU_SUP_TRANSMETA_32=y -CONFIG_CPU_SUP_UMC_32=y -CONFIG_UML_X86=y -# CONFIG_64BIT is not set -CONFIG_X86_32=y -# CONFIG_X86_64 is not set -# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set -CONFIG_RWSEM_GENERIC_SPINLOCK=y -# CONFIG_3_LEVEL_PGTABLES is not set -CONFIG_ARCH_HAS_SC_SIGNALS=y -CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA=y -CONFIG_GENERIC_HWEIGHT=y -# CONFIG_STATIC_LINK is not set -CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_FLATMEM_MANUAL=y -CONFIG_FLATMEM=y -CONFIG_FLAT_NODE_MEM_MAP=y -CONFIG_PAGEFLAGS_EXTENDED=y -CONFIG_SPLIT_PTLOCK_CPUS=4 -# CONFIG_COMPACTION is not set -# CONFIG_PHYS_ADDR_T_64BIT is not set -CONFIG_ZONE_DMA_FLAG=0 -CONFIG_VIRT_TO_BUS=y -# CONFIG_KSM is not set -CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 -CONFIG_NEED_PER_CPU_KM=y -# CONFIG_CLEANCACHE is not set -CONFIG_TICK_ONESHOT=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_GENERIC_CLOCKEVENTS_BUILD=y -CONFIG_LD_SCRIPT_DYN=y -CONFIG_BINFMT_ELF=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y -CONFIG_HAVE_AOUT=y -# CONFIG_BINFMT_AOUT is not set -CONFIG_BINFMT_MISC=m -CONFIG_HOSTFS=y -# CONFIG_HPPFS is not set -CONFIG_MCONSOLE=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_KERNEL_STACK_ORDER=0 -# CONFIG_MMAPPER is not set -CONFIG_NO_DMA=y - -# -# General setup -# -CONFIG_EXPERIMENTAL=y -CONFIG_BROKEN_ON_SMP=y -CONFIG_INIT_ENV_ARG_LIMIT=128 -CONFIG_CROSS_COMPILE="" -CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y -CONFIG_DEFAULT_HOSTNAME="(none)" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_BSD_PROCESS_ACCT=y -# CONFIG_BSD_PROCESS_ACCT_V3 is not set -# CONFIG_FHANDLE is not set -# CONFIG_TASKSTATS is not set -# CONFIG_AUDIT is not set -CONFIG_HAVE_GENERIC_HARDIRQS=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_HARDIRQS=y -CONFIG_GENERIC_IRQ_SHOW=y - -# -# RCU Subsystem -# -CONFIG_TINY_RCU=y -# CONFIG_PREEMPT_RCU is not set -# CONFIG_RCU_TRACE is not set -# CONFIG_TREE_RCU_TRACE is not set -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_LOG_BUF_SHIFT=14 -CONFIG_CGROUPS=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CPUSETS=y -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEMCG=y -CONFIG_CGROUP_MEMCG_SWAP=y -# CONFIG_CGROUP_MEMCG_SWAP_ENABLED is not set -# CONFIG_CGROUP_MEMCG_KMEM is not set -CONFIG_CGROUP_SCHED=y -CONFIG_FAIR_GROUP_SCHED=y -# CONFIG_CFS_BANDWIDTH is not set -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_BLK_CGROUP=y -# CONFIG_DEBUG_BLK_CGROUP is not set -# CONFIG_CHECKPOINT_RESTORE is not set -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_IPC_NS=y -# CONFIG_USER_NS is not set -# CONFIG_PID_NS is not set -CONFIG_NET_NS=y -# CONFIG_SCHED_AUTOGROUP is not set -CONFIG_MM_OWNER=y -CONFIG_SYSFS_DEPRECATED=y -# CONFIG_SYSFS_DEPRECATED_V2 is not set -# CONFIG_RELAY is not set -# CONFIG_BLK_DEV_INITRD is not set -CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL=y -CONFIG_ANON_INODES=y -# CONFIG_EXPERT is not set -CONFIG_UID16=y -# CONFIG_SYSCTL_SYSCALL is not set -CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_ALL is not set -CONFIG_HOTPLUG=y -CONFIG_PRINTK=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_BASE_FULL=y -CONFIG_FUTEX=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_SHMEM=y -CONFIG_AIO=y -# CONFIG_EMBEDDED is not set - -# -# Kernel Performance Events And Counters -# -CONFIG_VM_EVENT_COUNTERS=y -CONFIG_COMPAT_BRK=y -CONFIG_SLAB=y -# CONFIG_SLUB is not set -# CONFIG_PROFILING is not set - -# -# GCOV-based kernel profiling -# -# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set -CONFIG_SLABINFO=y -CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 -CONFIG_MODULES=y -# CONFIG_MODULE_FORCE_LOAD is not set -CONFIG_MODULE_UNLOAD=y -# CONFIG_MODULE_FORCE_UNLOAD is not set -# CONFIG_MODVERSIONS is not set -# CONFIG_MODULE_SRCVERSION_ALL is not set -CONFIG_BLOCK=y -CONFIG_LBDAF=y -# CONFIG_BLK_DEV_BSG is not set -# CONFIG_BLK_DEV_BSGLIB is not set -# CONFIG_BLK_DEV_INTEGRITY is not set - -# -# Partition Types -# -# CONFIG_PARTITION_ADVANCED is not set -CONFIG_MSDOS_PARTITION=y - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=m -# CONFIG_CFQ_GROUP_IOSCHED is not set -CONFIG_DEFAULT_DEADLINE=y -# CONFIG_DEFAULT_CFQ is not set -# CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="deadline" -# CONFIG_INLINE_SPIN_TRYLOCK is not set -# CONFIG_INLINE_SPIN_TRYLOCK_BH is not set -# CONFIG_INLINE_SPIN_LOCK is not set -# CONFIG_INLINE_SPIN_LOCK_BH is not set -# CONFIG_INLINE_SPIN_LOCK_IRQ is not set -# CONFIG_INLINE_SPIN_LOCK_IRQSAVE is not set -CONFIG_INLINE_SPIN_UNLOCK=y -# CONFIG_INLINE_SPIN_UNLOCK_BH is not set -CONFIG_INLINE_SPIN_UNLOCK_IRQ=y -# CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE is not set -# CONFIG_INLINE_READ_TRYLOCK is not set -# CONFIG_INLINE_READ_LOCK is not set -# CONFIG_INLINE_READ_LOCK_BH is not set -# CONFIG_INLINE_READ_LOCK_IRQ is not set -# CONFIG_INLINE_READ_LOCK_IRQSAVE is not set -CONFIG_INLINE_READ_UNLOCK=y -# CONFIG_INLINE_READ_UNLOCK_BH is not set -CONFIG_INLINE_READ_UNLOCK_IRQ=y -# CONFIG_INLINE_READ_UNLOCK_IRQRESTORE is not set -# CONFIG_INLINE_WRITE_TRYLOCK is not set -# CONFIG_INLINE_WRITE_LOCK is not set -# CONFIG_INLINE_WRITE_LOCK_BH is not set -# CONFIG_INLINE_WRITE_LOCK_IRQ is not set -# CONFIG_INLINE_WRITE_LOCK_IRQSAVE is not set -CONFIG_INLINE_WRITE_UNLOCK=y -# CONFIG_INLINE_WRITE_UNLOCK_BH is not set -CONFIG_INLINE_WRITE_UNLOCK_IRQ=y -# CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE is not set -# CONFIG_MUTEX_SPIN_ON_OWNER is not set -CONFIG_FREEZER=y - -# -# UML Character Devices -# -CONFIG_STDERR_CONSOLE=y -CONFIG_STDIO_CONSOLE=y -CONFIG_SSL=y -CONFIG_NULL_CHAN=y -CONFIG_PORT_CHAN=y -CONFIG_PTY_CHAN=y -CONFIG_TTY_CHAN=y -CONFIG_XTERM_CHAN=y -# CONFIG_NOCONFIG_CHAN is not set -CONFIG_CON_ZERO_CHAN="fd:0,fd:1" -CONFIG_CON_CHAN="xterm" -CONFIG_SSL_CHAN="pts" -CONFIG_UML_SOUND=m -CONFIG_SOUND=m -CONFIG_SOUND_OSS_CORE=y -CONFIG_HOSTAUDIO=m - -# -# Device Drivers -# - -# -# Generic Driver Options -# -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y -CONFIG_FW_LOADER=y -CONFIG_FIRMWARE_IN_KERNEL=y -CONFIG_EXTRA_FIRMWARE="" -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_SYS_HYPERVISOR is not set -CONFIG_GENERIC_CPU_DEVICES=y -# CONFIG_DMA_SHARED_BUFFER is not set -# CONFIG_CONNECTOR is not set -# CONFIG_MTD is not set -CONFIG_BLK_DEV=y -CONFIG_BLK_DEV_UBD=y -# CONFIG_BLK_DEV_UBD_SYNC is not set -CONFIG_BLK_DEV_COW_COMMON=y -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -# CONFIG_BLK_DEV_CRYPTOLOOP is not set - -# -# DRBD disabled because PROC_FS, INET or CONNECTOR not selected -# -CONFIG_BLK_DEV_NBD=m -# CONFIG_BLK_DEV_RAM is not set -# CONFIG_ATA_OVER_ETH is not set -# CONFIG_BLK_DEV_RBD is not set - -# -# Misc devices -# -# CONFIG_ENCLOSURE_SERVICES is not set -# CONFIG_C2PORT is not set - -# -# EEPROM support -# -# CONFIG_EEPROM_93CX6 is not set - -# -# Texas Instruments shared transport line discipline -# - -# -# Altera FPGA firmware download module -# - -# -# SCSI device support -# -CONFIG_SCSI_MOD=y -# CONFIG_RAID_ATTRS is not set -# CONFIG_SCSI is not set -# CONFIG_SCSI_DMA is not set -# CONFIG_SCSI_NETLINK is not set -# CONFIG_MD is not set -CONFIG_NETDEVICES=y -CONFIG_NET_CORE=y -# CONFIG_BONDING is not set -CONFIG_DUMMY=m -# CONFIG_EQUALIZER is not set -# CONFIG_MII is not set -# CONFIG_NET_TEAM is not set -# CONFIG_MACVLAN is not set -# CONFIG_NETCONSOLE is not set -# CONFIG_NETPOLL is not set -# CONFIG_NET_POLL_CONTROLLER is not set -CONFIG_TUN=m -# CONFIG_VETH is not set - -# -# CAIF transport drivers -# -CONFIG_ETHERNET=y -CONFIG_NET_VENDOR_CHELSIO=y -CONFIG_NET_VENDOR_INTEL=y -CONFIG_NET_VENDOR_I825XX=y -CONFIG_NET_VENDOR_MARVELL=y -CONFIG_NET_VENDOR_NATSEMI=y -CONFIG_NET_VENDOR_8390=y -# CONFIG_PHYLIB is not set -CONFIG_PPP=m -# CONFIG_PPP_BSDCOMP is not set -# CONFIG_PPP_DEFLATE is not set -# CONFIG_PPP_FILTER is not set -# CONFIG_PPP_MPPE is not set -# CONFIG_PPP_MULTILINK is not set -# CONFIG_PPPOE is not set -# CONFIG_PPP_ASYNC is not set -# CONFIG_PPP_SYNC_TTY is not set -CONFIG_SLIP=m -CONFIG_SLHC=m -# CONFIG_SLIP_COMPRESSED is not set -# CONFIG_SLIP_SMART is not set -# CONFIG_SLIP_MODE_SLIP6 is not set -CONFIG_WLAN=y -# CONFIG_HOSTAP is not set - -# -# Enable WiMAX (Networking options) to see the WiMAX drivers -# -# CONFIG_WAN is not set - -# -# Character devices -# -CONFIG_UNIX98_PTYS=y -# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set -CONFIG_LEGACY_PTYS=y -CONFIG_LEGACY_PTY_COUNT=32 -# CONFIG_N_GSM is not set -# CONFIG_TRACE_SINK is not set -CONFIG_DEVKMEM=y -# CONFIG_HW_RANDOM is not set -CONFIG_UML_RANDOM=y -# CONFIG_R3964 is not set -# CONFIG_NSC_GPIO is not set -# CONFIG_RAW_DRIVER is not set - -# -# PPS support -# -# CONFIG_PPS is not set - -# -# PPS generators support -# - -# -# PTP clock support -# - -# -# Enable Device Drivers -> PPS to see the PTP clock options. -# -# CONFIG_POWER_SUPPLY is not set -# CONFIG_THERMAL is not set -# CONFIG_WATCHDOG is not set -# CONFIG_REGULATOR is not set -CONFIG_SOUND_OSS_CORE_PRECLAIM=y -# CONFIG_MEMSTICK is not set -# CONFIG_NEW_LEDS is not set -# CONFIG_ACCESSIBILITY is not set -# CONFIG_AUXDISPLAY is not set -# CONFIG_UIO is not set - -# -# Virtio drivers -# -# CONFIG_VIRTIO_BALLOON is not set - -# -# Microsoft Hyper-V guest support -# -# CONFIG_STAGING is not set - -# -# Hardware Spinlock drivers -# -CONFIG_IOMMU_SUPPORT=y -# CONFIG_VIRT_DRIVERS is not set -# CONFIG_PM_DEVFREQ is not set -CONFIG_NET=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_UNIX=y -# CONFIG_UNIX_DIAG is not set -CONFIG_XFRM=y -# CONFIG_XFRM_USER is not set -# CONFIG_XFRM_SUB_POLICY is not set -# CONFIG_XFRM_MIGRATE is not set -# CONFIG_XFRM_STATISTICS is not set -# CONFIG_NET_KEY is not set -CONFIG_INET=y -# CONFIG_IP_MULTICAST is not set -# CONFIG_IP_ADVANCED_ROUTER is not set -# CONFIG_IP_PNP is not set -# CONFIG_NET_IPIP is not set -# CONFIG_NET_IPGRE_DEMUX is not set -# CONFIG_ARPD is not set -# CONFIG_SYN_COOKIES is not set -# CONFIG_INET_AH is not set -# CONFIG_INET_ESP is not set -# CONFIG_INET_IPCOMP is not set -# CONFIG_INET_XFRM_TUNNEL is not set -# CONFIG_INET_TUNNEL is not set -CONFIG_INET_XFRM_MODE_TRANSPORT=y -CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_INET_XFRM_MODE_BEET=y -# CONFIG_INET_LRO is not set -CONFIG_INET_DIAG=y -CONFIG_INET_TCP_DIAG=y -# CONFIG_INET_UDP_DIAG is not set -# CONFIG_TCP_CONG_ADVANCED is not set -CONFIG_TCP_CONG_CUBIC=y -CONFIG_DEFAULT_TCP_CONG="cubic" -# CONFIG_TCP_MD5SIG is not set -# CONFIG_IPV6 is not set -# CONFIG_NETWORK_SECMARK is not set -# CONFIG_NETWORK_PHY_TIMESTAMPING is not set -# CONFIG_NETFILTER is not set -# CONFIG_IP_DCCP is not set -# CONFIG_IP_SCTP is not set -# CONFIG_RDS is not set -# CONFIG_TIPC is not set -# CONFIG_ATM is not set -# CONFIG_L2TP is not set -# CONFIG_BRIDGE is not set -# CONFIG_NET_DSA is not set -# CONFIG_VLAN_8021Q is not set -# CONFIG_DECNET is not set -# CONFIG_LLC2 is not set -# CONFIG_IPX is not set -# CONFIG_ATALK is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_ECONET is not set -# CONFIG_WAN_ROUTER is not set -# CONFIG_PHONET is not set -# CONFIG_IEEE802154 is not set -# CONFIG_NET_SCHED is not set -# CONFIG_DCB is not set -# CONFIG_BATMAN_ADV is not set -# CONFIG_OPENVSWITCH is not set -# CONFIG_NETPRIO_CGROUP is not set -CONFIG_BQL=y - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set -# CONFIG_HAMRADIO is not set -# CONFIG_CAN is not set -# CONFIG_IRDA is not set -# CONFIG_BT is not set -# CONFIG_AF_RXRPC is not set -CONFIG_WIRELESS=y -# CONFIG_CFG80211 is not set -# CONFIG_LIB80211 is not set - -# -# CFG80211 needs to be enabled for MAC80211 -# -# CONFIG_WIMAX is not set -# CONFIG_RFKILL is not set -# CONFIG_NET_9P is not set -# CONFIG_CAIF is not set -# CONFIG_CEPH_LIB is not set -# CONFIG_NFC is not set - -# -# UML Network Devices -# -CONFIG_UML_NET=y -CONFIG_UML_NET_ETHERTAP=y -CONFIG_UML_NET_TUNTAP=y -CONFIG_UML_NET_SLIP=y -CONFIG_UML_NET_DAEMON=y -# CONFIG_UML_NET_VDE is not set -CONFIG_UML_NET_MCAST=y -# CONFIG_UML_NET_PCAP is not set -CONFIG_UML_NET_SLIRP=y - -# -# File systems -# -# CONFIG_EXT2_FS is not set -# CONFIG_EXT3_FS is not set -CONFIG_EXT4_FS=y -CONFIG_EXT4_USE_FOR_EXT23=y -CONFIG_EXT4_FS_XATTR=y -# CONFIG_EXT4_FS_POSIX_ACL is not set -# CONFIG_EXT4_FS_SECURITY is not set -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=y -CONFIG_FS_MBCACHE=y -CONFIG_REISERFS_FS=y -# CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set -# CONFIG_REISERFS_FS_XATTR is not set -# CONFIG_JFS_FS is not set -# CONFIG_XFS_FS is not set -# CONFIG_GFS2_FS is not set -# CONFIG_BTRFS_FS is not set -# CONFIG_NILFS2_FS is not set -# CONFIG_FS_POSIX_ACL is not set -CONFIG_FILE_LOCKING=y -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -# CONFIG_FANOTIFY is not set -CONFIG_QUOTA=y -# CONFIG_QUOTA_NETLINK_INTERFACE is not set -CONFIG_PRINT_QUOTA_WARNING=y -# CONFIG_QUOTA_DEBUG is not set -# CONFIG_QFMT_V1 is not set -# CONFIG_QFMT_V2 is not set -CONFIG_QUOTACTL=y -CONFIG_AUTOFS4_FS=m -# CONFIG_FUSE_FS is not set - -# -# Caches -# -# CONFIG_FSCACHE is not set - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -# CONFIG_ZISOFS is not set -# CONFIG_UDF_FS is not set - -# -# DOS/FAT/NT Filesystems -# -# CONFIG_MSDOS_FS is not set -# CONFIG_VFAT_FS is not set -# CONFIG_NTFS_FS is not set - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -# CONFIG_TMPFS_POSIX_ACL is not set -# CONFIG_TMPFS_XATTR is not set -# CONFIG_HUGETLB_PAGE is not set -# CONFIG_CONFIGFS_FS is not set -CONFIG_MISC_FILESYSTEMS=y -# CONFIG_ADFS_FS is not set -# CONFIG_AFFS_FS is not set -# CONFIG_HFS_FS is not set -# CONFIG_HFSPLUS_FS is not set -# CONFIG_BEFS_FS is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -# CONFIG_LOGFS is not set -# CONFIG_CRAMFS is not set -# CONFIG_SQUASHFS is not set -# CONFIG_VXFS_FS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_OMFS_FS is not set -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_ROMFS_FS is not set -# CONFIG_PSTORE is not set -# CONFIG_SYSV_FS is not set -# CONFIG_UFS_FS is not set -CONFIG_NETWORK_FILESYSTEMS=y -# CONFIG_NFS_FS is not set -# CONFIG_NFSD is not set -# CONFIG_CEPH_FS is not set -# CONFIG_CIFS is not set -# CONFIG_NCP_FS is not set -# CONFIG_CODA_FS is not set -# CONFIG_AFS_FS is not set -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="iso8859-1" -# CONFIG_NLS_CODEPAGE_437 is not set -# CONFIG_NLS_CODEPAGE_737 is not set -# CONFIG_NLS_CODEPAGE_775 is not set -# CONFIG_NLS_CODEPAGE_850 is not set -# CONFIG_NLS_CODEPAGE_852 is not set -# CONFIG_NLS_CODEPAGE_855 is not set -# CONFIG_NLS_CODEPAGE_857 is not set -# CONFIG_NLS_CODEPAGE_860 is not set -# CONFIG_NLS_CODEPAGE_861 is not set -# CONFIG_NLS_CODEPAGE_862 is not set -# CONFIG_NLS_CODEPAGE_863 is not set -# CONFIG_NLS_CODEPAGE_864 is not set -# CONFIG_NLS_CODEPAGE_865 is not set -# CONFIG_NLS_CODEPAGE_866 is not set -# CONFIG_NLS_CODEPAGE_869 is not set -# CONFIG_NLS_CODEPAGE_936 is not set -# CONFIG_NLS_CODEPAGE_950 is not set -# CONFIG_NLS_CODEPAGE_932 is not set -# CONFIG_NLS_CODEPAGE_949 is not set -# CONFIG_NLS_CODEPAGE_874 is not set -# CONFIG_NLS_ISO8859_8 is not set -# CONFIG_NLS_CODEPAGE_1250 is not set -# CONFIG_NLS_CODEPAGE_1251 is not set -# CONFIG_NLS_ASCII is not set -# CONFIG_NLS_ISO8859_1 is not set -# CONFIG_NLS_ISO8859_2 is not set -# CONFIG_NLS_ISO8859_3 is not set -# CONFIG_NLS_ISO8859_4 is not set -# CONFIG_NLS_ISO8859_5 is not set -# CONFIG_NLS_ISO8859_6 is not set -# CONFIG_NLS_ISO8859_7 is not set -# CONFIG_NLS_ISO8859_9 is not set -# CONFIG_NLS_ISO8859_13 is not set -# CONFIG_NLS_ISO8859_14 is not set -# CONFIG_NLS_ISO8859_15 is not set -# CONFIG_NLS_KOI8_R is not set -# CONFIG_NLS_KOI8_U is not set -# CONFIG_NLS_UTF8 is not set - -# -# Security options -# -# CONFIG_KEYS is not set -# CONFIG_SECURITY_DMESG_RESTRICT is not set -# CONFIG_SECURITY is not set -# CONFIG_SECURITYFS is not set -CONFIG_DEFAULT_SECURITY_DAC=y -CONFIG_DEFAULT_SECURITY="" -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -# CONFIG_CRYPTO_FIPS is not set -CONFIG_CRYPTO_ALGAPI=m -CONFIG_CRYPTO_ALGAPI2=m -CONFIG_CRYPTO_RNG=m -CONFIG_CRYPTO_RNG2=m -# CONFIG_CRYPTO_MANAGER is not set -# CONFIG_CRYPTO_MANAGER2 is not set -# CONFIG_CRYPTO_USER is not set -# CONFIG_CRYPTO_GF128MUL is not set -# CONFIG_CRYPTO_NULL is not set -# CONFIG_CRYPTO_CRYPTD is not set -# CONFIG_CRYPTO_AUTHENC is not set -# CONFIG_CRYPTO_TEST is not set - -# -# Authenticated Encryption with Associated Data -# -# CONFIG_CRYPTO_CCM is not set -# CONFIG_CRYPTO_GCM is not set -# CONFIG_CRYPTO_SEQIV is not set - -# -# Block modes -# -# CONFIG_CRYPTO_CBC is not set -# CONFIG_CRYPTO_CTR is not set -# CONFIG_CRYPTO_CTS is not set -# CONFIG_CRYPTO_ECB is not set -# CONFIG_CRYPTO_LRW is not set -# CONFIG_CRYPTO_PCBC is not set -# CONFIG_CRYPTO_XTS is not set - -# -# Hash modes -# -# CONFIG_CRYPTO_HMAC is not set -# CONFIG_CRYPTO_XCBC is not set -# CONFIG_CRYPTO_VMAC is not set - -# -# Digest -# -# CONFIG_CRYPTO_CRC32C is not set -# CONFIG_CRYPTO_GHASH is not set -# CONFIG_CRYPTO_MD4 is not set -# CONFIG_CRYPTO_MD5 is not set -# CONFIG_CRYPTO_MICHAEL_MIC is not set -# CONFIG_CRYPTO_RMD128 is not set -# CONFIG_CRYPTO_RMD160 is not set -# CONFIG_CRYPTO_RMD256 is not set -# CONFIG_CRYPTO_RMD320 is not set -# CONFIG_CRYPTO_SHA1 is not set -# CONFIG_CRYPTO_SHA256 is not set -# CONFIG_CRYPTO_SHA512 is not set -# CONFIG_CRYPTO_TGR192 is not set -# CONFIG_CRYPTO_WP512 is not set - -# -# Ciphers -# -CONFIG_CRYPTO_AES=m -# CONFIG_CRYPTO_AES_586 is not set -# CONFIG_CRYPTO_ANUBIS is not set -# CONFIG_CRYPTO_ARC4 is not set -# CONFIG_CRYPTO_BLOWFISH is not set -# CONFIG_CRYPTO_CAMELLIA is not set -# CONFIG_CRYPTO_CAST5 is not set -# CONFIG_CRYPTO_CAST6 is not set -# CONFIG_CRYPTO_DES is not set -# CONFIG_CRYPTO_FCRYPT is not set -# CONFIG_CRYPTO_KHAZAD is not set -# CONFIG_CRYPTO_SALSA20 is not set -# CONFIG_CRYPTO_SALSA20_586 is not set -# CONFIG_CRYPTO_SEED is not set -# CONFIG_CRYPTO_SERPENT is not set -# CONFIG_CRYPTO_TEA is not set -# CONFIG_CRYPTO_TWOFISH is not set -# CONFIG_CRYPTO_TWOFISH_586 is not set - -# -# Compression -# -# CONFIG_CRYPTO_DEFLATE is not set -# CONFIG_CRYPTO_ZLIB is not set -# CONFIG_CRYPTO_LZO is not set - -# -# Random Number Generation -# -CONFIG_CRYPTO_ANSI_CPRNG=m -# CONFIG_CRYPTO_USER_API_HASH is not set -# CONFIG_CRYPTO_USER_API_SKCIPHER is not set -CONFIG_CRYPTO_HW=y -# CONFIG_BINARY_PRINTF is not set - -# -# Library routines -# -CONFIG_BITREVERSE=y -CONFIG_GENERIC_FIND_FIRST_BIT=y -CONFIG_GENERIC_IO=y -# CONFIG_CRC_CCITT is not set -CONFIG_CRC16=y -# CONFIG_CRC_T10DIF is not set -# CONFIG_CRC_ITU_T is not set -CONFIG_CRC32=y -# CONFIG_CRC7 is not set -# CONFIG_LIBCRC32C is not set -# CONFIG_CRC8 is not set -# CONFIG_XZ_DEC is not set -# CONFIG_XZ_DEC_BCJ is not set -CONFIG_DQL=y -CONFIG_NLATTR=y -# CONFIG_AVERAGE is not set -# CONFIG_CORDIC is not set - -# -# Kernel hacking -# -# CONFIG_PRINTK_TIME is not set -CONFIG_DEFAULT_MESSAGE_LOGLEVEL=4 -CONFIG_ENABLE_WARN_DEPRECATED=y -CONFIG_ENABLE_MUST_CHECK=y -CONFIG_FRAME_WARN=1024 -# CONFIG_STRIP_ASM_SYMS is not set -# CONFIG_UNUSED_SYMBOLS is not set -# CONFIG_DEBUG_FS is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_DEBUG_KERNEL=y -# CONFIG_DEBUG_SHIRQ is not set -# CONFIG_LOCKUP_DETECTOR is not set -# CONFIG_HARDLOCKUP_DETECTOR is not set -# CONFIG_DETECT_HUNG_TASK is not set -CONFIG_SCHED_DEBUG=y -# CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_OBJECTS is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_RT_MUTEX_TESTER is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_SPARSE_RCU_POINTER is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_DEBUG_STACK_USAGE is not set -# CONFIG_DEBUG_KOBJECT is not set -CONFIG_DEBUG_BUGVERBOSE=y -CONFIG_DEBUG_INFO=y -# CONFIG_DEBUG_INFO_REDUCED is not set -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_WRITECOUNT is not set -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_LIST is not set -# CONFIG_TEST_LIST_SORT is not set -# CONFIG_DEBUG_SG is not set -# CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_DEBUG_CREDENTIALS is not set -CONFIG_FRAME_POINTER=y -# CONFIG_BOOT_PRINTK_DELAY is not set -# CONFIG_RCU_TORTURE_TEST is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -# CONFIG_FAULT_INJECTION is not set -# CONFIG_SYSCTL_SYSCALL_CHECK is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_SAMPLES is not set -# CONFIG_TEST_KSTRTOX is not set -# CONFIG_GPROF is not set -# CONFIG_GCOV is not set -CONFIG_EARLY_PRINTK=y diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig new file mode 100644 index 000000000000..6a0354ca032f --- /dev/null +++ b/arch/um/drivers/Kconfig @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "UML Character Devices" + +config STDERR_CONSOLE + bool "stderr console" + default y + help + console driver which dumps all printk messages to stderr. + +config SSL + bool "Virtual serial line" + help + The User-Mode Linux environment allows you to create virtual serial + lines on the UML that are usually made to show up on the host as + ttys or ptys. + + See <http://user-mode-linux.sourceforge.net/old/input.html> for more + information and command line examples of how to use this facility. + + Unless you have a specific reason for disabling this, say Y. + +config NULL_CHAN + bool "null channel support" + help + This option enables support for attaching UML consoles and serial + lines to a device similar to /dev/null. Data written to it disappears + and there is never any data to be read. + +config PORT_CHAN + bool "port channel support" + help + This option enables support for attaching UML consoles and serial + lines to host portals. They may be accessed with 'telnet <host> + <port number>'. Any number of consoles and serial lines may be + attached to a single portal, although what UML device you get when + you telnet to that portal will be unpredictable. + It is safe to say 'Y' here. + +config PTY_CHAN + bool "pty channel support" + help + This option enables support for attaching UML consoles and serial + lines to host pseudo-terminals. Access to both traditional + pseudo-terminals (/dev/pty*) and pts pseudo-terminals are controlled + with this option. The assignment of UML devices to host devices + will be announced in the kernel message log. + It is safe to say 'Y' here. + +config TTY_CHAN + bool "tty channel support" + help + This option enables support for attaching UML consoles and serial + lines to host terminals. Access to both virtual consoles + (/dev/tty*) and the slave side of pseudo-terminals (/dev/ttyp* and + /dev/pts/*) are controlled by this option. + It is safe to say 'Y' here. + +config XTERM_CHAN + bool "xterm channel support" + help + This option enables support for attaching UML consoles and serial + lines to xterms. Each UML device so assigned will be brought up in + its own xterm. + It is safe to say 'Y' here. + +config XTERM_CHAN_DEFAULT_EMULATOR + string "xterm channel default terminal emulator" + depends on XTERM_CHAN + default "xterm" + help + This option allows changing the default terminal emulator. + +config NOCONFIG_CHAN + bool + default !(XTERM_CHAN && TTY_CHAN && PTY_CHAN && PORT_CHAN && NULL_CHAN) + +config CON_ZERO_CHAN + string "Default main console channel initialization" + default "fd:0,fd:1" + help + This is the string describing the channel to which the main console + will be attached by default. This value can be overridden from the + command line. The default value is "fd:0,fd:1", which attaches the + main console to stdin and stdout. + It is safe to leave this unchanged. + +config CON_CHAN + string "Default console channel initialization" + default "xterm" + help + This is the string describing the channel to which all consoles + except the main console will be attached by default. This value can + be overridden from the command line. The default value is "xterm", + which brings them up in xterms. + It is safe to leave this unchanged, although you may wish to change + this if you expect the UML that you build to be run in environments + which don't have X or xterm available. + +config SSL_CHAN + string "Default serial line channel initialization" + default "pty" + help + This is the string describing the channel to which the serial lines + will be attached by default. This value can be overridden from the + command line. The default value is "pty", which attaches them to + traditional pseudo-terminals. + It is safe to leave this unchanged, although you may wish to change + this if you expect the UML that you build to be run in environments + which don't have a set of /dev/pty* devices. + +config UML_SOUND + tristate "Sound support" + depends on SOUND + select SOUND_OSS_CORE + help + This option enables UML sound support. If enabled, it will pull in + the UML hostaudio relay, which acts as a intermediary + between the host's dsp and mixer devices and the UML sound system. + It is safe to say 'Y' here. + +endmenu + +menu "UML Network Devices" + depends on NET + +config UML_NET_VECTOR + bool "Vector I/O high performance network devices" + select MAY_HAVE_RUNTIME_DEPS + help + This User-Mode Linux network driver uses multi-message send + and receive functions. The host running the UML guest must have + a linux kernel version above 3.0 and a libc version > 2.13. + This driver provides tap, raw, gre and l2tpv3 network transports. + + For more information, including explanations of the networking + and sample configurations, see + <file:Documentation/virt/uml/user_mode_linux_howto_v2.rst>. + +endmenu + +config VIRTIO_UML + bool "UML driver for virtio devices" + select VIRTIO + help + This driver provides support for virtio based paravirtual device + drivers over vhost-user sockets. + +config UML_RTC + bool "UML RTC driver" + depends on RTC_CLASS + # there's no use in this if PM_SLEEP isn't enabled ... + depends on PM_SLEEP + help + When PM_SLEEP is configured, it may be desirable to wake up using + rtcwake, especially in time-travel mode. This driver enables that + by providing a fake RTC clock that causes a wakeup at the right + time. + +config UML_PCI + bool + select FORCE_PCI + select IRQ_MSI_LIB + select UML_IOMEM_EMULATION + select UML_DMA_EMULATION + select PCI_MSI + select PCI_LOCKLESS_CONFIG + +config UML_PCI_OVER_VIRTIO + bool "Enable PCI over VIRTIO device simulation" + # in theory, just VIRTIO is enough, but that causes recursion + depends on VIRTIO_UML + select UML_PCI + +config UML_PCI_OVER_VIRTIO_DEVICE_ID + int "set the virtio device ID for PCI emulation" + default -1 + depends on UML_PCI_OVER_VIRTIO + help + There's no official device ID assigned (yet), set the one you + wish to use for experimentation here. The default of -1 is + not valid and will cause the driver to fail at probe. + +config UML_PCI_OVER_VFIO + bool "Enable VFIO-based PCI passthrough" + select UML_PCI + help + This driver provides support for VFIO-based PCI passthrough. + Currently, only MSI-X capable devices are supported, and it + is assumed that drivers will use MSI-X. diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index e7582e1d248c..36dc57840084 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -1,33 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2000, 2002, 2003 Jeff Dike (jdike@karaya.com) -# Licensed under the GPL # # pcap is broken in 2.5 because kbuild doesn't allow pcap.a to be linked # in to pcap.o -slip-objs := slip_kern.o slip_user.o -slirp-objs := slirp_kern.o slirp_user.o -daemon-objs := daemon_kern.o daemon_user.o -umcast-objs := umcast_kern.o umcast_user.o -net-objs := net_kern.o net_user.o +vector-objs := vector_kern.o vector_user.o vector_transports.o mconsole-objs := mconsole_kern.o mconsole_user.o hostaudio-objs := hostaudio_kern.o ubd-objs := ubd_kern.o ubd_user.o port-objs := port_kern.o port_user.o -harddog-objs := harddog_kern.o harddog_user.o - -LDFLAGS_pcap.o := -r $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libpcap.a) - -LDFLAGS_vde.o := -r $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) - -targets := pcap_kern.o pcap_user.o vde_kern.o vde_user.o - -$(obj)/pcap.o: $(obj)/pcap_kern.o $(obj)/pcap_user.o - $(LD) -r -dp -o $@ $^ $(LDFLAGS) $(LDFLAGS_pcap.o) - -$(obj)/vde.o: $(obj)/vde_kern.o $(obj)/vde_user.o - $(LD) -r -dp -o $@ $^ $(LDFLAGS) $(LDFLAGS_vde.o) +harddog-objs := harddog_kern.o +harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o +rtc-objs := rtc_kern.o rtc_user.o +vfio_uml-objs := vfio_kern.o vfio_user.o #XXX: The call below does not work because the flags are added before the # object name, so nothing from the library gets linked. @@ -40,28 +27,29 @@ obj-y := stdio_console.o fd.o chan_kern.o chan_user.o line.o obj-$(CONFIG_SSL) += ssl.o obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o -obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o -obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o -obj-$(CONFIG_UML_NET_DAEMON) += daemon.o -obj-$(CONFIG_UML_NET_VDE) += vde.o -obj-$(CONFIG_UML_NET_MCAST) += umcast.o -obj-$(CONFIG_UML_NET_PCAP) += pcap.o -obj-$(CONFIG_UML_NET) += net.o +obj-$(CONFIG_UML_NET_VECTOR) += vector.o obj-$(CONFIG_MCONSOLE) += mconsole.o -obj-$(CONFIG_MMAPPER) += mmapper_kern.o obj-$(CONFIG_BLK_DEV_UBD) += ubd.o -obj-$(CONFIG_HOSTAUDIO) += hostaudio.o +obj-$(CONFIG_UML_SOUND) += hostaudio.o obj-$(CONFIG_NULL_CHAN) += null.o obj-$(CONFIG_PORT_CHAN) += port.o obj-$(CONFIG_PTY_CHAN) += pty.o obj-$(CONFIG_TTY_CHAN) += tty.o obj-$(CONFIG_XTERM_CHAN) += xterm.o xterm_kern.o obj-$(CONFIG_UML_WATCHDOG) += harddog.o +obj-y += $(harddog-builtin-y) $(harddog-builtin-m) obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o obj-$(CONFIG_UML_RANDOM) += random.o +obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o +obj-$(CONFIG_UML_RTC) += rtc.o +obj-$(CONFIG_UML_PCI) += virt-pci.o +obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o +obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o # pcap_user.o must be added explicitly. -USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o +USER_OBJS := fd.o null.o pty.o tty.o xterm.o vector_user.o CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) -include arch/um/scripts/Makefile.rules +CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"' + +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/drivers/chan.h b/arch/um/drivers/chan.h index c512b0306dd4..5a61db512ffb 100644 --- a/arch/um/drivers/chan.h +++ b/arch/um/drivers/chan.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __CHAN_KERN_H__ @@ -22,7 +22,8 @@ struct chan { unsigned int output:1; unsigned int opened:1; unsigned int enabled:1; - int fd; + int fd_in; + int fd_out; /* only different to fd_in if blocking output is needed */ const struct chan_ops *ops; void *data; }; @@ -30,13 +31,12 @@ struct chan { extern void chan_interrupt(struct line *line, int irq); extern int parse_chan_pair(char *str, struct line *line, int device, const struct chan_opts *opts, char **error_out); -extern int write_chan(struct chan *chan, const char *buf, int len, +extern int write_chan(struct chan *chan, const u8 *buf, size_t len, int write_irq); extern int console_write_chan(struct chan *chan, const char *buf, int len); extern int console_open_chan(struct line *line, struct console *co); extern void deactivate_chan(struct chan *chan, int irq); -extern void reactivate_chan(struct chan *chan, int irq); extern void chan_enable_winch(struct chan *chan, struct tty_port *port); extern int enable_chan(struct line *line); extern void close_chan(struct line *line); diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c index acbe6c67afba..26442db7d608 100644 --- a/arch/um/drivers/chan_kern.c +++ b/arch/um/drivers/chan_kern.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <linux/slab.h> @@ -33,14 +33,14 @@ static void not_configged_close(int fd, void *data) "UML\n"); } -static int not_configged_read(int fd, char *c_out, void *data) +static int not_configged_read(int fd, u8 *c_out, void *data) { printk(KERN_ERR "Using a channel type which is configured out of " "UML\n"); return -EIO; } -static int not_configged_write(int fd, const char *buf, int len, void *data) +static int not_configged_write(int fd, const u8 *buf, size_t len, void *data) { printk(KERN_ERR "Using a channel type which is configured out of " "UML\n"); @@ -81,6 +81,12 @@ static const struct chan_ops not_configged_ops = { }; #endif /* CONFIG_NOCONFIG_CHAN */ +static inline bool need_output_blocking(void) +{ + return time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL; +} + static int open_one_chan(struct chan *chan) { int fd, err; @@ -96,15 +102,43 @@ static int open_one_chan(struct chan *chan) return fd; err = os_set_fd_block(fd, 0); - if (err) { - (*chan->ops->close)(fd, chan->data); - return err; - } + if (err) + goto out_close; + + chan->fd_in = fd; + chan->fd_out = fd; + + /* + * In time-travel modes infinite-CPU and external we need to guarantee + * that any writes to the output succeed immdiately from the point of + * the VM. The best way to do this is to put the FD in blocking mode + * and simply wait/retry until everything is written. + * As every write is guaranteed to complete, we also do not need to + * request an IRQ for the output. + * + * Note that input cannot happen in a time synchronized way. We permit + * it, but time passes very quickly if anything waits for a read. + */ + if (chan->output && need_output_blocking()) { + err = os_dup_file(chan->fd_out); + if (err < 0) + goto out_close; - chan->fd = fd; + chan->fd_out = err; + + err = os_set_fd_block(chan->fd_out, 1); + if (err) { + os_close_file(chan->fd_out); + goto out_close; + } + } chan->opened = 1; return 0; + +out_close: + (*chan->ops->close)(fd, chan->data); + return err; } static int open_chan(struct list_head *chans) @@ -125,7 +159,7 @@ static int open_chan(struct list_head *chans) void chan_enable_winch(struct chan *chan, struct tty_port *port) { if (chan && chan->primary && chan->ops->winch) - register_winch(chan->fd, port); + register_winch(chan->fd_in, port); } static void line_timer_cb(struct work_struct *work) @@ -133,7 +167,7 @@ static void line_timer_cb(struct work_struct *work) struct line *line = container_of(work, struct line, task.work); if (!line->throttled) - chan_interrupt(line, line->driver->read_irq); + chan_interrupt(line, line->read_irq); } int enable_chan(struct line *line) @@ -156,8 +190,9 @@ int enable_chan(struct line *line) if (chan->enabled) continue; - err = line_setup_irq(chan->fd, chan->input, chan->output, line, - chan); + err = line_setup_irq(chan->fd_in, chan->input, + chan->output && !need_output_blocking(), + line, chan); if (err) goto out_close; @@ -177,7 +212,7 @@ int enable_chan(struct line *line) * be permanently disabled. This is discovered in IRQ context, but * the freeing of the IRQ must be done later. */ -static DEFINE_SPINLOCK(irqs_to_free_lock); +static DEFINE_RAW_SPINLOCK(irqs_to_free_lock); static LIST_HEAD(irqs_to_free); void free_irqs(void) @@ -187,17 +222,18 @@ void free_irqs(void) struct list_head *ele; unsigned long flags; - spin_lock_irqsave(&irqs_to_free_lock, flags); + raw_spin_lock_irqsave(&irqs_to_free_lock, flags); list_splice_init(&irqs_to_free, &list); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); + raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags); list_for_each(ele, &list) { chan = list_entry(ele, struct chan, free_list); if (chan->input && chan->enabled) - um_free_irq(chan->line->driver->read_irq, chan); - if (chan->output && chan->enabled) - um_free_irq(chan->line->driver->write_irq, chan); + um_free_irq(chan->line->read_irq, chan); + if (chan->output && chan->enabled && + !need_output_blocking()) + um_free_irq(chan->line->write_irq, chan); chan->enabled = 0; } } @@ -210,22 +246,25 @@ static void close_one_chan(struct chan *chan, int delay_free_irq) return; if (delay_free_irq) { - spin_lock_irqsave(&irqs_to_free_lock, flags); + raw_spin_lock_irqsave(&irqs_to_free_lock, flags); list_add(&chan->free_list, &irqs_to_free); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); - } - else { + raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags); + } else { if (chan->input && chan->enabled) - um_free_irq(chan->line->driver->read_irq, chan); - if (chan->output && chan->enabled) - um_free_irq(chan->line->driver->write_irq, chan); + um_free_irq(chan->line->read_irq, chan); + if (chan->output && chan->enabled && + !need_output_blocking()) + um_free_irq(chan->line->write_irq, chan); chan->enabled = 0; } + if (chan->fd_out != chan->fd_in) + os_close_file(chan->fd_out); if (chan->ops->close != NULL) - (*chan->ops->close)(chan->fd, chan->data); + (*chan->ops->close)(chan->fd_in, chan->data); chan->opened = 0; - chan->fd = -1; + chan->fd_in = -1; + chan->fd_out = -1; } void close_chan(struct line *line) @@ -245,28 +284,19 @@ void close_chan(struct line *line) void deactivate_chan(struct chan *chan, int irq) { if (chan && chan->enabled) - deactivate_fd(chan->fd, irq); -} - -void reactivate_chan(struct chan *chan, int irq) -{ - if (chan && chan->enabled) - reactivate_fd(chan->fd, irq); + deactivate_fd(chan->fd_in, irq); } -int write_chan(struct chan *chan, const char *buf, int len, - int write_irq) +int write_chan(struct chan *chan, const u8 *buf, size_t len, int write_irq) { int n, ret = 0; if (len == 0 || !chan || !chan->ops->write) return 0; - n = chan->ops->write(chan->fd, buf, len, chan->data); + n = chan->ops->write(chan->fd_out, buf, len, chan->data); if (chan->primary) { ret = n; - if ((ret == -EAGAIN) || ((ret >= 0) && (ret < len))) - reactivate_fd(chan->fd, write_irq); } return ret; } @@ -278,7 +308,7 @@ int console_write_chan(struct chan *chan, const char *buf, int len) if (!chan || !chan->ops->console_write) return 0; - n = chan->ops->console_write(chan->fd, buf, len); + n = chan->ops->console_write(chan->fd_out, buf, len); if (chan->primary) ret = n; return ret; @@ -306,14 +336,14 @@ int chan_window_size(struct line *line, unsigned short *rows_out, if (chan && chan->primary) { if (chan->ops->window_size == NULL) return 0; - return chan->ops->window_size(chan->fd, chan->data, + return chan->ops->window_size(chan->fd_in, chan->data, rows_out, cols_out); } chan = line->chan_out; if (chan && chan->primary) { if (chan->ops->window_size == NULL) return 0; - return chan->ops->window_size(chan->fd, chan->data, + return chan->ops->window_size(chan->fd_in, chan->data, rows_out, cols_out); } return 0; @@ -329,7 +359,7 @@ static void free_one_chan(struct chan *chan) (*chan->ops->free)(chan->data); if (chan->primary && chan->output) - ignore_sigio_fd(chan->fd); + ignore_sigio_fd(chan->fd_in); kfree(chan); } @@ -488,7 +518,8 @@ static struct chan *parse_chan(struct line *line, char *str, int device, .output = 0, .opened = 0, .enabled = 0, - .fd = -1, + .fd_in = -1, + .fd_out = -1, .ops = ops, .data = data }); return chan; @@ -549,7 +580,7 @@ void chan_interrupt(struct line *line, int irq) struct tty_port *port = &line->port; struct chan *chan = line->chan_in; int err; - char c; + u8 c; if (!chan || !chan->ops->read) goto out; @@ -559,13 +590,11 @@ void chan_interrupt(struct line *line, int irq) schedule_delayed_work(&line->task, 1); goto out; } - err = chan->ops->read(chan->fd, &c, chan->data); + err = chan->ops->read(chan->fd_in, &c, chan->data); if (err > 0) tty_insert_flip_char(port, c, TTY_NORMAL); } while (err > 0); - if (err == 0) - reactivate_fd(chan->fd, irq); if (err == -EIO) { if (chan->primary) { tty_port_tty_hangup(&line->port, false); diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index 3fd7c3efdb18..35f9beeb19b3 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <stdlib.h> @@ -19,29 +19,41 @@ void generic_close(int fd, void *unused) close(fd); } -int generic_read(int fd, char *c_out, void *unused) +int generic_read(int fd, __u8 *c_out, void *unused) { int n; - n = read(fd, c_out, sizeof(*c_out)); + CATCH_EINTR(n = read(fd, c_out, sizeof(*c_out))); if (n > 0) return n; - else if (errno == EAGAIN) - return 0; else if (n == 0) return -EIO; + else if (errno == EAGAIN) + return 0; return -errno; } /* XXX Trivial wrapper around write */ -int generic_write(int fd, const char *buf, int n, void *unused) +int generic_write(int fd, const __u8 *buf, size_t n, void *unused) { + int written = 0; int err; - err = write(fd, buf, n); - if (err > 0) - return err; + /* The FD may be in blocking mode, as such, need to retry short writes, + * they may have been interrupted by a signal. + */ + do { + errno = 0; + err = write(fd, buf + written, n - written); + if (err > 0) { + written += err; + continue; + } + } while (err < 0 && errno == EINTR); + + if (written > 0) + return written; else if (errno == EAGAIN) return 0; else if (err == 0) @@ -141,7 +153,7 @@ struct winch_data { int pipe_fd; }; -static int winch_thread(void *arg) +static __noreturn int winch_thread(void *arg) { struct winch_data *data = arg; sigset_t sigs; @@ -149,12 +161,14 @@ static int winch_thread(void *arg) int count; char c = 1; + os_set_pdeathsig(); + pty_fd = data->pty_fd; pipe_fd = data->pipe_fd; count = write(pipe_fd, &c, sizeof(c)); if (count != sizeof(c)) - printk(UM_KERN_ERR "winch_thread : failed to write " - "synchronization byte, err = %d\n", -count); + os_info("winch_thread : failed to write synchronization byte, err = %d\n", + -count); /* * We are not using SIG_IGN on purpose, so don't fix it as I thought to @@ -166,29 +180,29 @@ static int winch_thread(void *arg) sigfillset(&sigs); /* Block all signals possible. */ if (sigprocmask(SIG_SETMASK, &sigs, NULL) < 0) { - printk(UM_KERN_ERR "winch_thread : sigprocmask failed, " - "errno = %d\n", errno); - exit(1); + os_info("winch_thread : sigprocmask failed, errno = %d\n", + errno); + goto wait_kill; } /* In sigsuspend(), block anything else than SIGWINCH. */ sigdelset(&sigs, SIGWINCH); if (setsid() < 0) { - printk(UM_KERN_ERR "winch_thread : setsid failed, errno = %d\n", + os_info("winch_thread : setsid failed, errno = %d\n", errno); - exit(1); + goto wait_kill; } if (ioctl(pty_fd, TIOCSCTTY, 0) < 0) { - printk(UM_KERN_ERR "winch_thread : TIOCSCTTY failed on " - "fd %d err = %d\n", pty_fd, errno); - exit(1); + os_info("winch_thread : TIOCSCTTY failed on " + "fd %d err = %d\n", pty_fd, errno); + goto wait_kill; } if (tcsetpgrp(pty_fd, os_getpid()) < 0) { - printk(UM_KERN_ERR "winch_thread : tcsetpgrp failed on " - "fd %d err = %d\n", pty_fd, errno); - exit(1); + os_info("winch_thread : tcsetpgrp failed on fd %d err = %d\n", + pty_fd, errno); + goto wait_kill; } /* @@ -199,8 +213,8 @@ static int winch_thread(void *arg) */ count = read(pipe_fd, &c, sizeof(c)); if (count != sizeof(c)) - printk(UM_KERN_ERR "winch_thread : failed to read " - "synchronization byte, err = %d\n", errno); + os_info("winch_thread : failed to read synchronization byte, err = %d\n", + errno); while(1) { /* @@ -211,16 +225,22 @@ static int winch_thread(void *arg) count = write(pipe_fd, &c, sizeof(c)); if (count != sizeof(c)) - printk(UM_KERN_ERR "winch_thread : write failed, " - "err = %d\n", errno); + os_info("winch_thread : write failed, err = %d\n", + errno); } + +wait_kill: + c = 2; + count = write(pipe_fd, &c, sizeof(c)); + while (1) + pause(); } static int winch_tramp(int fd, struct tty_port *port, int *fd_out, unsigned long *stack_out) { struct winch_data data; - int fds[2], n, err; + int fds[2], n, err, pid; char c; err = os_pipe(fds, 1, 1); @@ -238,8 +258,9 @@ static int winch_tramp(int fd, struct tty_port *port, int *fd_out, * problem with /dev/net/tun, which if held open by this * thread, prevents the TUN/TAP device from being reused. */ - err = run_helper_thread(winch_thread, &data, CLONE_FILES, stack_out); - if (err < 0) { + pid = run_helper_thread(winch_thread, &data, CLONE_FILES, stack_out); + if (pid < 0) { + err = pid; printk(UM_KERN_ERR "fork of winch_thread failed - errno = %d\n", -err); goto out_close; @@ -256,13 +277,14 @@ static int winch_tramp(int fd, struct tty_port *port, int *fd_out, goto out_close; } - if (os_set_fd_block(*fd_out, 0)) { + err = os_set_fd_block(*fd_out, 0); + if (err) { printk(UM_KERN_ERR "winch_tramp: failed to set thread_fd " "non-blocking.\n"); goto out_close; } - return err; + return pid; out_close: close(fds[1]); diff --git a/arch/um/drivers/chan_user.h b/arch/um/drivers/chan_user.h index 03f1b565c5f9..e158e16fb3cc 100644 --- a/arch/um/drivers/chan_user.h +++ b/arch/um/drivers/chan_user.h @@ -1,17 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __CHAN_USER_H__ #define __CHAN_USER_H__ #include <init.h> +#include <linux/types.h> struct chan_opts { void (*const announce)(char *dev_name, int dev); char *xterm_title; - const int raw; + int raw; }; struct chan_ops { @@ -19,8 +20,8 @@ struct chan_ops { void *(*init)(char *, int, const struct chan_opts *); int (*open)(int, int, int, void *, char **); void (*close)(int, void *); - int (*read)(int, char *, void *); - int (*write)(int, const char *, int, void *); + int (*read)(int, __u8 *, void *); + int (*write)(int, const __u8 *, size_t, void *); int (*console_write)(int, const char *, int); int (*window_size)(int, void *, unsigned short *, unsigned short *); void (*free)(void *); @@ -31,8 +32,8 @@ extern const struct chan_ops fd_ops, null_ops, port_ops, pts_ops, pty_ops, tty_ops, xterm_ops; extern void generic_close(int fd, void *unused); -extern int generic_read(int fd, char *c_out, void *unused); -extern int generic_write(int fd, const char *buf, int n, void *unused); +extern int generic_read(int fd, __u8 *c_out, void *unused); +extern int generic_write(int fd, const __u8 *buf, size_t n, void *unused); extern int generic_console_write(int fd, const char *buf, int n); extern int generic_window_size(int fd, void *unused, unsigned short *rows_out, unsigned short *cols_out); diff --git a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h index 6673508f3426..9a67c017000f 100644 --- a/arch/um/drivers/cow.h +++ b/arch/um/drivers/cow.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __COW_H__ #define __COW_H__ @@ -10,7 +11,7 @@ extern int init_cow_file(int fd, char *cow_file, char *backing_file, extern int file_reader(__u64 offset, char *buf, int len, void *arg); extern int read_cow_header(int (*reader)(__u64, char *, int, void *), void *arg, __u32 *version_out, - char **backing_file_out, time_t *mtime_out, + char **backing_file_out, long long *mtime_out, unsigned long long *size_out, int *sectorsize_out, __u32 *align_out, int *bitmap_offset_out); @@ -23,10 +24,3 @@ extern void cow_sizes(int version, __u64 size, int sectorsize, int align, int *data_offset_out); #endif - -/* - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/drivers/cow_sys.h b/arch/um/drivers/cow_sys.h index 67cbee63e702..916811ef5317 100644 --- a/arch/um/drivers/cow_sys.h +++ b/arch/um/drivers/cow_sys.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __COW_SYS_H__ #define __COW_SYS_H__ diff --git a/arch/um/drivers/cow_user.c b/arch/um/drivers/cow_user.c index 0ee9cc6cc4c7..29b46581ddd1 100644 --- a/arch/um/drivers/cow_user.c +++ b/arch/um/drivers/cow_user.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ /* @@ -17,6 +17,7 @@ #define PATH_LEN_V1 256 +/* unsigned time_t works until year 2106 */ typedef __u32 time32_t; struct cow_header_v1 { @@ -197,7 +198,7 @@ int write_cow_header(char *cow_file, int fd, char *backing_file, int sectorsize, int alignment, unsigned long long *size) { struct cow_header_v3 *header; - unsigned long modtime; + long long modtime; int err; err = cow_seek_file(fd, 0); @@ -276,7 +277,7 @@ int file_reader(__u64 offset, char *buf, int len, void *arg) int read_cow_header(int (*reader)(__u64, char *, int, void *), void *arg, __u32 *version_out, char **backing_file_out, - time_t *mtime_out, unsigned long long *size_out, + long long *mtime_out, unsigned long long *size_out, int *sectorsize_out, __u32 *align_out, int *bitmap_offset_out) { @@ -363,7 +364,7 @@ int read_cow_header(int (*reader)(__u64, char *, int, void *), void *arg, /* * this was used until Dec2005 - 64bits are needed to represent - * 2038+. I.e. we can safely do this truncating cast. + * 2106+. I.e. we can safely do this truncating cast. * * Additionally, we must use be32toh() instead of be64toh(), since * the program used to use the former (tested - I got mtime diff --git a/arch/um/drivers/daemon.h b/arch/um/drivers/daemon.h deleted file mode 100644 index c2dd1951559f..000000000000 --- a/arch/um/drivers/daemon.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __DAEMON_H__ -#define __DAEMON_H__ - -#include <net_user.h> - -#define SWITCH_VERSION 3 - -struct daemon_data { - char *sock_type; - char *ctl_sock; - void *ctl_addr; - void *data_addr; - void *local_addr; - int fd; - int control; - void *dev; -}; - -extern const struct net_user_info daemon_user_info; - -extern int daemon_user_write(int fd, void *buf, int len, - struct daemon_data *pri); - -#endif diff --git a/arch/um/drivers/daemon_kern.c b/arch/um/drivers/daemon_kern.c deleted file mode 100644 index 7568cc2f3cd6..000000000000 --- a/arch/um/drivers/daemon_kern.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include "daemon.h" - -struct daemon_init { - char *sock_type; - char *ctl_sock; -}; - -static void daemon_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct daemon_data *dpri; - struct daemon_init *init = data; - - pri = netdev_priv(dev); - dpri = (struct daemon_data *) pri->user; - dpri->sock_type = init->sock_type; - dpri->ctl_sock = init->ctl_sock; - dpri->fd = -1; - dpri->control = -1; - dpri->dev = dev; - /* We will free this pointer. If it contains crap we're burned. */ - dpri->ctl_addr = NULL; - dpri->data_addr = NULL; - dpri->local_addr = NULL; - - printk("daemon backend (uml_switch version %d) - %s:%s", - SWITCH_VERSION, dpri->sock_type, dpri->ctl_sock); - printk("\n"); -} - -static int daemon_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int daemon_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return daemon_user_write(fd, skb->data, skb->len, - (struct daemon_data *) &lp->user); -} - -static const struct net_kern_info daemon_kern_info = { - .init = daemon_init, - .protocol = eth_protocol, - .read = daemon_read, - .write = daemon_write, -}; - -static int daemon_setup(char *str, char **mac_out, void *data) -{ - struct daemon_init *init = data; - char *remain; - - *init = ((struct daemon_init) - { .sock_type = "unix", - .ctl_sock = "/tmp/uml.ctl" }); - - remain = split_if_spec(str, mac_out, &init->sock_type, &init->ctl_sock, - NULL); - if (remain != NULL) - printk(KERN_WARNING "daemon_setup : Ignoring data socket " - "specification\n"); - - return 1; -} - -static struct transport daemon_transport = { - .list = LIST_HEAD_INIT(daemon_transport.list), - .name = "daemon", - .setup = daemon_setup, - .user = &daemon_user_info, - .kern = &daemon_kern_info, - .private_size = sizeof(struct daemon_data), - .setup_size = sizeof(struct daemon_init), -}; - -static int register_daemon(void) -{ - register_transport(&daemon_transport); - return 0; -} - -late_initcall(register_daemon); diff --git a/arch/um/drivers/daemon_user.c b/arch/um/drivers/daemon_user.c deleted file mode 100644 index 8813c10d0177..000000000000 --- a/arch/um/drivers/daemon_user.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include <stdint.h> -#include <unistd.h> -#include <errno.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/time.h> -#include <sys/un.h> -#include "daemon.h" -#include <net_user.h> -#include <os.h> -#include <um_malloc.h> - -enum request_type { REQ_NEW_CONTROL }; - -#define SWITCH_MAGIC 0xfeedface - -struct request_v3 { - uint32_t magic; - uint32_t version; - enum request_type type; - struct sockaddr_un sock; -}; - -static struct sockaddr_un *new_addr(void *name, int len) -{ - struct sockaddr_un *sun; - - sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); - if (sun == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un " - "failed\n"); - return NULL; - } - sun->sun_family = AF_UNIX; - memcpy(sun->sun_path, name, len); - return sun; -} - -static int connect_to_switch(struct daemon_data *pri) -{ - struct sockaddr_un *ctl_addr = pri->ctl_addr; - struct sockaddr_un *local_addr = pri->local_addr; - struct sockaddr_un *sun; - struct request_v3 req; - int fd, n, err; - - pri->control = socket(AF_UNIX, SOCK_STREAM, 0); - if (pri->control < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : control socket failed, " - "errno = %d\n", -err); - return err; - } - - if (connect(pri->control, (struct sockaddr *) ctl_addr, - sizeof(*ctl_addr)) < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : control connect failed, " - "errno = %d\n", -err); - goto out; - } - - fd = socket(AF_UNIX, SOCK_DGRAM, 0); - if (fd < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : data socket failed, " - "errno = %d\n", -err); - goto out; - } - if (bind(fd, (struct sockaddr *) local_addr, sizeof(*local_addr)) < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : data bind failed, " - "errno = %d\n", -err); - goto out_close; - } - - sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); - if (sun == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un " - "failed\n"); - err = -ENOMEM; - goto out_close; - } - - req.magic = SWITCH_MAGIC; - req.version = SWITCH_VERSION; - req.type = REQ_NEW_CONTROL; - req.sock = *local_addr; - n = write(pri->control, &req, sizeof(req)); - if (n != sizeof(req)) { - printk(UM_KERN_ERR "daemon_open : control setup request " - "failed, err = %d\n", -errno); - err = -ENOTCONN; - goto out_free; - } - - n = read(pri->control, sun, sizeof(*sun)); - if (n != sizeof(*sun)) { - printk(UM_KERN_ERR "daemon_open : read of data socket failed, " - "err = %d\n", -errno); - err = -ENOTCONN; - goto out_free; - } - - pri->data_addr = sun; - return fd; - - out_free: - kfree(sun); - out_close: - close(fd); - out: - close(pri->control); - return err; -} - -static int daemon_user_init(void *data, void *dev) -{ - struct daemon_data *pri = data; - struct timeval tv; - struct { - char zero; - int pid; - int usecs; - } name; - - if (!strcmp(pri->sock_type, "unix")) - pri->ctl_addr = new_addr(pri->ctl_sock, - strlen(pri->ctl_sock) + 1); - name.zero = 0; - name.pid = os_getpid(); - gettimeofday(&tv, NULL); - name.usecs = tv.tv_usec; - pri->local_addr = new_addr(&name, sizeof(name)); - pri->dev = dev; - pri->fd = connect_to_switch(pri); - if (pri->fd < 0) { - kfree(pri->local_addr); - pri->local_addr = NULL; - return pri->fd; - } - - return 0; -} - -static int daemon_open(void *data) -{ - struct daemon_data *pri = data; - return pri->fd; -} - -static void daemon_remove(void *data) -{ - struct daemon_data *pri = data; - - close(pri->fd); - pri->fd = -1; - close(pri->control); - pri->control = -1; - - kfree(pri->data_addr); - pri->data_addr = NULL; - kfree(pri->ctl_addr); - pri->ctl_addr = NULL; - kfree(pri->local_addr); - pri->local_addr = NULL; -} - -int daemon_user_write(int fd, void *buf, int len, struct daemon_data *pri) -{ - struct sockaddr_un *data_addr = pri->data_addr; - - return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)); -} - -const struct net_user_info daemon_user_info = { - .init = daemon_user_init, - .open = daemon_open, - .close = NULL, - .remove = daemon_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/fd.c b/arch/um/drivers/fd.c index a13a427b996b..082d739dc052 100644 --- a/arch/um/drivers/fd.c +++ b/arch/um/drivers/fd.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <stdio.h> diff --git a/arch/um/drivers/harddog.h b/arch/um/drivers/harddog.h new file mode 100644 index 000000000000..6d9ea60e7133 --- /dev/null +++ b/arch/um/drivers/harddog.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef UM_WATCHDOG_H +#define UM_WATCHDOG_H + +int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock); +void stop_watchdog(int in_fd, int out_fd); +int ping_watchdog(int fd); + +#endif /* UM_WATCHDOG_H */ diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c index 2d0266d0254d..819aabb4ecdc 100644 --- a/arch/um/drivers/harddog_kern.c +++ b/arch/um/drivers/harddog_kern.c @@ -45,9 +45,11 @@ #include <linux/mutex.h> #include <linux/init.h> #include <linux/spinlock.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mconsole.h" +#include "harddog.h" +MODULE_DESCRIPTION("UML hardware watchdog"); MODULE_LICENSE("GPL"); static DEFINE_MUTEX(harddog_mutex); @@ -60,8 +62,6 @@ static int harddog_out_fd = -1; * Allow only one person to hold it open */ -extern int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock); - static int harddog_open(struct inode *inode, struct file *file) { int err = -EBUSY; @@ -85,15 +85,13 @@ static int harddog_open(struct inode *inode, struct file *file) timer_alive = 1; spin_unlock(&lock); mutex_unlock(&harddog_mutex); - return nonseekable_open(inode, file); + return stream_open(inode, file); err: spin_unlock(&lock); mutex_unlock(&harddog_mutex); return err; } -extern void stop_watchdog(int in_fd, int out_fd); - static int harddog_release(struct inode *inode, struct file *file) { /* @@ -112,8 +110,6 @@ static int harddog_release(struct inode *inode, struct file *file) return 0; } -extern int ping_watchdog(int fd); - static ssize_t harddog_write(struct file *file, const char __user *data, size_t len, loff_t *ppos) { @@ -165,9 +161,9 @@ static const struct file_operations harddog_fops = { .owner = THIS_MODULE, .write = harddog_write, .unlocked_ioctl = harddog_ioctl, + .compat_ioctl = compat_ptr_ioctl, .open = harddog_open, .release = harddog_release, - .llseek = no_llseek, }; static struct miscdevice harddog_miscdev = { @@ -175,27 +171,4 @@ static struct miscdevice harddog_miscdev = { .name = "watchdog", .fops = &harddog_fops, }; - -static char banner[] __initdata = KERN_INFO "UML Watchdog Timer\n"; - -static int __init harddog_init(void) -{ - int ret; - - ret = misc_register(&harddog_miscdev); - - if (ret) - return ret; - - printk(banner); - - return 0; -} - -static void __exit harddog_exit(void) -{ - misc_deregister(&harddog_miscdev); -} - -module_init(harddog_init); -module_exit(harddog_exit); +module_misc_device(harddog_miscdev); diff --git a/arch/um/drivers/harddog_user.c b/arch/um/drivers/harddog_user.c index f99b32a4dbff..9ed89304975e 100644 --- a/arch/um/drivers/harddog_user.c +++ b/arch/um/drivers/harddog_user.c @@ -1,16 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> #include <unistd.h> #include <errno.h> #include <os.h> +#include "harddog.h" struct dog_data { - int stdin; - int stdout; + int stdin_fd; + int stdout_fd; int close_me[2]; }; @@ -18,11 +19,11 @@ static void pre_exec(void *d) { struct dog_data *data = d; - dup2(data->stdin, 0); - dup2(data->stdout, 1); - dup2(data->stdout, 2); - close(data->stdin); - close(data->stdout); + dup2(data->stdin_fd, 0); + dup2(data->stdout_fd, 1); + dup2(data->stdout_fd, 2); + close(data->stdin_fd); + close(data->stdout_fd); close(data->close_me[0]); close(data->close_me[1]); } @@ -49,8 +50,8 @@ int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock) goto out_close_in; } - data.stdin = out_fds[0]; - data.stdout = in_fds[1]; + data.stdin_fd = out_fds[0]; + data.stdout_fd = in_fds[1]; data.close_me[0] = out_fds[1]; data.close_me[1] = in_fds[0]; diff --git a/arch/um/drivers/harddog_user_exp.c b/arch/um/drivers/harddog_user_exp.c new file mode 100644 index 000000000000..c74d4b815d14 --- /dev/null +++ b/arch/um/drivers/harddog_user_exp.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/export.h> +#include "harddog.h" + +#if IS_MODULE(CONFIG_UML_WATCHDOG) +EXPORT_SYMBOL(start_watchdog); +EXPORT_SYMBOL(stop_watchdog); +EXPORT_SYMBOL(ping_watchdog); +#endif diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index 9b90fdc4b151..0ac149de1ac0 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 Steve Schmidtke - * Licensed under the GPL */ #include <linux/fs.h> @@ -9,7 +9,7 @@ #include <linux/sound.h> #include <linux/soundcard.h> #include <linux/mutex.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <init.h> #include <os.h> @@ -48,6 +48,7 @@ MODULE_PARM_DESC(mixer, MIXER_HELP); #ifndef MODULE static int set_dsp(char *name, int *add) { + *add = 0; dsp = name; return 0; } @@ -56,6 +57,7 @@ __uml_setup("dsp=", set_dsp, "dsp=<dsp device>\n" DSP_HELP); static int set_mixer(char *name, int *add) { + *add = 0; mixer = name; return 0; } @@ -105,13 +107,9 @@ static ssize_t hostaudio_write(struct file *file, const char __user *buffer, printk(KERN_DEBUG "hostaudio: write called, count = %d\n", count); #endif - kbuf = kmalloc(count, GFP_KERNEL); - if (kbuf == NULL) - return -ENOMEM; - - err = -EFAULT; - if (copy_from_user(kbuf, buffer, count)) - goto out; + kbuf = memdup_user(buffer, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); err = os_write_file(state->fd, kbuf, count); if (err < 0) @@ -123,16 +121,14 @@ static ssize_t hostaudio_write(struct file *file, const char __user *buffer, return err; } -static unsigned int hostaudio_poll(struct file *file, - struct poll_table_struct *wait) +static __poll_t hostaudio_poll(struct file *file, + struct poll_table_struct *wait) { - unsigned int mask = 0; - #ifdef DEBUG printk(KERN_DEBUG "hostaudio: poll called (unimplemented)\n"); #endif - return mask; + return 0; } static long hostaudio_ioctl(struct file *file, @@ -185,9 +181,9 @@ static int hostaudio_open(struct inode *inode, struct file *file) int ret; #ifdef DEBUG - kparam_block_sysfs_write(dsp); + kernel_param_lock(THIS_MODULE); printk(KERN_DEBUG "hostaudio: open called (host: %s)\n", dsp); - kparam_unblock_sysfs_write(dsp); + kernel_param_unlock(THIS_MODULE); #endif state = kmalloc(sizeof(struct hostaudio_state), GFP_KERNEL); @@ -199,11 +195,11 @@ static int hostaudio_open(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) w = 1; - kparam_block_sysfs_write(dsp); + kernel_param_lock(THIS_MODULE); mutex_lock(&hostaudio_mutex); ret = os_open_file(dsp, of_set_rw(OPENFLAGS(), r, w), 0); mutex_unlock(&hostaudio_mutex); - kparam_unblock_sysfs_write(dsp); + kernel_param_unlock(THIS_MODULE); if (ret < 0) { kfree(state); @@ -260,17 +256,17 @@ static int hostmixer_open_mixdev(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) w = 1; - kparam_block_sysfs_write(mixer); + kernel_param_lock(THIS_MODULE); mutex_lock(&hostaudio_mutex); ret = os_open_file(mixer, of_set_rw(OPENFLAGS(), r, w), 0); mutex_unlock(&hostaudio_mutex); - kparam_unblock_sysfs_write(mixer); + kernel_param_unlock(THIS_MODULE); if (ret < 0) { - kparam_block_sysfs_write(dsp); + kernel_param_lock(THIS_MODULE); printk(KERN_ERR "hostaudio_open_mixdev failed to open '%s', " "err = %d\n", dsp, -ret); - kparam_unblock_sysfs_write(dsp); + kernel_param_unlock(THIS_MODULE); kfree(state); return ret; } @@ -297,11 +293,11 @@ static int hostmixer_release(struct inode *inode, struct file *file) static const struct file_operations hostaudio_fops = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = hostaudio_read, .write = hostaudio_write, .poll = hostaudio_poll, .unlocked_ioctl = hostaudio_ioctl, + .compat_ioctl = compat_ptr_ioctl, .mmap = NULL, .open = hostaudio_open, .release = hostaudio_release, @@ -309,13 +305,12 @@ static const struct file_operations hostaudio_fops = { static const struct file_operations hostmixer_fops = { .owner = THIS_MODULE, - .llseek = no_llseek, .unlocked_ioctl = hostmixer_ioctl_mixdev, .open = hostmixer_open_mixdev, .release = hostmixer_release, }; -struct { +static struct { int dev_audio; int dev_mixer; } module_data; @@ -326,10 +321,10 @@ MODULE_LICENSE("GPL"); static int __init hostaudio_init_module(void) { - __kernel_param_lock(); + kernel_param_lock(THIS_MODULE); printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n", dsp, mixer); - __kernel_param_unlock(); + kernel_param_unlock(THIS_MODULE); module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1); if (module_data.dev_audio < 0) { diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 8035145f043b..43d8959cc746 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -1,12 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/irqreturn.h> #include <linux/kd.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> + #include "chan.h" #include <irq_kern.h> #include <irq_user.h> @@ -31,7 +32,7 @@ static irqreturn_t line_interrupt(int irq, void *data) * * Should be called while holding line->lock (this does not modify data). */ -static int write_room(struct line *line) +static unsigned int write_room(struct line *line) { int n; @@ -46,11 +47,11 @@ static int write_room(struct line *line) return n - 1; } -int line_write_room(struct tty_struct *tty) +unsigned int line_write_room(struct tty_struct *tty) { struct line *line = tty->driver_data; unsigned long flags; - int room; + unsigned int room; spin_lock_irqsave(&line->lock, flags); room = write_room(line); @@ -59,11 +60,11 @@ int line_write_room(struct tty_struct *tty) return room; } -int line_chars_in_buffer(struct tty_struct *tty) +unsigned int line_chars_in_buffer(struct tty_struct *tty) { struct line *line = tty->driver_data; unsigned long flags; - int ret; + unsigned int ret; spin_lock_irqsave(&line->lock, flags); /* write_room subtracts 1 for the needed NULL, so we readd it.*/ @@ -82,7 +83,7 @@ int line_chars_in_buffer(struct tty_struct *tty) * * Must be called while holding line->lock! */ -static int buffer_data(struct line *line, const char *buf, int len) +static int buffer_data(struct line *line, const u8 *buf, size_t len) { int end, room; @@ -138,7 +139,7 @@ static int flush_buffer(struct line *line) count = line->buffer + LINE_BUFSIZE - line->head; n = write_chan(line->chan_out, line->head, count, - line->driver->write_irq); + line->write_irq); if (n < 0) return n; if (n == count) { @@ -155,7 +156,7 @@ static int flush_buffer(struct line *line) count = line->tail - line->head; n = write_chan(line->chan_out, line->head, count, - line->driver->write_irq); + line->write_irq); if (n < 0) return n; @@ -183,12 +184,7 @@ void line_flush_chars(struct tty_struct *tty) line_flush_buffer(tty); } -int line_put_char(struct tty_struct *tty, unsigned char ch) -{ - return line_write(tty, &ch, sizeof(ch)); -} - -int line_write(struct tty_struct *tty, const unsigned char *buf, int len) +ssize_t line_write(struct tty_struct *tty, const u8 *buf, size_t len) { struct line *line = tty->driver_data; unsigned long flags; @@ -199,7 +195,7 @@ int line_write(struct tty_struct *tty, const unsigned char *buf, int len) ret = buffer_data(line, buf, len); else { n = write_chan(line->chan_out, buf, len, - line->driver->write_irq); + line->write_irq); if (n < 0) { ret = n; goto out_up; @@ -215,16 +211,11 @@ out_up: return ret; } -void line_set_termios(struct tty_struct *tty, struct ktermios * old) -{ - /* nothing */ -} - void line_throttle(struct tty_struct *tty) { struct line *line = tty->driver_data; - deactivate_chan(line->chan_in, line->driver->read_irq); + deactivate_chan(line->chan_in, line->read_irq); line->throttled = 1; } @@ -233,15 +224,7 @@ void line_unthrottle(struct tty_struct *tty) struct line *line = tty->driver_data; line->throttled = 0; - chan_interrupt(line, line->driver->read_irq); - - /* - * Maybe there is enough stuff pending that calling the interrupt - * throttles us again. In this case, line->throttled will be 1 - * again and we shouldn't turn the interrupt back on. - */ - if (!line->throttled) - reactivate_chan(line->chan_in, line->driver->read_irq); + chan_interrupt(line, line->read_irq); } static irqreturn_t line_write_interrupt(int irq, void *data) @@ -260,7 +243,7 @@ static irqreturn_t line_write_interrupt(int irq, void *data) if (err == 0) { spin_unlock(&line->lock); return IRQ_NONE; - } else if (err < 0) { + } else if ((err < 0) && (err != -EAGAIN)) { line->head = line->buffer; line->tail = line->buffer; } @@ -274,19 +257,29 @@ static irqreturn_t line_write_interrupt(int irq, void *data) int line_setup_irq(int fd, int input, int output, struct line *line, void *data) { const struct line_driver *driver = line->driver; - int err = 0; + int err; - if (input) - err = um_request_irq(driver->read_irq, fd, IRQ_READ, - line_interrupt, IRQF_SHARED, + if (input) { + err = um_request_irq(UM_IRQ_ALLOC, fd, IRQ_READ, + line_interrupt, 0, driver->read_irq_name, data); - if (err) - return err; - if (output) - err = um_request_irq(driver->write_irq, fd, IRQ_WRITE, - line_write_interrupt, IRQF_SHARED, + if (err < 0) + return err; + + line->read_irq = err; + } + + if (output) { + err = um_request_irq(UM_IRQ_ALLOC, fd, IRQ_WRITE, + line_write_interrupt, 0, driver->write_irq_name, data); - return err; + if (err < 0) + return err; + + line->write_irq = err; + } + + return 0; } static int line_activate(struct tty_port *port, struct tty_struct *tty) @@ -390,6 +383,7 @@ int setup_one_line(struct line *lines, int n, char *init, parse_chan_pair(NULL, line, n, opts, error_out); err = 0; } + *error_out = "configured as 'none'"; } else { char *new = kstrdup(init, GFP_KERNEL); if (!new) { @@ -413,6 +407,7 @@ int setup_one_line(struct line *lines, int n, char *init, } } if (err) { + *error_out = "failed to parse channel pair"; line->init_str = NULL; line->valid = 0; kfree(new); @@ -549,12 +544,14 @@ int register_lines(struct line_driver *line_driver, const struct tty_operations *ops, struct line *lines, int nlines) { - struct tty_driver *driver = alloc_tty_driver(nlines); + struct tty_driver *driver; int err; int i; - if (!driver) - return -ENOMEM; + driver = tty_alloc_driver(nlines, TTY_DRIVER_REAL_RAW | + TTY_DRIVER_DYNAMIC_DEV); + if (IS_ERR(driver)) + return PTR_ERR(driver); driver->driver_name = line_driver->name; driver->name = line_driver->device_name; @@ -562,9 +559,8 @@ int register_lines(struct line_driver *line_driver, driver->minor_start = line_driver->minor_start; driver->type = line_driver->type; driver->subtype = line_driver->subtype; - driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; driver->init_termios = tty_std_termios; - + for (i = 0; i < nlines; i++) { tty_port_init(&lines[i].port); lines[i].port.ops = &line_port_ops; @@ -578,7 +574,7 @@ int register_lines(struct line_driver *line_driver, if (err) { printk(KERN_ERR "register_lines : can't register %s driver\n", line_driver->name); - put_tty_driver(driver); + tty_driver_kref_put(driver); for (i = 0; i < nlines; i++) tty_port_destroy(&lines[i].port); return err; @@ -620,7 +616,6 @@ static void free_winch(struct winch *winch) winch->fd = -1; if (fd != -1) os_close_file(fd); - list_del(&winch->list); __free_winch(&winch->work); } @@ -632,18 +627,22 @@ static irqreturn_t winch_interrupt(int irq, void *data) int fd = winch->fd; int err; char c; + struct pid *pgrp; if (fd != -1) { err = generic_read(fd, &c, NULL); - if (err < 0) { + /* A read of 2 means the winch thread failed and has warned */ + if (err < 0 || (err == 1 && c == 2)) { if (err != -EAGAIN) { winch->fd = -1; list_del(&winch->list); os_close_file(fd); - printk(KERN_ERR "winch_interrupt : " - "read failed, errno = %d\n", -err); - printk(KERN_ERR "fd %d is losing SIGWINCH " - "support\n", winch->tty_fd); + if (err < 0) { + printk(KERN_ERR "winch_interrupt : read failed, errno = %d\n", + -err); + printk(KERN_ERR "fd %d is losing SIGWINCH support\n", + winch->tty_fd); + } INIT_WORK(&winch->work, __free_winch); schedule_work(&winch->work); return IRQ_HANDLED; @@ -657,13 +656,14 @@ static irqreturn_t winch_interrupt(int irq, void *data) if (line != NULL) { chan_window_size(line, &tty->winsize.ws_row, &tty->winsize.ws_col); - kill_pgrp(tty->pgrp, SIGWINCH, 1); + pgrp = tty_get_pgrp(tty); + if (pgrp) + kill_pgrp(pgrp, SIGWINCH, 1); + put_pid(pgrp); } tty_kref_put(tty); } out: - if (winch->fd != -1) - reactivate_fd(winch->fd, WINCH_IRQ); return IRQ_HANDLED; } @@ -678,24 +678,26 @@ void register_winch_irq(int fd, int tty_fd, int pid, struct tty_port *port, goto cleanup; } - *winch = ((struct winch) { .list = LIST_HEAD_INIT(winch->list), - .fd = fd, + *winch = ((struct winch) { .fd = fd, .tty_fd = tty_fd, .pid = pid, .port = port, .stack = stack }); + spin_lock(&winch_handler_lock); + list_add(&winch->list, &winch_handlers); + spin_unlock(&winch_handler_lock); + if (um_request_irq(WINCH_IRQ, fd, IRQ_READ, winch_interrupt, IRQF_SHARED, "winch", winch) < 0) { printk(KERN_ERR "register_winch_irq - failed to register " "IRQ\n"); + spin_lock(&winch_handler_lock); + list_del(&winch->list); + spin_unlock(&winch_handler_lock); goto out_free; } - spin_lock(&winch_handler_lock); - list_add(&winch->list, &winch_handlers); - spin_unlock(&winch_handler_lock); - return; out_free: @@ -719,6 +721,8 @@ static void unregister_winch(struct tty_struct *tty) winch = list_entry(ele, struct winch, list); wtty = tty_port_tty_get(winch->port); if (wtty == tty) { + list_del(&winch->list); + spin_unlock(&winch_handler_lock); free_winch(winch); break; } @@ -729,14 +733,17 @@ static void unregister_winch(struct tty_struct *tty) static void winch_cleanup(void) { - struct list_head *ele, *next; struct winch *winch; spin_lock(&winch_handler_lock); + while ((winch = list_first_entry_or_null(&winch_handlers, + struct winch, list))) { + list_del(&winch->list); + spin_unlock(&winch_handler_lock); - list_for_each_safe(ele, next, &winch_handlers) { - winch = list_entry(ele, struct winch, list); free_winch(winch); + + spin_lock(&winch_handler_lock); } spin_unlock(&winch_handler_lock); diff --git a/arch/um/drivers/line.h b/arch/um/drivers/line.h index 138a14526d9c..e8bd6f3dfb50 100644 --- a/arch/um/drivers/line.h +++ b/arch/um/drivers/line.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __LINE_H__ @@ -23,9 +23,7 @@ struct line_driver { const short minor_start; const short type; const short subtype; - const int read_irq; const char *read_irq_name; - const int write_irq; const char *write_irq_name; struct mc_device mc; struct tty_driver *driver; @@ -35,6 +33,8 @@ struct line { struct tty_port port; int valid; + int read_irq, write_irq; + char *init_str; struct list_head chan_list; struct chan *chan_in, *chan_out; @@ -47,9 +47,9 @@ struct line { * * buffer points to a buffer allocated on demand, of length * LINE_BUFSIZE, head to the start of the ring, tail to the end.*/ - char *buffer; - char *head; - char *tail; + u8 *buffer; + u8 *head; + u8 *tail; int sigio; struct delayed_work task; @@ -64,14 +64,11 @@ extern void line_cleanup(struct tty_struct *tty); extern void line_hangup(struct tty_struct *tty); extern int line_setup(char **conf, unsigned nlines, char **def, char *init, char *name); -extern int line_write(struct tty_struct *tty, const unsigned char *buf, - int len); -extern int line_put_char(struct tty_struct *tty, unsigned char ch); -extern void line_set_termios(struct tty_struct *tty, struct ktermios * old); -extern int line_chars_in_buffer(struct tty_struct *tty); +extern ssize_t line_write(struct tty_struct *tty, const u8 *buf, size_t len); +extern unsigned int line_chars_in_buffer(struct tty_struct *tty); extern void line_flush_buffer(struct tty_struct *tty); extern void line_flush_chars(struct tty_struct *tty); -extern int line_write_room(struct tty_struct *tty); +extern unsigned int line_write_room(struct tty_struct *tty); extern void line_throttle(struct tty_struct *tty); extern void line_unthrottle(struct tty_struct *tty); diff --git a/arch/um/drivers/mconsole.h b/arch/um/drivers/mconsole.h index 8b22535c62ce..6356378304fd 100644 --- a/arch/um/drivers/mconsole.h +++ b/arch/um/drivers/mconsole.h @@ -1,13 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __MCONSOLE_H__ #define __MCONSOLE_H__ -#ifndef __KERNEL__ +#ifdef __UM_HOST__ #include <stdint.h> #define u32 uint32_t #endif diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 3df3bd544492..ff4bda95b9c7 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) * Copyright (C) 2001 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/console.h> @@ -12,7 +12,9 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/notifier.h> +#include <linux/panic_notifier.h> #include <linux/reboot.h> +#include <linux/sched/debug.h> #include <linux/proc_fs.h> #include <linux/slab.h> #include <linux/syscalls.h> @@ -24,7 +26,7 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/switch_to.h> #include <init.h> @@ -35,6 +37,8 @@ #include "mconsole_kern.h" #include <os.h> +static struct vfsmount *proc_mnt = NULL; + static int do_unlink_socket(struct notifier_block *notifier, unsigned long what, void *data) { @@ -95,7 +99,6 @@ static irqreturn_t mconsole_interrupt(int irq, void *dev_id) } if (!list_empty(&mc_requests)) schedule_work(&mconsole_work); - reactivate_fd(fd, MCONSOLE_IRQ); return IRQ_HANDLED; } @@ -123,17 +126,22 @@ void mconsole_log(struct mc_request *req) void mconsole_proc(struct mc_request *req) { - struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; + struct vfsmount *mnt = proc_mnt; char *buf; int len; struct file *file; int first_chunk = 1; char *ptr = req->request.data; + loff_t pos = 0; ptr += strlen("proc"); ptr = skip_spaces(ptr); - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); + if (!mnt) { + mconsole_reply(req, "Proc not available", 1, 0); + goto out; + } + file = file_open_root_mnt(mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); @@ -147,12 +155,7 @@ void mconsole_proc(struct mc_request *req) } do { - loff_t pos = file->f_pos; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - len = vfs_read(file, buf, PAGE_SIZE - 1, &pos); - set_fs(old_fs); - file->f_pos = pos; + len = kernel_read(file, buf, PAGE_SIZE - 1, &pos); if (len < 0) { mconsole_reply(req, "Read of file failed", 1, 0); goto out_free; @@ -221,7 +224,7 @@ void mconsole_go(struct mc_request *req) void mconsole_stop(struct mc_request *req) { - deactivate_fd(req->originating_fd, MCONSOLE_IRQ); + block_signals(); os_set_fd_block(req->originating_fd, 1); mconsole_reply(req, "stopped", 0, 0); for (;;) { @@ -243,8 +246,8 @@ void mconsole_stop(struct mc_request *req) (*req->cmd->handler)(req); } os_set_fd_block(req->originating_fd, 0); - reactivate_fd(req->originating_fd, MCONSOLE_IRQ); mconsole_reply(req, "", 0, 0); + unblock_signals(); } static DEFINE_SPINLOCK(mc_devices_lock); @@ -280,7 +283,7 @@ struct unplugged_pages { }; static DEFINE_MUTEX(plug_mem_mutex); -static unsigned long long unplugged_pages_count = 0; +static unsigned long long unplugged_pages_count; static LIST_HEAD(unplugged_pages); static int unplug_index = UNPLUGGED_PER_PAGE; @@ -551,7 +554,7 @@ struct mconsole_output { static DEFINE_SPINLOCK(client_lock); static LIST_HEAD(clients); -static char console_buf[MCONSOLE_MAX_DATA]; +static char console_buf[MCONSOLE_MAX_DATA] __nonstring; static void console_write(struct console *console, const char *string, unsigned int len) @@ -564,7 +567,7 @@ static void console_write(struct console *console, const char *string, while (len > 0) { n = min((size_t) len, ARRAY_SIZE(console_buf)); - strncpy(console_buf, string, n); + memcpy(console_buf, string, n); string += n; len -= n; @@ -645,11 +648,9 @@ void mconsole_sysrq(struct mc_request *req) static void stack_proc(void *arg) { - struct task_struct *from = current, *to = arg; + struct task_struct *task = arg; - to->thread.saved_task = from; - rcu_user_hooks_switch(from, to); - switch_to(from, to, from); + show_stack(task, NULL, KERN_INFO); } /* @@ -690,6 +691,24 @@ void mconsole_stack(struct mc_request *req) with_console(req, stack_proc, to); } +static int __init mount_proc(void) +{ + struct file_system_type *proc_fs_type; + struct vfsmount *mnt; + + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) + return -ENODEV; + + mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + proc_mnt = mnt; + return 0; +} + /* * Changed by mconsole_setup, which is __setup, and called before SMP is * active. @@ -703,6 +722,8 @@ static int __init mconsole_init(void) int err; char file[UNIX_PATH_MAX]; + mount_proc(); + if (umid_file_name("mconsole", file, sizeof(file))) return -1; snprintf(mconsole_socket_name, sizeof(file), "%s", file); @@ -719,7 +740,7 @@ static int __init mconsole_init(void) err = um_request_irq(MCONSOLE_IRQ, sock, IRQ_READ, mconsole_interrupt, IRQF_SHARED, "mconsole", (void *)sock); - if (err) { + if (err < 0) { printk(KERN_ERR "Failed to get IRQ for management console\n"); goto out; } @@ -750,27 +771,18 @@ static ssize_t mconsole_proc_write(struct file *file, { char *buf; - buf = kmalloc(count + 1, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; - - if (copy_from_user(buf, buffer, count)) { - count = -EFAULT; - goto out; - } - - buf[count] = '\0'; + buf = memdup_user_nul(buffer, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); mconsole_notify(notify_socket, MCONSOLE_USER_NOTIFY, buf, count); - out: kfree(buf); return count; } -static const struct file_operations mconsole_proc_fops = { - .owner = THIS_MODULE, - .write = mconsole_proc_write, - .llseek = noop_llseek, +static const struct proc_ops mconsole_proc_ops = { + .proc_write = mconsole_proc_write, + .proc_lseek = noop_llseek, }; static int create_proc_mconsole(void) @@ -780,7 +792,7 @@ static int create_proc_mconsole(void) if (notify_socket == NULL) return 0; - ent = proc_create("mconsole", 0200, NULL, &mconsole_proc_fops); + ent = proc_create("mconsole", 0200, NULL, &mconsole_proc_ops); if (ent == NULL) { printk(KERN_INFO "create_proc_mconsole : proc_create failed\n"); return 0; @@ -834,13 +846,12 @@ static int notify_panic(struct notifier_block *self, unsigned long unused1, mconsole_notify(notify_socket, MCONSOLE_PANIC, message, strlen(message) + 1); - return 0; + return NOTIFY_DONE; } static struct notifier_block panic_exit_notifier = { - .notifier_call = notify_panic, - .next = NULL, - .priority = 1 + .notifier_call = notify_panic, + .priority = INT_MAX, /* run as soon as possible */ }; static int add_notifier(void) diff --git a/arch/um/drivers/mconsole_kern.h b/arch/um/drivers/mconsole_kern.h index 7a0c6a1ad1d4..56d8d6a3ff76 100644 --- a/arch/um/drivers/mconsole_kern.h +++ b/arch/um/drivers/mconsole_kern.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __MCONSOLE_KERN_H__ diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c index 99209826adb1..a04cd13c6315 100644 --- a/arch/um/drivers/mconsole_user.c +++ b/arch/um/drivers/mconsole_user.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <errno.h> @@ -71,7 +71,9 @@ static struct mconsole_command *mconsole_parse(struct mc_request *req) return NULL; } +#ifndef MIN #define MIN(a,b) ((a)<(b) ? (a):(b)) +#endif #define STRINGX(x) #x #define STRING(x) STRINGX(x) diff --git a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c deleted file mode 100644 index 62145c276167..000000000000 --- a/arch/um/drivers/mmapper_kern.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * arch/um/drivers/mmapper_kern.c - * - * BRIEF MODULE DESCRIPTION - * - * Copyright (C) 2000 RidgeRun, Inc. - * Author: RidgeRun, Inc. - * Greg Lonnon glonnon@ridgerun.com or info@ridgerun.com - * - */ - -#include <linux/stddef.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/miscdevice.h> -#include <linux/module.h> -#include <linux/mm.h> - -#include <asm/uaccess.h> -#include <mem_user.h> - -/* These are set in mmapper_init, which is called at boot time */ -static unsigned long mmapper_size; -static unsigned long p_buf; -static char *v_buf; - -static ssize_t mmapper_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - return simple_read_from_buffer(buf, count, ppos, v_buf, mmapper_size); -} - -static ssize_t mmapper_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (*ppos > mmapper_size) - return -EINVAL; - - return simple_write_to_buffer(v_buf, mmapper_size, ppos, buf, count); -} - -static long mmapper_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return -ENOIOCTLCMD; -} - -static int mmapper_mmap(struct file *file, struct vm_area_struct *vma) -{ - int ret = -EINVAL; - int size; - - if (vma->vm_pgoff != 0) - goto out; - - size = vma->vm_end - vma->vm_start; - if (size > mmapper_size) - return -EFAULT; - - /* - * XXX A comment above remap_pfn_range says it should only be - * called when the mm semaphore is held - */ - if (remap_pfn_range(vma, vma->vm_start, p_buf >> PAGE_SHIFT, size, - vma->vm_page_prot)) - goto out; - ret = 0; -out: - return ret; -} - -static int mmapper_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static int mmapper_release(struct inode *inode, struct file *file) -{ - return 0; -} - -static const struct file_operations mmapper_fops = { - .owner = THIS_MODULE, - .read = mmapper_read, - .write = mmapper_write, - .unlocked_ioctl = mmapper_ioctl, - .mmap = mmapper_mmap, - .open = mmapper_open, - .release = mmapper_release, - .llseek = default_llseek, -}; - -/* - * No locking needed - only used (and modified) by below initcall and exitcall. - */ -static struct miscdevice mmapper_dev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "mmapper", - .fops = &mmapper_fops -}; - -static int __init mmapper_init(void) -{ - int err; - - printk(KERN_INFO "Mapper v0.1\n"); - - v_buf = (char *) find_iomem("mmapper", &mmapper_size); - if (mmapper_size == 0) { - printk(KERN_ERR "mmapper_init - find_iomem failed\n"); - return -ENODEV; - } - p_buf = __pa(v_buf); - - err = misc_register(&mmapper_dev); - if (err) { - printk(KERN_ERR "mmapper - misc_register failed, err = %d\n", - err); - return err; - } - return 0; -} - -static void mmapper_exit(void) -{ - misc_deregister(&mmapper_dev); -} - -module_init(mmapper_init); -module_exit(mmapper_exit); - -MODULE_AUTHOR("Greg Lonnon <glonnon@ridgerun.com>"); -MODULE_DESCRIPTION("DSPLinux simulator mmapper driver"); -MODULE_LICENSE("GPL"); diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c deleted file mode 100644 index 39f186252e02..000000000000 --- a/arch/um/drivers/net_kern.c +++ /dev/null @@ -1,915 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include <linux/bootmem.h> -#include <linux/etherdevice.h> -#include <linux/ethtool.h> -#include <linux/inetdevice.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/platform_device.h> -#include <linux/rtnetlink.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <init.h> -#include <irq_kern.h> -#include <irq_user.h> -#include "mconsole_kern.h" -#include <net_kern.h> -#include <net_user.h> - -#define DRIVER_NAME "uml-netdev" - -static DEFINE_SPINLOCK(opened_lock); -static LIST_HEAD(opened); - -/* - * The drop_skb is used when we can't allocate an skb. The - * packet is read into drop_skb in order to get the data off the - * connection to the host. - * It is reallocated whenever a maximum packet size is seen which is - * larger than any seen before. update_drop_skb is called from - * eth_configure when a new interface is added. - */ -static DEFINE_SPINLOCK(drop_lock); -static struct sk_buff *drop_skb; -static int drop_max; - -static int update_drop_skb(int max) -{ - struct sk_buff *new; - unsigned long flags; - int err = 0; - - spin_lock_irqsave(&drop_lock, flags); - - if (max <= drop_max) - goto out; - - err = -ENOMEM; - new = dev_alloc_skb(max); - if (new == NULL) - goto out; - - skb_put(new, max); - - kfree_skb(drop_skb); - drop_skb = new; - drop_max = max; - err = 0; -out: - spin_unlock_irqrestore(&drop_lock, flags); - - return err; -} - -static int uml_net_rx(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - int pkt_len; - struct sk_buff *skb; - - /* If we can't allocate memory, try again next round. */ - skb = dev_alloc_skb(lp->max_packet); - if (skb == NULL) { - drop_skb->dev = dev; - /* Read a packet into drop_skb and don't do anything with it. */ - (*lp->read)(lp->fd, drop_skb, lp); - dev->stats.rx_dropped++; - return 0; - } - - skb->dev = dev; - skb_put(skb, lp->max_packet); - skb_reset_mac_header(skb); - pkt_len = (*lp->read)(lp->fd, skb, lp); - - if (pkt_len > 0) { - skb_trim(skb, pkt_len); - skb->protocol = (*lp->protocol)(skb); - - dev->stats.rx_bytes += skb->len; - dev->stats.rx_packets++; - netif_rx(skb); - return pkt_len; - } - - kfree_skb(skb); - return pkt_len; -} - -static void uml_dev_close(struct work_struct *work) -{ - struct uml_net_private *lp = - container_of(work, struct uml_net_private, work); - dev_close(lp->dev); -} - -static irqreturn_t uml_net_interrupt(int irq, void *dev_id) -{ - struct net_device *dev = dev_id; - struct uml_net_private *lp = netdev_priv(dev); - int err; - - if (!netif_running(dev)) - return IRQ_NONE; - - spin_lock(&lp->lock); - while ((err = uml_net_rx(dev)) > 0) ; - if (err < 0) { - printk(KERN_ERR - "Device '%s' read returned %d, shutting it down\n", - dev->name, err); - /* dev_close can't be called in interrupt context, and takes - * again lp->lock. - * And dev_close() can be safely called multiple times on the - * same device, since it tests for (dev->flags & IFF_UP). So - * there's no harm in delaying the device shutdown. - * Furthermore, the workqueue will not re-enqueue an already - * enqueued work item. */ - schedule_work(&lp->work); - goto out; - } - reactivate_fd(lp->fd, UM_ETH_IRQ); - -out: - spin_unlock(&lp->lock); - return IRQ_HANDLED; -} - -static int uml_net_open(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - int err; - - if (lp->fd >= 0) { - err = -ENXIO; - goto out; - } - - lp->fd = (*lp->open)(&lp->user); - if (lp->fd < 0) { - err = lp->fd; - goto out; - } - - err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt, - IRQF_SHARED, dev->name, dev); - if (err != 0) { - printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err); - err = -ENETUNREACH; - goto out_close; - } - - lp->tl.data = (unsigned long) &lp->user; - netif_start_queue(dev); - - /* clear buffer - it can happen that the host side of the interface - * is full when we get here. In this case, new data is never queued, - * SIGIOs never arrive, and the net never works. - */ - while ((err = uml_net_rx(dev)) > 0) ; - - spin_lock(&opened_lock); - list_add(&lp->list, &opened); - spin_unlock(&opened_lock); - - return 0; -out_close: - if (lp->close != NULL) (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; -out: - return err; -} - -static int uml_net_close(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - - netif_stop_queue(dev); - - um_free_irq(dev->irq, dev); - if (lp->close != NULL) - (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; - - spin_lock(&opened_lock); - list_del(&lp->list); - spin_unlock(&opened_lock); - - return 0; -} - -static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - unsigned long flags; - int len; - - netif_stop_queue(dev); - - spin_lock_irqsave(&lp->lock, flags); - - len = (*lp->write)(lp->fd, skb, lp); - skb_tx_timestamp(skb); - - if (len == skb->len) { - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - dev->trans_start = jiffies; - netif_start_queue(dev); - - /* this is normally done in the interrupt when tx finishes */ - netif_wake_queue(dev); - } - else if (len == 0) { - netif_start_queue(dev); - dev->stats.tx_dropped++; - } - else { - netif_start_queue(dev); - printk(KERN_ERR "uml_net_start_xmit: failed(%d)\n", len); - } - - spin_unlock_irqrestore(&lp->lock, flags); - - dev_kfree_skb(skb); - - return NETDEV_TX_OK; -} - -static void uml_net_set_multicast_list(struct net_device *dev) -{ - return; -} - -static void uml_net_tx_timeout(struct net_device *dev) -{ - dev->trans_start = jiffies; - netif_wake_queue(dev); -} - -static int uml_net_change_mtu(struct net_device *dev, int new_mtu) -{ - dev->mtu = new_mtu; - - return 0; -} - -#ifdef CONFIG_NET_POLL_CONTROLLER -static void uml_net_poll_controller(struct net_device *dev) -{ - disable_irq(dev->irq); - uml_net_interrupt(dev->irq, dev); - enable_irq(dev->irq); -} -#endif - -static void uml_net_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info) -{ - strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver)); - strlcpy(info->version, "42", sizeof(info->version)); -} - -static const struct ethtool_ops uml_net_ethtool_ops = { - .get_drvinfo = uml_net_get_drvinfo, - .get_link = ethtool_op_get_link, - .get_ts_info = ethtool_op_get_ts_info, -}; - -static void uml_net_user_timer_expire(unsigned long _conn) -{ -#ifdef undef - struct connection *conn = (struct connection *)_conn; - - dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn); - do_connect(conn); -#endif -} - -static void setup_etheraddr(struct net_device *dev, char *str) -{ - unsigned char *addr = dev->dev_addr; - char *end; - int i; - - if (str == NULL) - goto random; - - for (i = 0; i < 6; i++) { - addr[i] = simple_strtoul(str, &end, 16); - if ((end == str) || - ((*end != ':') && (*end != ',') && (*end != '\0'))) { - printk(KERN_ERR - "setup_etheraddr: failed to parse '%s' " - "as an ethernet address\n", str); - goto random; - } - str = end + 1; - } - if (is_multicast_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign a multicast ethernet address to a " - "device disallowed\n"); - goto random; - } - if (!is_valid_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign an invalid ethernet address to a " - "device disallowed\n"); - goto random; - } - if (!is_local_ether_addr(addr)) { - printk(KERN_WARNING - "Warning: Assigning a globally valid ethernet " - "address to a device\n"); - printk(KERN_WARNING "You should set the 2nd rightmost bit in " - "the first byte of the MAC,\n"); - printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", - addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], - addr[5]); - } - return; - -random: - printk(KERN_INFO - "Choosing a random ethernet address for device %s\n", dev->name); - eth_hw_addr_random(dev); -} - -static DEFINE_SPINLOCK(devices_lock); -static LIST_HEAD(devices); - -static struct platform_driver uml_net_driver = { - .driver = { - .name = DRIVER_NAME, - }, -}; - -static void net_device_release(struct device *dev) -{ - struct uml_net *device = dev_get_drvdata(dev); - struct net_device *netdev = device->dev; - struct uml_net_private *lp = netdev_priv(netdev); - - if (lp->remove != NULL) - (*lp->remove)(&lp->user); - list_del(&device->list); - kfree(device); - free_netdev(netdev); -} - -static const struct net_device_ops uml_netdev_ops = { - .ndo_open = uml_net_open, - .ndo_stop = uml_net_close, - .ndo_start_xmit = uml_net_start_xmit, - .ndo_set_rx_mode = uml_net_set_multicast_list, - .ndo_tx_timeout = uml_net_tx_timeout, - .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = uml_net_change_mtu, - .ndo_validate_addr = eth_validate_addr, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = uml_net_poll_controller, -#endif -}; - -/* - * Ensures that platform_driver_register is called only once by - * eth_configure. Will be set in an initcall. - */ -static int driver_registered; - -static void eth_configure(int n, void *init, char *mac, - struct transport *transport) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - int err, size; - - size = transport->private_size + sizeof(struct uml_net_private); - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (device == NULL) { - printk(KERN_ERR "eth_configure failed to allocate struct " - "uml_net\n"); - return; - } - - dev = alloc_etherdev(size); - if (dev == NULL) { - printk(KERN_ERR "eth_configure: failed to allocate struct " - "net_device for eth%d\n", n); - goto out_free_device; - } - - INIT_LIST_HEAD(&device->list); - device->index = n; - - /* If this name ends up conflicting with an existing registered - * netdevice, that is OK, register_netdev{,ice}() will notice this - * and fail. - */ - snprintf(dev->name, sizeof(dev->name), "eth%d", n); - - setup_etheraddr(dev, mac); - - printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr); - - lp = netdev_priv(dev); - /* This points to the transport private data. It's still clear, but we - * must memset it to 0 *now*. Let's help the drivers. */ - memset(lp, 0, size); - INIT_WORK(&lp->work, uml_dev_close); - - /* sysfs register */ - if (!driver_registered) { - platform_driver_register(¨_net_driver); - driver_registered = 1; - } - device->pdev.id = n; - device->pdev.name = DRIVER_NAME; - device->pdev.dev.release = net_device_release; - dev_set_drvdata(&device->pdev.dev, device); - if (platform_device_register(&device->pdev)) - goto out_free_netdev; - SET_NETDEV_DEV(dev,&device->pdev.dev); - - device->dev = dev; - - /* - * These just fill in a data structure, so there's no failure - * to be worried about. - */ - (*transport->kern->init)(dev, init); - - *lp = ((struct uml_net_private) - { .list = LIST_HEAD_INIT(lp->list), - .dev = dev, - .fd = -1, - .mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0}, - .max_packet = transport->user->max_packet, - .protocol = transport->kern->protocol, - .open = transport->user->open, - .close = transport->user->close, - .remove = transport->user->remove, - .read = transport->kern->read, - .write = transport->kern->write, - .add_address = transport->user->add_address, - .delete_address = transport->user->delete_address }); - - init_timer(&lp->tl); - spin_lock_init(&lp->lock); - lp->tl.function = uml_net_user_timer_expire; - memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac)); - - if ((transport->user->init != NULL) && - ((*transport->user->init)(&lp->user, dev) != 0)) - goto out_unregister; - - dev->mtu = transport->user->mtu; - dev->netdev_ops = ¨_netdev_ops; - dev->ethtool_ops = ¨_net_ethtool_ops; - dev->watchdog_timeo = (HZ >> 1); - dev->irq = UM_ETH_IRQ; - - err = update_drop_skb(lp->max_packet); - if (err) - goto out_undo_user_init; - - rtnl_lock(); - err = register_netdevice(dev); - rtnl_unlock(); - if (err) - goto out_undo_user_init; - - spin_lock(&devices_lock); - list_add(&device->list, &devices); - spin_unlock(&devices_lock); - - return; - -out_undo_user_init: - if (transport->user->remove != NULL) - (*transport->user->remove)(&lp->user); -out_unregister: - platform_device_unregister(&device->pdev); - return; /* platform_device_unregister frees dev and device */ -out_free_netdev: - free_netdev(dev); -out_free_device: - kfree(device); -} - -static struct uml_net *find_device(int n) -{ - struct uml_net *device; - struct list_head *ele; - - spin_lock(&devices_lock); - list_for_each(ele, &devices) { - device = list_entry(ele, struct uml_net, list); - if (device->index == n) - goto out; - } - device = NULL; - out: - spin_unlock(&devices_lock); - return device; -} - -static int eth_parse(char *str, int *index_out, char **str_out, - char **error_out) -{ - char *end; - int n, err = -EINVAL; - - n = simple_strtoul(str, &end, 0); - if (end == str) { - *error_out = "Bad device number"; - return err; - } - - str = end; - if (*str != '=') { - *error_out = "Expected '=' after device number"; - return err; - } - - str++; - if (find_device(n)) { - *error_out = "Device already configured"; - return err; - } - - *index_out = n; - *str_out = str; - return 0; -} - -struct eth_init { - struct list_head list; - char *init; - int index; -}; - -static DEFINE_SPINLOCK(transports_lock); -static LIST_HEAD(transports); - -/* Filled in during early boot */ -static LIST_HEAD(eth_cmd_line); - -static int check_transport(struct transport *transport, char *eth, int n, - void **init_out, char **mac_out) -{ - int len; - - len = strlen(transport->name); - if (strncmp(eth, transport->name, len)) - return 0; - - eth += len; - if (*eth == ',') - eth++; - else if (*eth != '\0') - return 0; - - *init_out = kmalloc(transport->setup_size, GFP_KERNEL); - if (*init_out == NULL) - return 1; - - if (!transport->setup(eth, mac_out, *init_out)) { - kfree(*init_out); - *init_out = NULL; - } - return 1; -} - -void register_transport(struct transport *new) -{ - struct list_head *ele, *next; - struct eth_init *eth; - void *init; - char *mac = NULL; - int match; - - spin_lock(&transports_lock); - BUG_ON(!list_empty(&new->list)); - list_add(&new->list, &transports); - spin_unlock(&transports_lock); - - list_for_each_safe(ele, next, ð_cmd_line) { - eth = list_entry(ele, struct eth_init, list); - match = check_transport(new, eth->init, eth->index, &init, - &mac); - if (!match) - continue; - else if (init != NULL) { - eth_configure(eth->index, init, mac, new); - kfree(init); - } - list_del(ð->list); - } -} - -static int eth_setup_common(char *str, int index) -{ - struct list_head *ele; - struct transport *transport; - void *init; - char *mac = NULL; - int found = 0; - - spin_lock(&transports_lock); - list_for_each(ele, &transports) { - transport = list_entry(ele, struct transport, list); - if (!check_transport(transport, str, index, &init, &mac)) - continue; - if (init != NULL) { - eth_configure(index, init, mac, transport); - kfree(init); - } - found = 1; - break; - } - - spin_unlock(&transports_lock); - return found; -} - -static int __init eth_setup(char *str) -{ - struct eth_init *new; - char *error; - int n, err; - - err = eth_parse(str, &n, &str, &error); - if (err) { - printk(KERN_ERR "eth_setup - Couldn't parse '%s' : %s\n", - str, error); - return 1; - } - - new = alloc_bootmem(sizeof(*new)); - if (new == NULL) { - printk(KERN_ERR "eth_init : alloc_bootmem failed\n"); - return 1; - } - - INIT_LIST_HEAD(&new->list); - new->index = n; - new->init = str; - - list_add_tail(&new->list, ð_cmd_line); - return 1; -} - -__setup("eth", eth_setup); -__uml_help(eth_setup, -"eth[0-9]+=<transport>,<options>\n" -" Configure a network device.\n\n" -); - -static int net_config(char *str, char **error_out) -{ - int n, err; - - err = eth_parse(str, &n, &str, error_out); - if (err) - return err; - - /* This string is broken up and the pieces used by the underlying - * driver. So, it is freed only if eth_setup_common fails. - */ - str = kstrdup(str, GFP_KERNEL); - if (str == NULL) { - *error_out = "net_config failed to strdup string"; - return -ENOMEM; - } - err = !eth_setup_common(str, n); - if (err) - kfree(str); - return err; -} - -static int net_id(char **str, int *start_out, int *end_out) -{ - char *end; - int n; - - n = simple_strtoul(*str, &end, 0); - if ((*end != '\0') || (end == *str)) - return -1; - - *start_out = n; - *end_out = n; - *str = end; - return n; -} - -static int net_remove(int n, char **error_out) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - - device = find_device(n); - if (device == NULL) - return -ENODEV; - - dev = device->dev; - lp = netdev_priv(dev); - if (lp->fd > 0) - return -EBUSY; - unregister_netdev(dev); - platform_device_unregister(&device->pdev); - - return 0; -} - -static struct mc_device net_mc = { - .list = LIST_HEAD_INIT(net_mc.list), - .name = "eth", - .config = net_config, - .get_config = NULL, - .id = net_id, - .remove = net_remove, -}; - -#ifdef CONFIG_INET -static int uml_inetaddr_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct in_ifaddr *ifa = ptr; - struct net_device *dev = ifa->ifa_dev->dev; - struct uml_net_private *lp; - void (*proc)(unsigned char *, unsigned char *, void *); - unsigned char addr_buf[4], netmask_buf[4]; - - if (dev->netdev_ops->ndo_open != uml_net_open) - return NOTIFY_DONE; - - lp = netdev_priv(dev); - - proc = NULL; - switch (event) { - case NETDEV_UP: - proc = lp->add_address; - break; - case NETDEV_DOWN: - proc = lp->delete_address; - break; - } - if (proc != NULL) { - memcpy(addr_buf, &ifa->ifa_address, sizeof(addr_buf)); - memcpy(netmask_buf, &ifa->ifa_mask, sizeof(netmask_buf)); - (*proc)(addr_buf, netmask_buf, &lp->user); - } - return NOTIFY_DONE; -} - -/* uml_net_init shouldn't be called twice on two CPUs at the same time */ -static struct notifier_block uml_inetaddr_notifier = { - .notifier_call = uml_inetaddr_event, -}; - -static void inet_register(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - struct in_device *ip; - struct in_ifaddr *in; - - register_inetaddr_notifier(¨_inetaddr_notifier); - - /* Devices may have been opened already, so the uml_inetaddr_notifier - * didn't get a chance to run for them. This fakes it so that - * addresses which have already been set up get handled properly. - */ - spin_lock(&opened_lock); - list_for_each(ele, &opened) { - lp = list_entry(ele, struct uml_net_private, list); - ip = lp->dev->ip_ptr; - if (ip == NULL) - continue; - in = ip->ifa_list; - while (in != NULL) { - uml_inetaddr_event(NULL, NETDEV_UP, in); - in = in->ifa_next; - } - } - spin_unlock(&opened_lock); -} -#else -static inline void inet_register(void) -{ -} -#endif - -static int uml_net_init(void) -{ - mconsole_register_dev(&net_mc); - inet_register(); - return 0; -} - -__initcall(uml_net_init); - -static void close_devices(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - - spin_lock(&opened_lock); - list_for_each(ele, &opened) { - lp = list_entry(ele, struct uml_net_private, list); - um_free_irq(lp->dev->irq, lp->dev); - if ((lp->close != NULL) && (lp->fd >= 0)) - (*lp->close)(lp->fd, &lp->user); - if (lp->remove != NULL) - (*lp->remove)(&lp->user); - } - spin_unlock(&opened_lock); -} - -__uml_exitcall(close_devices); - -void iter_addresses(void *d, void (*cb)(unsigned char *, unsigned char *, - void *), - void *arg) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - unsigned char address[4], netmask[4]; - - if (ip == NULL) return; - in = ip->ifa_list; - while (in != NULL) { - memcpy(address, &in->ifa_address, sizeof(address)); - memcpy(netmask, &in->ifa_mask, sizeof(netmask)); - (*cb)(address, netmask, arg); - in = in->ifa_next; - } -} - -int dev_netmask(void *d, void *m) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - __be32 *mask_out = m; - - if (ip == NULL) - return 1; - - in = ip->ifa_list; - if (in == NULL) - return 1; - - *mask_out = in->ifa_mask; - return 0; -} - -void *get_output_buffer(int *len_out) -{ - void *ret; - - ret = (void *) __get_free_pages(GFP_KERNEL, 0); - if (ret) *len_out = PAGE_SIZE; - else *len_out = 0; - return ret; -} - -void free_output_buffer(void *buffer) -{ - free_pages((unsigned long) buffer, 0); -} - -int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out, - char **gate_addr) -{ - char *remain; - - remain = split_if_spec(str, dev_name, mac_out, gate_addr, NULL); - if (remain != NULL) { - printk(KERN_ERR "tap_setup_common - Extra garbage on " - "specification : '%s'\n", remain); - return 1; - } - - return 0; -} - -unsigned short eth_protocol(struct sk_buff *skb) -{ - return eth_type_trans(skb, skb->dev); -} diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c deleted file mode 100644 index cd14157b556d..000000000000 --- a/arch/um/drivers/net_user.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <stdio.h> -#include <unistd.h> -#include <stdarg.h> -#include <errno.h> -#include <stddef.h> -#include <string.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include <net_user.h> -#include <os.h> -#include <um_malloc.h> - -int tap_open_common(void *dev, char *gate_addr) -{ - int tap_addr[4]; - - if (gate_addr == NULL) - return 0; - if (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], - &tap_addr[1], &tap_addr[2], &tap_addr[3]) != 4) { - printk(UM_KERN_ERR "Invalid tap IP address - '%s'\n", - gate_addr); - return -EINVAL; - } - return 0; -} - -void tap_check_ips(char *gate_addr, unsigned char *eth_addr) -{ - int tap_addr[4]; - - if ((gate_addr != NULL) && - (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], - &tap_addr[1], &tap_addr[2], &tap_addr[3]) == 4) && - (eth_addr[0] == tap_addr[0]) && - (eth_addr[1] == tap_addr[1]) && - (eth_addr[2] == tap_addr[2]) && - (eth_addr[3] == tap_addr[3])) { - printk(UM_KERN_ERR "The tap IP address and the UML eth IP " - "address must be different\n"); - } -} - -/* Do reliable error handling as this fails frequently enough. */ -void read_output(int fd, char *output, int len) -{ - int remain, ret, expected; - char c; - char *str; - - if (output == NULL) { - output = &c; - len = sizeof(c); - } - - *output = '\0'; - ret = read(fd, &remain, sizeof(remain)); - - if (ret != sizeof(remain)) { - if (ret < 0) - ret = -errno; - expected = sizeof(remain); - str = "length"; - goto err; - } - - while (remain != 0) { - expected = (remain < len) ? remain : len; - ret = read(fd, output, expected); - if (ret != expected) { - if (ret < 0) - ret = -errno; - str = "data"; - goto err; - } - remain -= ret; - } - - return; - -err: - if (ret < 0) - printk(UM_KERN_ERR "read_output - read of %s failed, " - "errno = %d\n", str, -ret); - else - printk(UM_KERN_ERR "read_output - read of %s failed, read only " - "%d of %d bytes\n", str, ret, expected); -} - -int net_read(int fd, void *buf, int len) -{ - int n; - - n = read(fd, buf, len); - - if ((n < 0) && (errno == EAGAIN)) - return 0; - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_recvfrom(int fd, void *buf, int len) -{ - int n; - - CATCH_EINTR(n = recvfrom(fd, buf, len, 0, NULL, NULL)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_write(int fd, void *buf, int len) -{ - int n; - - n = write(fd, buf, len); - - if ((n < 0) && (errno == EAGAIN)) - return 0; - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_send(int fd, void *buf, int len) -{ - int n; - - CATCH_EINTR(n = send(fd, buf, len, 0)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_sendto(int fd, void *buf, int len, void *to, int sock_len) -{ - int n; - - CATCH_EINTR(n = sendto(fd, buf, len, 0, (struct sockaddr *) to, - sock_len)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -struct change_pre_exec_data { - int close_me; - int stdout; -}; - -static void change_pre_exec(void *arg) -{ - struct change_pre_exec_data *data = arg; - - close(data->close_me); - dup2(data->stdout, 1); -} - -static int change_tramp(char **argv, char *output, int output_len) -{ - int pid, fds[2], err; - struct change_pre_exec_data pe_data; - - err = os_pipe(fds, 1, 0); - if (err < 0) { - printk(UM_KERN_ERR "change_tramp - pipe failed, err = %d\n", - -err); - return err; - } - pe_data.close_me = fds[0]; - pe_data.stdout = fds[1]; - pid = run_helper(change_pre_exec, &pe_data, argv); - - if (pid > 0) /* Avoid hang as we won't get data in failure case. */ - read_output(fds[0], output, output_len); - - close(fds[0]); - close(fds[1]); - - if (pid > 0) - helper_wait(pid); - return pid; -} - -static void change(char *dev, char *what, unsigned char *addr, - unsigned char *netmask) -{ - char addr_buf[sizeof("255.255.255.255\0")]; - char netmask_buf[sizeof("255.255.255.255\0")]; - char version[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version, what, dev, addr_buf, - netmask_buf, NULL }; - char *output; - int output_len, pid; - - sprintf(version, "%d", UML_NET_VERSION); - sprintf(addr_buf, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]); - sprintf(netmask_buf, "%d.%d.%d.%d", netmask[0], netmask[1], - netmask[2], netmask[3]); - - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - if (output == NULL) - printk(UM_KERN_ERR "change : failed to allocate output " - "buffer\n"); - - pid = change_tramp(argv, output, output_len); - if (pid < 0) { - kfree(output); - return; - } - - if (output != NULL) { - printk("%s", output); - kfree(output); - } -} - -void open_addr(unsigned char *addr, unsigned char *netmask, void *arg) -{ - change(arg, "add", addr, netmask); -} - -void close_addr(unsigned char *addr, unsigned char *netmask, void *arg) -{ - change(arg, "del", addr, netmask); -} - -char *split_if_spec(char *str, ...) -{ - char **arg, *end; - va_list ap; - - va_start(ap, str); - while ((arg = va_arg(ap, char **)) != NULL) { - if (*str == '\0') - return NULL; - end = strchr(str, ','); - if (end != str) - *arg = str; - if (end == NULL) - return NULL; - *end++ = '\0'; - str = end; - } - va_end(ap); - return str; -} diff --git a/arch/um/drivers/null.c b/arch/um/drivers/null.c index 10495747ce8e..30d59b8481b4 100644 --- a/arch/um/drivers/null.c +++ b/arch/um/drivers/null.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <stddef.h> @@ -28,7 +28,7 @@ static int null_open(int input, int output, int primary, void *d, return (fd < 0) ? -errno : fd; } -static int null_read(int fd, char *c_out, void *unused) +static int null_read(int fd, __u8 *c_out, void *unused) { return -ENODEV; } diff --git a/arch/um/drivers/pcap_kern.c b/arch/um/drivers/pcap_kern.c deleted file mode 100644 index be0fb57bd1d7..000000000000 --- a/arch/um/drivers/pcap_kern.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL. - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include "pcap_user.h" - -struct pcap_init { - char *host_if; - int promisc; - int optimize; - char *filter; -}; - -void pcap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct pcap_data *ppri; - struct pcap_init *init = data; - - pri = netdev_priv(dev); - ppri = (struct pcap_data *) pri->user; - ppri->host_if = init->host_if; - ppri->promisc = init->promisc; - ppri->optimize = init->optimize; - ppri->filter = init->filter; - - printk("pcap backend, host interface %s\n", ppri->host_if); -} - -static int pcap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return pcap_user_read(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER, - (struct pcap_data *) &lp->user); -} - -static int pcap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return -EPERM; -} - -static const struct net_kern_info pcap_kern_info = { - .init = pcap_init, - .protocol = eth_protocol, - .read = pcap_read, - .write = pcap_write, -}; - -int pcap_setup(char *str, char **mac_out, void *data) -{ - struct pcap_init *init = data; - char *remain, *host_if = NULL, *options[2] = { NULL, NULL }; - int i; - - *init = ((struct pcap_init) - { .host_if = "eth0", - .promisc = 1, - .optimize = 0, - .filter = NULL }); - - remain = split_if_spec(str, &host_if, &init->filter, - &options[0], &options[1], mac_out, NULL); - if (remain != NULL) { - printk(KERN_ERR "pcap_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (host_if != NULL) - init->host_if = host_if; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (options[i] == NULL) - continue; - if (!strcmp(options[i], "promisc")) - init->promisc = 1; - else if (!strcmp(options[i], "nopromisc")) - init->promisc = 0; - else if (!strcmp(options[i], "optimize")) - init->optimize = 1; - else if (!strcmp(options[i], "nooptimize")) - init->optimize = 0; - else { - printk(KERN_ERR "pcap_setup : bad option - '%s'\n", - options[i]); - return 0; - } - } - - return 1; -} - -static struct transport pcap_transport = { - .list = LIST_HEAD_INIT(pcap_transport.list), - .name = "pcap", - .setup = pcap_setup, - .user = &pcap_user_info, - .kern = &pcap_kern_info, - .private_size = sizeof(struct pcap_data), - .setup_size = sizeof(struct pcap_init), -}; - -static int register_pcap(void) -{ - register_transport(&pcap_transport); - return 0; -} - -late_initcall(register_pcap); diff --git a/arch/um/drivers/pcap_user.c b/arch/um/drivers/pcap_user.c deleted file mode 100644 index c07b9c752c86..000000000000 --- a/arch/um/drivers/pcap_user.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL. - */ - -#include <errno.h> -#include <pcap.h> -#include <string.h> -#include <asm/types.h> -#include <net_user.h> -#include "pcap_user.h" -#include <um_malloc.h> - -#define PCAP_FD(p) (*(int *)(p)) - -static int pcap_user_init(void *data, void *dev) -{ - struct pcap_data *pri = data; - pcap_t *p; - char errors[PCAP_ERRBUF_SIZE]; - - p = pcap_open_live(pri->host_if, ETH_MAX_PACKET + ETH_HEADER_OTHER, - pri->promisc, 0, errors); - if (p == NULL) { - printk(UM_KERN_ERR "pcap_user_init : pcap_open_live failed - " - "'%s'\n", errors); - return -EINVAL; - } - - pri->dev = dev; - pri->pcap = p; - return 0; -} - -static int pcap_open(void *data) -{ - struct pcap_data *pri = data; - __u32 netmask; - int err; - - if (pri->pcap == NULL) - return -ENODEV; - - if (pri->filter != NULL) { - err = dev_netmask(pri->dev, &netmask); - if (err < 0) { - printk(UM_KERN_ERR "pcap_open : dev_netmask failed\n"); - return -EIO; - } - - pri->compiled = uml_kmalloc(sizeof(struct bpf_program), - UM_GFP_KERNEL); - if (pri->compiled == NULL) { - printk(UM_KERN_ERR "pcap_open : kmalloc failed\n"); - return -ENOMEM; - } - - err = pcap_compile(pri->pcap, - (struct bpf_program *) pri->compiled, - pri->filter, pri->optimize, netmask); - if (err < 0) { - printk(UM_KERN_ERR "pcap_open : pcap_compile failed - " - "'%s'\n", pcap_geterr(pri->pcap)); - goto out; - } - - err = pcap_setfilter(pri->pcap, pri->compiled); - if (err < 0) { - printk(UM_KERN_ERR "pcap_open : pcap_setfilter " - "failed - '%s'\n", pcap_geterr(pri->pcap)); - goto out; - } - } - - return PCAP_FD(pri->pcap); - - out: - kfree(pri->compiled); - return -EIO; -} - -static void pcap_remove(void *data) -{ - struct pcap_data *pri = data; - - if (pri->compiled != NULL) - pcap_freecode(pri->compiled); - - if (pri->pcap != NULL) - pcap_close(pri->pcap); -} - -struct pcap_handler_data { - char *buffer; - int len; -}; - -static void handler(u_char *data, const struct pcap_pkthdr *header, - const u_char *packet) -{ - int len; - - struct pcap_handler_data *hdata = (struct pcap_handler_data *) data; - - len = hdata->len < header->caplen ? hdata->len : header->caplen; - memcpy(hdata->buffer, packet, len); - hdata->len = len; -} - -int pcap_user_read(int fd, void *buffer, int len, struct pcap_data *pri) -{ - struct pcap_handler_data hdata = ((struct pcap_handler_data) - { .buffer = buffer, - .len = len }); - int n; - - n = pcap_dispatch(pri->pcap, 1, handler, (u_char *) &hdata); - if (n < 0) { - printk(UM_KERN_ERR "pcap_dispatch failed - %s\n", - pcap_geterr(pri->pcap)); - return -EIO; - } - else if (n == 0) - return 0; - return hdata.len; -} - -const struct net_user_info pcap_user_info = { - .init = pcap_user_init, - .open = pcap_open, - .close = NULL, - .remove = pcap_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/pcap_user.h b/arch/um/drivers/pcap_user.h deleted file mode 100644 index 1ca7c764cc63..000000000000 --- a/arch/um/drivers/pcap_user.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#include <net_user.h> - -struct pcap_data { - char *host_if; - int promisc; - int optimize; - char *filter; - void *compiled; - void *pcap; - void *dev; -}; - -extern const struct net_user_info pcap_user_info; - -extern int pcap_user_read(int fd, void *buf, int len, struct pcap_data *pri); - diff --git a/arch/um/drivers/port.h b/arch/um/drivers/port.h index 372a80c0556a..9085b336e683 100644 --- a/arch/um/drivers/port.h +++ b/arch/um/drivers/port.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __PORT_H__ diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c index 40ca5cc275e9..a4508470df78 100644 --- a/arch/um/drivers/port_kern.c +++ b/arch/um/drivers/port_kern.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <linux/completion.h> @@ -45,15 +45,17 @@ struct connection { static irqreturn_t pipe_interrupt(int irq, void *data) { struct connection *conn = data; - int fd; + int n_fds = 1, fd = -1; + ssize_t ret; - fd = os_rcv_fd(conn->socket[0], &conn->helper_pid); - if (fd < 0) { - if (fd == -EAGAIN) + ret = os_rcv_fd_msg(conn->socket[0], &fd, n_fds, &conn->helper_pid, + sizeof(conn->helper_pid)); + if (ret != sizeof(conn->helper_pid)) { + if (ret == -EAGAIN) return IRQ_NONE; - printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n", - -fd); + printk(KERN_ERR "pipe_interrupt : os_rcv_fd_msg returned %zd\n", + ret); os_close_file(conn->fd); } @@ -100,7 +102,7 @@ static int port_accept(struct port_list *port) .port = port }); if (um_request_irq(TELNETD_IRQ, socket[0], IRQ_READ, pipe_interrupt, - IRQF_SHARED, "telnetd", conn)) { + IRQF_SHARED, "telnetd", conn) < 0) { printk(KERN_ERR "port_accept : failed to get IRQ for " "telnetd\n"); goto out_free; @@ -137,7 +139,6 @@ static void port_work_proc(struct work_struct *unused) if (!port->has_connection) continue; - reactivate_fd(port->fd, ACCEPT_IRQ); while (port_accept(port)) ; port->has_connection = 0; @@ -145,7 +146,7 @@ static void port_work_proc(struct work_struct *unused) local_irq_restore(flags); } -DECLARE_WORK(port_work, port_work_proc); +static DECLARE_WORK(port_work, port_work_proc); static irqreturn_t port_interrupt(int irq, void *data) { @@ -183,7 +184,7 @@ void *port_data(int port_num) } if (um_request_irq(ACCEPT_IRQ, fd, IRQ_READ, port_interrupt, - IRQF_SHARED, "port", port)) { + IRQF_SHARED, "port", port) < 0) { printk(KERN_ERR "Failed to get IRQ for port %d\n", port_num); goto out_close; } diff --git a/arch/um/drivers/port_user.c b/arch/um/drivers/port_user.c index 9a8e1b64c22e..3c62ae81df62 100644 --- a/arch/um/drivers/port_user.c +++ b/arch/um/drivers/port_user.c @@ -1,10 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <stdio.h> #include <stdlib.h> +#include <string.h> #include <errno.h> #include <termios.h> #include <unistd.h> @@ -167,14 +168,29 @@ static void port_pre_exec(void *arg) int port_connection(int fd, int *socket, int *pid_out) { int new, err; - char *argv[] = { "/usr/sbin/in.telnetd", "-L", - "/usr/lib/uml/port-helper", NULL }; + char *env; + char *argv[] = { "in.telnetd", "-L", + OS_LIB_PATH "/uml/port-helper", NULL }; struct port_pre_exec_data data; + if ((env = getenv("UML_PORT_HELPER"))) + argv[2] = env; + new = accept(fd, NULL, 0); if (new < 0) return -errno; + err = os_access(argv[2], X_OK); + if (err < 0) { + printk(UM_KERN_ERR "port_connection : error accessing port-helper " + "executable at %s: %s\n", argv[2], strerror(-err)); + if (env == NULL) + printk(UM_KERN_ERR "Set UML_PORT_HELPER environment " + "variable to path to uml-utilities port-helper " + "binary\n"); + goto out_close; + } + err = os_pipe(socket, 0, 0); if (err < 0) goto out_close; diff --git a/arch/um/drivers/pty.c b/arch/um/drivers/pty.c index f1fcc2cedb5e..39c60068cfdf 100644 --- a/arch/um/drivers/pty.c +++ b/arch/um/drivers/pty.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 9e3a72205827..ca08c91f47a3 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -6,113 +6,58 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. */ -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/interrupt.h> #include <linux/miscdevice.h> +#include <linux/hw_random.h> #include <linux/delay.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> +#include <init.h> #include <irq_kern.h> #include <os.h> /* - * core module and version information + * core module information */ -#define RNG_VERSION "1.0.0" #define RNG_MODULE_NAME "hw_random" -#define RNG_MISCDEV_MINOR 183 /* official */ - /* Changed at init time, in the non-modular case, and at module load * time, in the module case. Presumably, the module subsystem * protects against a module being loaded twice at the same time. */ static int random_fd = -1; -static DECLARE_WAIT_QUEUE_HEAD(host_read_wait); - -static int rng_dev_open (struct inode *inode, struct file *filp) -{ - /* enforce read-only access to this chrdev */ - if ((filp->f_mode & FMODE_READ) == 0) - return -EINVAL; - if ((filp->f_mode & FMODE_WRITE) != 0) - return -EINVAL; - - return 0; -} +static struct hwrng hwrng; +static DECLARE_COMPLETION(have_data); -static atomic_t host_sleep_count = ATOMIC_INIT(0); - -static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, - loff_t *offp) +static int rng_dev_read(struct hwrng *rng, void *buf, size_t max, bool block) { - u32 data; - int n, ret = 0, have_data; - - while (size) { - n = os_read_file(random_fd, &data, sizeof(data)); - if (n > 0) { - have_data = n; - while (have_data && size) { - if (put_user((u8) data, buf++)) { - ret = ret ? : -EFAULT; - break; - } - size--; - ret++; - have_data--; - data >>= 8; - } - } - else if (n == -EAGAIN) { - DECLARE_WAITQUEUE(wait, current); - - if (filp->f_flags & O_NONBLOCK) - return ret ? : -EAGAIN; + int ret; - atomic_inc(&host_sleep_count); - reactivate_fd(random_fd, RANDOM_IRQ); + for (;;) { + ret = os_read_file(random_fd, buf, max); + if (block && ret == -EAGAIN) { add_sigio_fd(random_fd); - add_wait_queue(&host_read_wait, &wait); - set_task_state(current, TASK_INTERRUPTIBLE); + ret = wait_for_completion_killable(&have_data); - schedule(); - set_task_state(current, TASK_RUNNING); - remove_wait_queue(&host_read_wait, &wait); + ignore_sigio_fd(random_fd); + deactivate_fd(random_fd, RANDOM_IRQ); - if (atomic_dec_and_test(&host_sleep_count)) { - ignore_sigio_fd(random_fd); - deactivate_fd(random_fd, RANDOM_IRQ); - } + if (ret < 0) + break; + } else { + break; } - else - return n; - - if (signal_pending (current)) - return ret ? : -ERESTARTSYS; } - return ret; -} -static const struct file_operations rng_chrdev_ops = { - .owner = THIS_MODULE, - .open = rng_dev_open, - .read = rng_dev_read, - .llseek = noop_llseek, -}; - -/* rng_init shouldn't be called more than once at boot time */ -static struct miscdevice rng_miscdev = { - RNG_MISCDEV_MINOR, - RNG_MODULE_NAME, - &rng_chrdev_ops, -}; + return ret != -EAGAIN ? ret : 0; +} static irqreturn_t random_interrupt(int irq, void *data) { - wake_up(&host_read_wait); + complete(&have_data); return IRQ_HANDLED; } @@ -129,18 +74,18 @@ static int __init rng_init (void) goto out; random_fd = err; - err = um_request_irq(RANDOM_IRQ, random_fd, IRQ_READ, random_interrupt, 0, "random", NULL); - if (err) + if (err < 0) goto err_out_cleanup_hw; - sigio_broken(random_fd, 1); + sigio_broken(); + hwrng.name = RNG_MODULE_NAME; + hwrng.read = rng_dev_read; - err = misc_register (&rng_miscdev); + err = hwrng_register(&hwrng); if (err) { - printk (KERN_ERR RNG_MODULE_NAME ": misc device register " - "failed\n"); + pr_err(RNG_MODULE_NAME " registering failed (%d)\n", err); goto err_out_cleanup_hw; } out: @@ -155,14 +100,22 @@ err_out_cleanup_hw: /* * rng_cleanup - shutdown RNG module */ -static void __exit rng_cleanup (void) + +static void cleanup(void) +{ + free_irq_by_fd(random_fd); + os_close_file(random_fd); +} + +static void __exit rng_cleanup(void) { + hwrng_unregister(&hwrng); os_close_file(random_fd); - misc_deregister (&rng_miscdev); } module_init (rng_init); module_exit (rng_cleanup); +__uml_exitcall(cleanup); MODULE_DESCRIPTION("UML Host Random Number Generator (RNG) driver"); MODULE_LICENSE("GPL"); diff --git a/arch/um/drivers/rtc.h b/arch/um/drivers/rtc.h new file mode 100644 index 000000000000..95e41c7d35c4 --- /dev/null +++ b/arch/um/drivers/rtc.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ +#ifndef __UM_RTC_H__ +#define __UM_RTC_H__ + +int uml_rtc_start(bool timetravel); +int uml_rtc_enable_alarm(unsigned long long delta_seconds); +void uml_rtc_disable_alarm(void); +void uml_rtc_stop(bool timetravel); +void uml_rtc_send_timetravel_alarm(void); + +#endif /* __UM_RTC_H__ */ diff --git a/arch/um/drivers/rtc_kern.c b/arch/um/drivers/rtc_kern.c new file mode 100644 index 000000000000..9158c936c128 --- /dev/null +++ b/arch/um/drivers/rtc_kern.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ +#include <linux/platform_device.h> +#include <linux/time-internal.h> +#include <linux/suspend.h> +#include <linux/err.h> +#include <linux/rtc.h> +#include <kern_util.h> +#include <irq_kern.h> +#include <os.h> +#include "rtc.h" + +static time64_t uml_rtc_alarm_time; +static bool uml_rtc_alarm_enabled; +static struct rtc_device *uml_rtc; +static int uml_rtc_irq_fd, uml_rtc_irq; + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + +static void uml_rtc_time_travel_alarm(struct time_travel_event *ev) +{ + uml_rtc_send_timetravel_alarm(); +} + +static struct time_travel_event uml_rtc_alarm_event = { + .fn = uml_rtc_time_travel_alarm, +}; +#endif + +static int uml_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct timespec64 ts; + + /* Use this to get correct time in time-travel mode */ + read_persistent_clock64(&ts); + rtc_time64_to_tm(timespec64_to_ktime(ts) / NSEC_PER_SEC, tm); + + return 0; +} + +static int uml_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + rtc_time64_to_tm(uml_rtc_alarm_time, &alrm->time); + alrm->enabled = uml_rtc_alarm_enabled; + + return 0; +} + +static int uml_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) +{ + struct timespec64 ts; + unsigned long long secs; + + if (!enable && !uml_rtc_alarm_enabled) + return 0; + + uml_rtc_alarm_enabled = enable; + + read_persistent_clock64(&ts); + secs = uml_rtc_alarm_time - ts.tv_sec; + + if (time_travel_mode == TT_MODE_OFF) { + if (!enable) { + uml_rtc_disable_alarm(); + return 0; + } + + /* enable or update */ + return uml_rtc_enable_alarm(secs); + } else { + time_travel_del_event(¨_rtc_alarm_event); + + if (enable) + time_travel_add_event_rel(¨_rtc_alarm_event, + secs * NSEC_PER_SEC - + ts.tv_nsec); + } + + return 0; +} + +static int uml_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + uml_rtc_alarm_irq_enable(dev, 0); + uml_rtc_alarm_time = rtc_tm_to_time64(&alrm->time); + uml_rtc_alarm_irq_enable(dev, alrm->enabled); + + return 0; +} + +static const struct rtc_class_ops uml_rtc_ops = { + .read_time = uml_rtc_read_time, + .read_alarm = uml_rtc_read_alarm, + .alarm_irq_enable = uml_rtc_alarm_irq_enable, + .set_alarm = uml_rtc_set_alarm, +}; + +static irqreturn_t uml_rtc_interrupt(int irq, void *data) +{ + unsigned long long c = 0; + + /* alarm triggered, it's now off */ + uml_rtc_alarm_enabled = false; + + os_read_file(uml_rtc_irq_fd, &c, sizeof(c)); + WARN_ON(c == 0); + + pm_system_wakeup(); + rtc_update_irq(uml_rtc, 1, RTC_IRQF | RTC_AF); + + return IRQ_HANDLED; +} + +static int uml_rtc_setup(void) +{ + int err; + + err = uml_rtc_start(time_travel_mode != TT_MODE_OFF); + if (WARN(err < 0, "err = %d\n", err)) + return err; + + uml_rtc_irq_fd = err; + + err = um_request_irq(UM_IRQ_ALLOC, uml_rtc_irq_fd, IRQ_READ, + uml_rtc_interrupt, 0, "rtc", NULL); + if (err < 0) { + uml_rtc_stop(time_travel_mode != TT_MODE_OFF); + return err; + } + + irq_set_irq_wake(err, 1); + + uml_rtc_irq = err; + return 0; +} + +static void uml_rtc_cleanup(void) +{ + um_free_irq(uml_rtc_irq, NULL); + uml_rtc_stop(time_travel_mode != TT_MODE_OFF); +} + +static int uml_rtc_probe(struct platform_device *pdev) +{ + int err; + + err = uml_rtc_setup(); + if (err) + return err; + + uml_rtc = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(uml_rtc)) { + err = PTR_ERR(uml_rtc); + goto cleanup; + } + + uml_rtc->ops = ¨_rtc_ops; + + device_init_wakeup(&pdev->dev, 1); + + err = devm_rtc_register_device(uml_rtc); + if (err) + goto cleanup; + + return 0; +cleanup: + uml_rtc_cleanup(); + return err; +} + +static void uml_rtc_remove(struct platform_device *pdev) +{ + device_init_wakeup(&pdev->dev, 0); + uml_rtc_cleanup(); +} + +static struct platform_driver uml_rtc_driver = { + .probe = uml_rtc_probe, + .remove = uml_rtc_remove, + .driver = { + .name = "uml-rtc", + }, +}; + +static int __init uml_rtc_init(void) +{ + struct platform_device *pdev; + int err; + + err = platform_driver_register(¨_rtc_driver); + if (err) + return err; + + pdev = platform_device_alloc("uml-rtc", 0); + if (!pdev) { + err = -ENOMEM; + goto unregister; + } + + err = platform_device_add(pdev); + if (err) + goto unregister; + return 0; + +unregister: + platform_device_put(pdev); + platform_driver_unregister(¨_rtc_driver); + return err; +} +device_initcall(uml_rtc_init); diff --git a/arch/um/drivers/rtc_user.c b/arch/um/drivers/rtc_user.c new file mode 100644 index 000000000000..67912fcf7b28 --- /dev/null +++ b/arch/um/drivers/rtc_user.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ +#include <stdbool.h> +#include <os.h> +#include <errno.h> +#include <sched.h> +#include <unistd.h> +#include <kern_util.h> +#include <sys/select.h> +#include <stdio.h> +#include <sys/timerfd.h> +#include "rtc.h" + +static int uml_rtc_irq_fds[2]; + +void uml_rtc_send_timetravel_alarm(void) +{ + unsigned long long c = 1; + + CATCH_EINTR(write(uml_rtc_irq_fds[1], &c, sizeof(c))); +} + +int uml_rtc_start(bool timetravel) +{ + int err; + + if (timetravel) { + err = os_pipe(uml_rtc_irq_fds, 1, 1); + if (err) + goto fail; + } else { + uml_rtc_irq_fds[0] = timerfd_create(CLOCK_REALTIME, TFD_CLOEXEC); + if (uml_rtc_irq_fds[0] < 0) { + err = -errno; + goto fail; + } + + /* apparently timerfd won't send SIGIO, use workaround */ + sigio_broken(); + err = add_sigio_fd(uml_rtc_irq_fds[0]); + if (err < 0) { + close(uml_rtc_irq_fds[0]); + goto fail; + } + } + + return uml_rtc_irq_fds[0]; +fail: + uml_rtc_stop(timetravel); + return err; +} + +int uml_rtc_enable_alarm(unsigned long long delta_seconds) +{ + struct itimerspec it = { + .it_value = { + .tv_sec = delta_seconds, + }, + }; + + if (timerfd_settime(uml_rtc_irq_fds[0], 0, &it, NULL)) + return -errno; + return 0; +} + +void uml_rtc_disable_alarm(void) +{ + uml_rtc_enable_alarm(0); +} + +void uml_rtc_stop(bool timetravel) +{ + if (timetravel) + os_close_file(uml_rtc_irq_fds[1]); + else + ignore_sigio_fd(uml_rtc_irq_fds[0]); + os_close_file(uml_rtc_irq_fds[0]); +} diff --git a/arch/um/drivers/slip.h b/arch/um/drivers/slip.h deleted file mode 100644 index c64f8c61d274..000000000000 --- a/arch/um/drivers/slip.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __UM_SLIP_H -#define __UM_SLIP_H - -#include "slip_common.h" - -struct slip_data { - void *dev; - char name[sizeof("slnnnnn\0")]; - char *addr; - char *gate_addr; - int slave; - struct slip_proto slip; -}; - -extern const struct net_user_info slip_user_info; - -extern int slip_user_read(int fd, void *buf, int len, struct slip_data *pri); -extern int slip_user_write(int fd, void *buf, int len, struct slip_data *pri); - -#endif diff --git a/arch/um/drivers/slip_common.c b/arch/um/drivers/slip_common.c deleted file mode 100644 index f597fa7c91d3..000000000000 --- a/arch/um/drivers/slip_common.c +++ /dev/null @@ -1,54 +0,0 @@ -#include <string.h> -#include "slip_common.h" -#include <net_user.h> - -int slip_proto_read(int fd, void *buf, int len, struct slip_proto *slip) -{ - int i, n, size, start; - - if(slip->more > 0){ - i = 0; - while(i < slip->more){ - size = slip_unesc(slip->ibuf[i++], slip->ibuf, - &slip->pos, &slip->esc); - if(size){ - memcpy(buf, slip->ibuf, size); - memmove(slip->ibuf, &slip->ibuf[i], - slip->more - i); - slip->more = slip->more - i; - return size; - } - } - slip->more = 0; - } - - n = net_read(fd, &slip->ibuf[slip->pos], - sizeof(slip->ibuf) - slip->pos); - if(n <= 0) - return n; - - start = slip->pos; - for(i = 0; i < n; i++){ - size = slip_unesc(slip->ibuf[start + i], slip->ibuf,&slip->pos, - &slip->esc); - if(size){ - memcpy(buf, slip->ibuf, size); - memmove(slip->ibuf, &slip->ibuf[start+i+1], - n - (i + 1)); - slip->more = n - (i + 1); - return size; - } - } - return 0; -} - -int slip_proto_write(int fd, void *buf, int len, struct slip_proto *slip) -{ - int actual, n; - - actual = slip_esc(buf, slip->obuf, len); - n = net_write(fd, slip->obuf, actual); - if(n < 0) - return n; - else return len; -} diff --git a/arch/um/drivers/slip_common.h b/arch/um/drivers/slip_common.h deleted file mode 100644 index d574e0a9dc13..000000000000 --- a/arch/um/drivers/slip_common.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef __UM_SLIP_COMMON_H -#define __UM_SLIP_COMMON_H - -#define BUF_SIZE 1500 - /* two bytes each for a (pathological) max packet of escaped chars + * - * terminating END char + initial END char */ -#define ENC_BUF_SIZE (2 * BUF_SIZE + 2) - -/* SLIP protocol characters. */ -#define SLIP_END 0300 /* indicates end of frame */ -#define SLIP_ESC 0333 /* indicates byte stuffing */ -#define SLIP_ESC_END 0334 /* ESC ESC_END means END 'data' */ -#define SLIP_ESC_ESC 0335 /* ESC ESC_ESC means ESC 'data' */ - -static inline int slip_unesc(unsigned char c, unsigned char *buf, int *pos, - int *esc) -{ - int ret; - - switch(c){ - case SLIP_END: - *esc = 0; - ret=*pos; - *pos=0; - return(ret); - case SLIP_ESC: - *esc = 1; - return(0); - case SLIP_ESC_ESC: - if(*esc){ - *esc = 0; - c = SLIP_ESC; - } - break; - case SLIP_ESC_END: - if(*esc){ - *esc = 0; - c = SLIP_END; - } - break; - } - buf[(*pos)++] = c; - return(0); -} - -static inline int slip_esc(unsigned char *s, unsigned char *d, int len) -{ - unsigned char *ptr = d; - unsigned char c; - - /* - * Send an initial END character to flush out any - * data that may have accumulated in the receiver - * due to line noise. - */ - - *ptr++ = SLIP_END; - - /* - * For each byte in the packet, send the appropriate - * character sequence, according to the SLIP protocol. - */ - - while (len-- > 0) { - switch(c = *s++) { - case SLIP_END: - *ptr++ = SLIP_ESC; - *ptr++ = SLIP_ESC_END; - break; - case SLIP_ESC: - *ptr++ = SLIP_ESC; - *ptr++ = SLIP_ESC_ESC; - break; - default: - *ptr++ = c; - break; - } - } - *ptr++ = SLIP_END; - return (ptr - d); -} - -struct slip_proto { - unsigned char ibuf[ENC_BUF_SIZE]; - unsigned char obuf[ENC_BUF_SIZE]; - int more; /* more data: do not read fd until ibuf has been drained */ - int pos; - int esc; -}; - -static inline void slip_proto_init(struct slip_proto * slip) -{ - memset(slip->ibuf, 0, sizeof(slip->ibuf)); - memset(slip->obuf, 0, sizeof(slip->obuf)); - slip->more = 0; - slip->pos = 0; - slip->esc = 0; -} - -extern int slip_proto_read(int fd, void *buf, int len, - struct slip_proto *slip); -extern int slip_proto_write(int fd, void *buf, int len, - struct slip_proto *slip); - -#endif diff --git a/arch/um/drivers/slip_kern.c b/arch/um/drivers/slip_kern.c deleted file mode 100644 index ed5249fc0574..000000000000 --- a/arch/um/drivers/slip_kern.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL. - */ - -#include <linux/if_arp.h> -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include "slip.h" - -struct slip_init { - char *gate_addr; -}; - -static void slip_init(struct net_device *dev, void *data) -{ - struct uml_net_private *private; - struct slip_data *spri; - struct slip_init *init = data; - - private = netdev_priv(dev); - spri = (struct slip_data *) private->user; - - memset(spri->name, 0, sizeof(spri->name)); - spri->addr = NULL; - spri->gate_addr = init->gate_addr; - spri->slave = -1; - spri->dev = dev; - - slip_proto_init(&spri->slip); - - dev->hard_header_len = 0; - dev->header_ops = NULL; - dev->addr_len = 0; - dev->type = ARPHRD_SLIP; - dev->tx_queue_len = 256; - dev->flags = IFF_NOARP; - printk("SLIP backend - SLIP IP = %s\n", spri->gate_addr); -} - -static unsigned short slip_protocol(struct sk_buff *skbuff) -{ - return htons(ETH_P_IP); -} - -static int slip_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slip_user_read(fd, skb_mac_header(skb), skb->dev->mtu, - (struct slip_data *) &lp->user); -} - -static int slip_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slip_user_write(fd, skb->data, skb->len, - (struct slip_data *) &lp->user); -} - -static const struct net_kern_info slip_kern_info = { - .init = slip_init, - .protocol = slip_protocol, - .read = slip_read, - .write = slip_write, -}; - -static int slip_setup(char *str, char **mac_out, void *data) -{ - struct slip_init *init = data; - - *init = ((struct slip_init) { .gate_addr = NULL }); - - if (str[0] != '\0') - init->gate_addr = str; - return 1; -} - -static struct transport slip_transport = { - .list = LIST_HEAD_INIT(slip_transport.list), - .name = "slip", - .setup = slip_setup, - .user = &slip_user_info, - .kern = &slip_kern_info, - .private_size = sizeof(struct slip_data), - .setup_size = sizeof(struct slip_init), -}; - -static int register_slip(void) -{ - register_transport(&slip_transport); - return 0; -} - -late_initcall(register_slip); diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c deleted file mode 100644 index 55c290d925f3..000000000000 --- a/arch/um/drivers/slip_user.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <string.h> -#include <sys/termios.h> -#include <sys/wait.h> -#include <net_user.h> -#include <os.h> -#include "slip.h" -#include <um_malloc.h> - -static int slip_user_init(void *data, void *dev) -{ - struct slip_data *pri = data; - - pri->dev = dev; - return 0; -} - -static int set_up_tty(int fd) -{ - int i; - struct termios tios; - - if (tcgetattr(fd, &tios) < 0) { - printk(UM_KERN_ERR "could not get initial terminal " - "attributes\n"); - return -1; - } - - tios.c_cflag = CS8 | CREAD | HUPCL | CLOCAL; - tios.c_iflag = IGNBRK | IGNPAR; - tios.c_oflag = 0; - tios.c_lflag = 0; - for (i = 0; i < NCCS; i++) - tios.c_cc[i] = 0; - tios.c_cc[VMIN] = 1; - tios.c_cc[VTIME] = 0; - - cfsetospeed(&tios, B38400); - cfsetispeed(&tios, B38400); - - if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) { - printk(UM_KERN_ERR "failed to set terminal attributes\n"); - return -1; - } - return 0; -} - -struct slip_pre_exec_data { - int stdin; - int stdout; - int close_me; -}; - -static void slip_pre_exec(void *arg) -{ - struct slip_pre_exec_data *data = arg; - - if (data->stdin >= 0) - dup2(data->stdin, 0); - dup2(data->stdout, 1); - if (data->close_me >= 0) - close(data->close_me); -} - -static int slip_tramp(char **argv, int fd) -{ - struct slip_pre_exec_data pe_data; - char *output; - int pid, fds[2], err, output_len; - - err = os_pipe(fds, 1, 0); - if (err < 0) { - printk(UM_KERN_ERR "slip_tramp : pipe failed, err = %d\n", - -err); - goto out; - } - - err = 0; - pe_data.stdin = fd; - pe_data.stdout = fds[1]; - pe_data.close_me = fds[0]; - err = run_helper(slip_pre_exec, &pe_data, argv); - if (err < 0) - goto out_close; - pid = err; - - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - if (output == NULL) { - printk(UM_KERN_ERR "slip_tramp : failed to allocate output " - "buffer\n"); - os_kill_process(pid, 1); - err = -ENOMEM; - goto out_close; - } - - close(fds[1]); - read_output(fds[0], output, output_len); - printk("%s", output); - - err = helper_wait(pid); - close(fds[0]); - - kfree(output); - return err; - -out_close: - close(fds[0]); - close(fds[1]); -out: - return err; -} - -static int slip_open(void *data) -{ - struct slip_data *pri = data; - char version_buf[sizeof("nnnnn\0")]; - char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; - char *argv[] = { "uml_net", version_buf, "slip", "up", gate_buf, - NULL }; - int sfd, mfd, err; - - err = get_pty(); - if (err < 0) { - printk(UM_KERN_ERR "slip-open : Failed to open pty, err = %d\n", - -err); - goto out; - } - mfd = err; - - err = open(ptsname(mfd), O_RDWR, 0); - if (err < 0) { - printk(UM_KERN_ERR "Couldn't open tty for slip line, " - "err = %d\n", -err); - goto out_close; - } - sfd = err; - - if (set_up_tty(sfd)) - goto out_close2; - - pri->slave = sfd; - pri->slip.pos = 0; - pri->slip.esc = 0; - if (pri->gate_addr != NULL) { - sprintf(version_buf, "%d", UML_NET_VERSION); - strcpy(gate_buf, pri->gate_addr); - - err = slip_tramp(argv, sfd); - - if (err < 0) { - printk(UM_KERN_ERR "slip_tramp failed - err = %d\n", - -err); - goto out_close2; - } - err = os_get_ifname(pri->slave, pri->name); - if (err < 0) { - printk(UM_KERN_ERR "get_ifname failed, err = %d\n", - -err); - goto out_close2; - } - iter_addresses(pri->dev, open_addr, pri->name); - } - else { - err = os_set_slip(sfd); - if (err < 0) { - printk(UM_KERN_ERR "Failed to set slip discipline " - "encapsulation - err = %d\n", -err); - goto out_close2; - } - } - return mfd; -out_close2: - close(sfd); -out_close: - close(mfd); -out: - return err; -} - -static void slip_close(int fd, void *data) -{ - struct slip_data *pri = data; - char version_buf[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version_buf, "slip", "down", pri->name, - NULL }; - int err; - - if (pri->gate_addr != NULL) - iter_addresses(pri->dev, close_addr, pri->name); - - sprintf(version_buf, "%d", UML_NET_VERSION); - - err = slip_tramp(argv, pri->slave); - - if (err != 0) - printk(UM_KERN_ERR "slip_tramp failed - errno = %d\n", -err); - close(fd); - close(pri->slave); - pri->slave = -1; -} - -int slip_user_read(int fd, void *buf, int len, struct slip_data *pri) -{ - return slip_proto_read(fd, buf, len, &pri->slip); -} - -int slip_user_write(int fd, void *buf, int len, struct slip_data *pri) -{ - return slip_proto_write(fd, buf, len, &pri->slip); -} - -static void slip_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct slip_data *pri = data; - - if (pri->slave < 0) - return; - open_addr(addr, netmask, pri->name); -} - -static void slip_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct slip_data *pri = data; - - if (pri->slave < 0) - return; - close_addr(addr, netmask, pri->name); -} - -const struct net_user_info slip_user_info = { - .init = slip_user_init, - .open = slip_open, - .close = slip_close, - .remove = NULL, - .add_address = slip_add_addr, - .delete_address = slip_del_addr, - .mtu = BUF_SIZE, - .max_packet = BUF_SIZE, -}; diff --git a/arch/um/drivers/slirp.h b/arch/um/drivers/slirp.h deleted file mode 100644 index 89ccf83b7577..000000000000 --- a/arch/um/drivers/slirp.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __UM_SLIRP_H -#define __UM_SLIRP_H - -#include "slip_common.h" - -#define SLIRP_MAX_ARGS 100 -/* - * XXX this next definition is here because I don't understand why this - * initializer doesn't work in slirp_kern.c: - * - * argv : { init->argv[ 0 ... SLIRP_MAX_ARGS-1 ] }, - * - * or why I can't typecast like this: - * - * argv : (char* [SLIRP_MAX_ARGS])(init->argv), - */ -struct arg_list_dummy_wrapper { char *argv[SLIRP_MAX_ARGS]; }; - -struct slirp_data { - void *dev; - struct arg_list_dummy_wrapper argw; - int pid; - int slave; - struct slip_proto slip; -}; - -extern const struct net_user_info slirp_user_info; - -extern int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri); -extern int slirp_user_write(int fd, void *buf, int len, - struct slirp_data *pri); - -#endif diff --git a/arch/um/drivers/slirp_kern.c b/arch/um/drivers/slirp_kern.c deleted file mode 100644 index 4ef11ca7cacf..000000000000 --- a/arch/um/drivers/slirp_kern.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL. - */ - -#include <linux/if_arp.h> -#include <linux/init.h> -#include <linux/netdevice.h> -#include <linux/string.h> -#include <net_kern.h> -#include <net_user.h> -#include "slirp.h" - -struct slirp_init { - struct arg_list_dummy_wrapper argw; /* XXX should be simpler... */ -}; - -void slirp_init(struct net_device *dev, void *data) -{ - struct uml_net_private *private; - struct slirp_data *spri; - struct slirp_init *init = data; - int i; - - private = netdev_priv(dev); - spri = (struct slirp_data *) private->user; - - spri->argw = init->argw; - spri->pid = -1; - spri->slave = -1; - spri->dev = dev; - - slip_proto_init(&spri->slip); - - dev->hard_header_len = 0; - dev->header_ops = NULL; - dev->addr_len = 0; - dev->type = ARPHRD_SLIP; - dev->tx_queue_len = 256; - dev->flags = IFF_NOARP; - printk("SLIRP backend - command line:"); - for (i = 0; spri->argw.argv[i] != NULL; i++) - printk(" '%s'",spri->argw.argv[i]); - printk("\n"); -} - -static unsigned short slirp_protocol(struct sk_buff *skbuff) -{ - return htons(ETH_P_IP); -} - -static int slirp_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slirp_user_read(fd, skb_mac_header(skb), skb->dev->mtu, - (struct slirp_data *) &lp->user); -} - -static int slirp_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slirp_user_write(fd, skb->data, skb->len, - (struct slirp_data *) &lp->user); -} - -const struct net_kern_info slirp_kern_info = { - .init = slirp_init, - .protocol = slirp_protocol, - .read = slirp_read, - .write = slirp_write, -}; - -static int slirp_setup(char *str, char **mac_out, void *data) -{ - struct slirp_init *init = data; - int i=0; - - *init = ((struct slirp_init) { .argw = { { "slirp", NULL } } }); - - str = split_if_spec(str, mac_out, NULL); - - if (str == NULL) /* no command line given after MAC addr */ - return 1; - - do { - if (i >= SLIRP_MAX_ARGS - 1) { - printk(KERN_WARNING "slirp_setup: truncating slirp " - "arguments\n"); - break; - } - init->argw.argv[i++] = str; - while(*str && *str!=',') { - if (*str == '_') - *str=' '; - str++; - } - if (*str != ',') - break; - *str++ = '\0'; - } while (1); - - init->argw.argv[i] = NULL; - return 1; -} - -static struct transport slirp_transport = { - .list = LIST_HEAD_INIT(slirp_transport.list), - .name = "slirp", - .setup = slirp_setup, - .user = &slirp_user_info, - .kern = &slirp_kern_info, - .private_size = sizeof(struct slirp_data), - .setup_size = sizeof(struct slirp_init), -}; - -static int register_slirp(void) -{ - register_transport(&slirp_transport); - return 0; -} - -late_initcall(register_slirp); diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c deleted file mode 100644 index c999d187abb9..000000000000 --- a/arch/um/drivers/slirp_user.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL. - */ - -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <sys/wait.h> -#include <net_user.h> -#include <os.h> -#include "slirp.h" - -static int slirp_user_init(void *data, void *dev) -{ - struct slirp_data *pri = data; - - pri->dev = dev; - return 0; -} - -struct slirp_pre_exec_data { - int stdin; - int stdout; -}; - -static void slirp_pre_exec(void *arg) -{ - struct slirp_pre_exec_data *data = arg; - - if (data->stdin != -1) - dup2(data->stdin, 0); - if (data->stdout != -1) - dup2(data->stdout, 1); -} - -static int slirp_tramp(char **argv, int fd) -{ - struct slirp_pre_exec_data pe_data; - int pid; - - pe_data.stdin = fd; - pe_data.stdout = fd; - pid = run_helper(slirp_pre_exec, &pe_data, argv); - - return pid; -} - -static int slirp_open(void *data) -{ - struct slirp_data *pri = data; - int fds[2], pid, err; - - err = os_pipe(fds, 1, 1); - if (err) - return err; - - err = slirp_tramp(pri->argw.argv, fds[1]); - if (err < 0) { - printk(UM_KERN_ERR "slirp_tramp failed - errno = %d\n", -err); - goto out; - } - pid = err; - - pri->slave = fds[1]; - pri->slip.pos = 0; - pri->slip.esc = 0; - pri->pid = err; - - return fds[0]; -out: - close(fds[0]); - close(fds[1]); - return err; -} - -static void slirp_close(int fd, void *data) -{ - struct slirp_data *pri = data; - int err; - - close(fd); - close(pri->slave); - - pri->slave = -1; - - if (pri->pid<1) { - printk(UM_KERN_ERR "slirp_close: no child process to shut " - "down\n"); - return; - } - -#if 0 - if (kill(pri->pid, SIGHUP)<0) { - printk(UM_KERN_ERR "slirp_close: sending hangup to %d failed " - "(%d)\n", pri->pid, errno); - } -#endif - err = helper_wait(pri->pid); - if (err < 0) - return; - - pri->pid = -1; -} - -int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri) -{ - return slip_proto_read(fd, buf, len, &pri->slip); -} - -int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri) -{ - return slip_proto_write(fd, buf, len, &pri->slip); -} - -const struct net_user_info slirp_user_info = { - .init = slirp_user_init, - .open = slirp_open, - .close = slirp_close, - .remove = NULL, - .add_address = NULL, - .delete_address = NULL, - .mtu = BUF_SIZE, - .max_packet = BUF_SIZE, -}; diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c index b8d14fa52059..8006a5bd578c 100644 --- a/arch/um/drivers/ssl.c +++ b/arch/um/drivers/ssl.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #include <linux/fs.h> @@ -12,7 +12,6 @@ #include <linux/console.h> #include <asm/termbits.h> #include <asm/irq.h> -#include "ssl.h" #include "chan.h" #include <init.h> #include <irq_user.h> @@ -48,9 +47,7 @@ static struct line_driver driver = { .minor_start = 64, .type = TTY_DRIVER_TYPE_SERIAL, .subtype = 0, - .read_irq = SSL_IRQ, .read_irq_name = "ssl", - .write_irq = SSL_WRITE_IRQ, .write_irq_name = "ssl-write", .mc = { .list = LIST_HEAD_INIT(driver.mc.list), @@ -96,12 +93,10 @@ static const struct tty_operations ssl_ops = { .open = line_open, .close = line_close, .write = line_write, - .put_char = line_put_char, .write_room = line_write_room, .chars_in_buffer = line_chars_in_buffer, .flush_buffer = line_flush_buffer, .flush_chars = line_flush_chars, - .set_termios = line_set_termios, .throttle = line_throttle, .unthrottle = line_unthrottle, .install = ssl_install, @@ -111,7 +106,7 @@ static const struct tty_operations ssl_ops = { /* Changed by ssl_init and referenced by ssl_exit, which are both serialized * by being an initcall and exitcall, respectively. */ -static int ssl_init_done = 0; +static int ssl_init_done; static void ssl_console_write(struct console *c, const char *string, unsigned len) @@ -197,3 +192,14 @@ static int ssl_chan_setup(char *str) __setup("ssl", ssl_chan_setup); __channel_help(ssl_chan_setup, "ssl"); + +static int ssl_non_raw_setup(char *str) +{ + opts.raw = 0; + return 1; +} +__setup("ssl-non-raw", ssl_non_raw_setup); +__uml_help(ssl_non_raw_setup, +"ssl-non-raw\n" +" Set serial lines to non-raw mode.\n\n" +); diff --git a/arch/um/drivers/ssl.h b/arch/um/drivers/ssl.h deleted file mode 100644 index 314d17725ce6..000000000000 --- a/arch/um/drivers/ssl.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SSL_H__ -#define __SSL_H__ - -extern int ssl_read(int fd, int line); -extern void ssl_receive_char(int line, char ch); - -#endif - diff --git a/arch/um/drivers/stderr_console.c b/arch/um/drivers/stderr_console.c index d07a97f8b994..ecc3a5814932 100644 --- a/arch/um/drivers/stderr_console.c +++ b/arch/um/drivers/stderr_console.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/console.h> diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c index 7b361f36ca96..1c239737d88e 100644 --- a/arch/um/drivers/stdio_console.c +++ b/arch/um/drivers/stdio_console.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #include <linux/posix_types.h> @@ -53,9 +53,7 @@ static struct line_driver driver = { .minor_start = 0, .type = TTY_DRIVER_TYPE_CONSOLE, .subtype = SYSTEM_TYPE_CONSOLE, - .read_irq = CONSOLE_IRQ, .read_irq_name = "console", - .write_irq = CONSOLE_WRITE_IRQ, .write_irq_name = "console-write", .mc = { .list = LIST_HEAD_INIT(driver.mc.list), @@ -90,7 +88,7 @@ static int con_remove(int n, char **error_out) } /* Set in an initcall, checked in an exitcall */ -static int con_init_done = 0; +static int con_init_done; static int con_install(struct tty_driver *driver, struct tty_struct *tty) { @@ -102,12 +100,10 @@ static const struct tty_operations console_ops = { .install = con_install, .close = line_close, .write = line_write, - .put_char = line_put_char, .write_room = line_write_room, .chars_in_buffer = line_chars_in_buffer, .flush_buffer = line_flush_buffer, .flush_chars = line_flush_chars, - .set_termios = line_set_termios, .throttle = line_throttle, .unthrottle = line_unthrottle, .hangup = line_hangup, @@ -192,6 +188,9 @@ __uml_exitcall(console_exit); static int console_chan_setup(char *str) { + if (!strncmp(str, "sole=", 5)) /* console= option specifies tty */ + return 0; + line_setup(vt_conf, MAX_TTYS, &def_conf, str, "console"); return 1; } diff --git a/arch/um/drivers/stdio_console.h b/arch/um/drivers/stdio_console.h index 6d8275f71fd4..3a409ec23d63 100644 --- a/arch/um/drivers/stdio_console.h +++ b/arch/um/drivers/stdio_console.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __STDIO_CONSOLE_H diff --git a/arch/um/drivers/tty.c b/arch/um/drivers/tty.c index eaa201bca5ed..884a762d21c7 100644 --- a/arch/um/drivers/tty.c +++ b/arch/um/drivers/tty.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <errno.h> diff --git a/arch/um/drivers/ubd.h b/arch/um/drivers/ubd.h index 3845051f1b10..2985c14661f4 100644 --- a/arch/um/drivers/ubd.h +++ b/arch/um/drivers/ubd.h @@ -1,16 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) * Copyright (C) 2001 RidgeRun, Inc (glonnon@ridgerun.com) - * Licensed under the GPL */ #ifndef __UM_UBD_USER_H #define __UM_UBD_USER_H -extern void ignore_sigwinch_sig(void); -extern int start_io_thread(unsigned long sp, int *fds_out); -extern int io_thread(void *arg); +#include <os.h> + +int start_io_thread(struct os_helper_thread **td_out, int *fd_out); +void *io_thread(void *arg); extern int kernel_fd; +extern int ubd_read_poll(int timeout); +extern int ubd_write_poll(int timeout); + +#define UBD_REQ_BUFFER_SIZE 64 + #endif diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 879990cb66c6..37455e74d314 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1,6 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2018 Cambridge Greys Ltd + * Copyright (C) 2015-2016 Anton Ivanov (aivanov@brocade.com) * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ /* 2001-09-28...2002-04-17 @@ -22,8 +24,10 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/ata.h> #include <linux/hdreg.h> +#include <linux/major.h> #include <linux/cdrom.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -32,7 +36,6 @@ #include <linux/vmalloc.h> #include <linux/platform_device.h> #include <linux/scatterlist.h> -#include <asm/tlbflush.h> #include <kern_util.h> #include "mconsole_kern.h" #include <init.h> @@ -41,23 +44,41 @@ #include <os.h> #include "cow.h" -enum ubd_req { UBD_READ, UBD_WRITE }; +/* Max request size is determined by sector mask - 32K */ +#define UBD_MAX_REQUEST (8 * sizeof(long)) + +struct io_desc { + char *buffer; + unsigned long length; + unsigned long sector_mask; + unsigned long long cow_offset; + unsigned long bitmap_words[2]; +}; struct io_thread_req { struct request *req; - enum ubd_req op; int fds[2]; unsigned long offsets[2]; unsigned long long offset; - unsigned long length; - char *buffer; int sectorsize; - unsigned long sector_mask; - unsigned long long cow_offset; - unsigned long bitmap_words[2]; int error; + + int desc_cnt; + /* io_desc has to be the last element of the struct */ + struct io_desc io_desc[]; }; + +static struct io_thread_req * (*irq_req_buffer)[]; +static struct io_thread_req *irq_remainder; +static int irq_remainder_size; + +static struct io_thread_req * (*io_req_buffer)[]; +static struct io_thread_req *io_remainder; +static int io_remainder_size; + + + static inline int ubd_test_bit(__u64 bit, unsigned char *data) { __u64 n; @@ -84,29 +105,20 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) #define DRIVER_NAME "uml-blkdev" static DEFINE_MUTEX(ubd_lock); -static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ -static int ubd_open(struct block_device *bdev, fmode_t mode); -static void ubd_release(struct gendisk *disk, fmode_t mode); -static int ubd_ioctl(struct block_device *bdev, fmode_t mode, +static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo); #define MAX_DEV (16) static const struct block_device_operations ubd_blops = { .owner = THIS_MODULE, - .open = ubd_open, - .release = ubd_release, .ioctl = ubd_ioctl, + .compat_ioctl = blkdev_compat_ptr_ioctl, .getgeo = ubd_getgeo, }; -/* Protected by ubd_lock */ -static int fake_major = UBD_MAJOR; -static struct gendisk *ubd_gendisk[MAX_DEV]; -static struct gendisk *fake_gendisk[MAX_DEV]; - #ifdef CONFIG_BLK_DEV_UBD_SYNC #define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ .cl = 1 }) @@ -130,25 +142,22 @@ struct cow { #define MAX_SG 64 struct ubd { - struct list_head restart; /* name (and fd, below) of the file opened for writing, either the * backing or the cow file. */ char *file; - int count; + char *serial; int fd; __u64 size; struct openflags boot_openflags; struct openflags openflags; unsigned shared:1; unsigned no_cow:1; + unsigned no_trim:1; struct cow cow; struct platform_device pdev; - struct request_queue *queue; + struct gendisk *disk; + struct blk_mq_tag_set tag_set; spinlock_t lock; - struct scatterlist sg[MAX_SG]; - struct request *request; - int start_sg, end_sg; - sector_t rq_pos; }; #define DEFAULT_COW { \ @@ -161,81 +170,34 @@ struct ubd { #define DEFAULT_UBD { \ .file = NULL, \ - .count = 0, \ + .serial = NULL, \ .fd = -1, \ .size = -1, \ .boot_openflags = OPEN_FLAGS, \ .openflags = OPEN_FLAGS, \ .no_cow = 0, \ + .no_trim = 0, \ .shared = 0, \ .cow = DEFAULT_COW, \ .lock = __SPIN_LOCK_UNLOCKED(ubd_devs.lock), \ - .request = NULL, \ - .start_sg = 0, \ - .end_sg = 0, \ - .rq_pos = 0, \ } /* Protected by ubd_lock */ static struct ubd ubd_devs[MAX_DEV] = { [0 ... MAX_DEV - 1] = DEFAULT_UBD }; -/* Only changed by fake_ide_setup which is a setup */ -static int fake_ide = 0; -static struct proc_dir_entry *proc_ide_root = NULL; -static struct proc_dir_entry *proc_ide = NULL; - -static void make_proc_ide(void) -{ - proc_ide_root = proc_mkdir("ide", NULL); - proc_ide = proc_mkdir("ide0", proc_ide_root); -} - -static int fake_ide_media_proc_show(struct seq_file *m, void *v) -{ - seq_puts(m, "disk\n"); - return 0; -} - -static int fake_ide_media_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, fake_ide_media_proc_show, NULL); -} - -static const struct file_operations fake_ide_media_proc_fops = { - .owner = THIS_MODULE, - .open = fake_ide_media_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void make_ide_entries(const char *dev_name) -{ - struct proc_dir_entry *dir, *ent; - char name[64]; - - if(proc_ide_root == NULL) make_proc_ide(); - - dir = proc_mkdir(dev_name, proc_ide); - if(!dir) return; - - ent = proc_create("media", S_IRUGO, dir, &fake_ide_media_proc_fops); - if(!ent) return; - snprintf(name, sizeof(name), "ide0/%s", dev_name); - proc_symlink(dev_name, proc_ide_root, name); -} +static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd); static int fake_ide_setup(char *str) { - fake_ide = 1; + pr_warn("The fake_ide option has been removed\n"); return 1; } - __setup("fake_ide", fake_ide_setup); __uml_help(fake_ide_setup, "fake_ide\n" -" Create ide0 entries that map onto ubd devices.\n\n" +" Obsolete stub.\n\n" ); static int parse_unit(char **ptr) @@ -265,42 +227,20 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) { struct ubd *ubd_dev; struct openflags flags = global_openflags; - char *backing_file; + char *file, *backing_file, *serial; int n, err = 0, i; if(index_out) *index_out = -1; n = *str; if(n == '='){ - char *end; - int major; - str++; if(!strcmp(str, "sync")){ global_openflags = of_sync(global_openflags); - goto out1; - } - - err = -EINVAL; - major = simple_strtoul(str, &end, 0); - if((*end != '\0') || (end == str)){ - *error_out = "Didn't parse major number"; - goto out1; + return err; } - mutex_lock(&ubd_lock); - if (fake_major != UBD_MAJOR) { - *error_out = "Can't assign a fake major twice"; - goto out1; - } - - fake_major = major; - - printk(KERN_INFO "Setting extra ubd major number to %d\n", - major); - err = 0; - out1: - mutex_unlock(&ubd_lock); - return err; + pr_warn("fake major not supported any more\n"); + return 0; } n = parse_unit(&str); @@ -326,7 +266,7 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) *index_out = n; err = -EINVAL; - for (i = 0; i < sizeof("rscd="); i++) { + for (i = 0; i < sizeof("rscdt="); i++) { switch (*str) { case 'r': flags.w = 0; @@ -340,12 +280,15 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) case 'c': ubd_dev->shared = 1; break; + case 't': + ubd_dev->no_trim = 1; + break; case '=': str++; goto break_loop; default: *error_out = "Expected '=' or flag letter " - "(r, s, c, or d)"; + "(r, s, c, t or d)"; goto out; } str++; @@ -358,24 +301,27 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) goto out; break_loop: - backing_file = strchr(str, ','); + file = strsep(&str, ",:"); + if (*file == '\0') + file = NULL; - if (backing_file == NULL) - backing_file = strchr(str, ':'); + backing_file = strsep(&str, ",:"); + if (backing_file && *backing_file == '\0') + backing_file = NULL; - if(backing_file != NULL){ - if(ubd_dev->no_cow){ - *error_out = "Can't specify both 'd' and a cow file"; - goto out; - } - else { - *backing_file = '\0'; - backing_file++; - } + serial = strsep(&str, ",:"); + if (serial && *serial == '\0') + serial = NULL; + + if (backing_file && ubd_dev->no_cow) { + *error_out = "Can't specify both 'd' and a cow file"; + goto out; } + err = 0; - ubd_dev->file = str; + ubd_dev->file = file; ubd_dev->cow.file = backing_file; + ubd_dev->serial = serial; ubd_dev->boot_openflags = flags; out: mutex_unlock(&ubd_lock); @@ -396,7 +342,7 @@ static int ubd_setup(char *str) __setup("ubd", ubd_setup); __uml_help(ubd_setup, -"ubd<n><flags>=<filename>[(:|,)<filename2>]\n" +"ubd<n><flags>=<filename>[(:|,)<filename2>][(:|,)<serial>]\n" " This is used to associate a device with a file in the underlying\n" " filesystem. When specifying two filenames, the first one is the\n" " COW name and the second is the backing file name. As separator you can\n" @@ -418,6 +364,13 @@ __uml_help(ubd_setup, " 'c' will cause the device to be treated as being shared between multiple\n" " UMLs and file locking will be turned off - this is appropriate for a\n" " cluster filesystem and inappropriate at almost all other times.\n\n" +" 't' will disable trim/discard support on the device (enabled by default).\n\n" +" An optional device serial number can be exposed using the serial parameter\n" +" on the cmdline which is exposed as a sysfs entry. This is particularly\n" +" useful when a unique number should be given to the device. Note when\n" +" specifying a label, the filename2 must be also presented. It can be\n" +" an empty string, in which case the backing file is not used:\n" +" ubd0=File,,Serial\n\n" ); static int udb_setup(char *str) @@ -436,60 +389,97 @@ __uml_help(udb_setup, " in the boot output.\n\n" ); -static void do_ubd_request(struct request_queue * q); - /* Only changed by ubd_init, which is an initcall. */ static int thread_fd = -1; -static LIST_HEAD(restart); -/* XXX - move this inside ubd_intr. */ -/* Called without dev->lock held, and only in interrupt context. */ -static void ubd_handler(void) +/* Function to read several request pointers at a time +* handling fractional reads if (and as) needed +*/ + +static int bulk_req_safe_read( + int fd, + struct io_thread_req * (*request_buffer)[], + struct io_thread_req **remainder, + int *remainder_size, + int max_recs + ) { - struct io_thread_req *req; - struct ubd *ubd; - struct list_head *list, *next_ele; - unsigned long flags; - int n; + int n = 0; + int res = 0; + + if (*remainder_size > 0) { + memmove( + (char *) request_buffer, + (char *) remainder, *remainder_size + ); + n = *remainder_size; + } - while(1){ - n = os_read_file(thread_fd, &req, - sizeof(struct io_thread_req *)); - if(n != sizeof(req)){ - if(n == -EAGAIN) - break; - printk(KERN_ERR "spurious interrupt in ubd_handler, " - "err = %d\n", -n); - return; + res = os_read_file( + fd, + ((char *) request_buffer) + *remainder_size, + sizeof(struct io_thread_req *)*max_recs + - *remainder_size + ); + if (res > 0) { + n += res; + if ((n % sizeof(struct io_thread_req *)) > 0) { + /* + * Read somehow returned not a multiple of dword + * theoretically possible, but never observed in the + * wild, so read routine must be able to handle it + */ + *remainder_size = n % sizeof(struct io_thread_req *); + WARN(*remainder_size > 0, "UBD IPC read returned a partial result"); + memmove( + remainder, + ((char *) request_buffer) + + (n/sizeof(struct io_thread_req *))*sizeof(struct io_thread_req *), + *remainder_size + ); + n = n - *remainder_size; } - - blk_end_request(req->req, 0, req->length); - kfree(req); + } else { + n = res; } - reactivate_fd(thread_fd, UBD_IRQ); - - list_for_each_safe(list, next_ele, &restart){ - ubd = container_of(list, struct ubd, restart); - list_del_init(&ubd->restart); - spin_lock_irqsave(&ubd->lock, flags); - do_ubd_request(ubd->queue); - spin_unlock_irqrestore(&ubd->lock, flags); + return n; +} + +static void ubd_end_request(struct io_thread_req *io_req) +{ + if (io_req->error == BLK_STS_NOTSUPP) { + if (req_op(io_req->req) == REQ_OP_DISCARD) + blk_queue_disable_discard(io_req->req->q); + else if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES) + blk_queue_disable_write_zeroes(io_req->req->q); } + blk_mq_end_request(io_req->req, io_req->error); + kfree(io_req); } static irqreturn_t ubd_intr(int irq, void *dev) { - ubd_handler(); + int len, i; + + while ((len = bulk_req_safe_read(thread_fd, irq_req_buffer, + &irq_remainder, &irq_remainder_size, + UBD_REQ_BUFFER_SIZE)) >= 0) { + for (i = 0; i < len / sizeof(struct io_thread_req *); i++) + ubd_end_request((*irq_req_buffer)[i]); + } + + if (len < 0 && len != -EAGAIN) + pr_err("spurious interrupt in %s, err = %d\n", __func__, len); return IRQ_HANDLED; } /* Only changed by ubd_init, which is an initcall. */ -static int io_pid = -1; +static struct os_helper_thread *io_td; static void kill_io_thread(void) { - if(io_pid != -1) - os_kill_process(io_pid, 1); + if (io_td) + os_kill_helper_thread(io_td); } __uml_exitcall(kill_io_thread); @@ -503,7 +493,7 @@ static inline int ubd_file_size(struct ubd *ubd_dev, __u64 *size_out) __u32 version; __u32 align; char *backing_file; - time_t mtime; + time64_t mtime; unsigned long long size; int sector_size; int bitmap_offset; @@ -535,20 +525,16 @@ static int read_cow_bitmap(int fd, void *buf, int offset, int len) { int err; - err = os_seek_file(fd, offset); - if (err < 0) - return err; - - err = os_read_file(fd, buf, len); + err = os_pread_file(fd, buf, len, offset); if (err < 0) return err; return 0; } -static int backing_file_mismatch(char *file, __u64 size, time_t mtime) +static int backing_file_mismatch(char *file, __u64 size, time64_t mtime) { - unsigned long modtime; + time64_t modtime; unsigned long long actual; int err; @@ -574,7 +560,7 @@ static int backing_file_mismatch(char *file, __u64 size, time_t mtime) return -EINVAL; } if (modtime != mtime) { - printk(KERN_ERR "mtime mismatch (%ld vs %ld) of COW header vs " + printk(KERN_ERR "mtime mismatch (%lld vs %lld) of COW header vs " "backing file\n", mtime, modtime); return -EINVAL; } @@ -617,7 +603,7 @@ static int open_ubd_file(char *file, struct openflags *openflags, int shared, unsigned long *bitmap_len_out, int *data_offset_out, int *create_cow_out) { - time_t mtime; + time64_t mtime; unsigned long long size; __u32 version, align; char *backing_file; @@ -747,7 +733,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) if((fd == -ENOENT) && create_cow){ fd = create_cow_file(ubd_dev->file, ubd_dev->cow.file, - ubd_dev->openflags, 1 << 9, PAGE_SIZE, + ubd_dev->openflags, SECTOR_SIZE, PAGE_SIZE, &ubd_dev->cow.bitmap_offset, &ubd_dev->cow.bitmap_len, &ubd_dev->cow.data_offset); @@ -765,15 +751,12 @@ static int ubd_open_dev(struct ubd *ubd_dev) ubd_dev->fd = fd; if(ubd_dev->cow.file != NULL){ - blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long)); - err = -ENOMEM; ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len); if(ubd_dev->cow.bitmap == NULL){ printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); goto error; } - flush_tlb_kernel_vm(); err = read_cow_bitmap(ubd_dev->fd, ubd_dev->cow.bitmap, ubd_dev->cow.bitmap_offset, @@ -796,102 +779,136 @@ static int ubd_open_dev(struct ubd *ubd_dev) static void ubd_device_release(struct device *dev) { - struct ubd *ubd_dev = dev_get_drvdata(dev); + struct ubd *ubd_dev = container_of(dev, struct ubd, pdev.dev); - blk_cleanup_queue(ubd_dev->queue); + blk_mq_free_tag_set(&ubd_dev->tag_set); *ubd_dev = ((struct ubd) DEFAULT_UBD); } -static int ubd_disk_register(int major, u64 size, int unit, - struct gendisk **disk_out) +static ssize_t serial_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct gendisk *disk; + struct gendisk *disk = dev_to_disk(dev); + struct ubd *ubd_dev = disk->private_data; - disk = alloc_disk(1 << UBD_SHIFT); - if(disk == NULL) - return -ENOMEM; + if (!ubd_dev) + return 0; - disk->major = major; - disk->first_minor = unit << UBD_SHIFT; - disk->fops = &ubd_blops; - set_capacity(disk, size / 512); - if (major == UBD_MAJOR) - sprintf(disk->disk_name, "ubd%c", 'a' + unit); - else - sprintf(disk->disk_name, "ubd_fake%d", unit); - - /* sysfs register (not for ide fake devices) */ - if (major == UBD_MAJOR) { - ubd_devs[unit].pdev.id = unit; - ubd_devs[unit].pdev.name = DRIVER_NAME; - ubd_devs[unit].pdev.dev.release = ubd_device_release; - dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]); - platform_device_register(&ubd_devs[unit].pdev); - disk->driverfs_dev = &ubd_devs[unit].pdev.dev; - } + return sprintf(buf, "%s", ubd_dev->serial); +} - disk->private_data = &ubd_devs[unit]; - disk->queue = ubd_devs[unit].queue; - add_disk(disk); +static DEVICE_ATTR_RO(serial); - *disk_out = disk; - return 0; +static struct attribute *ubd_attrs[] = { + &dev_attr_serial.attr, + NULL, +}; + +static umode_t ubd_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + return a->mode; } -#define ROUND_BLOCK(n) ((n + ((1 << 9) - 1)) & (-1 << 9)) +static const struct attribute_group ubd_attr_group = { + .attrs = ubd_attrs, + .is_visible = ubd_attrs_are_visible, +}; + +static const struct attribute_group *ubd_attr_groups[] = { + &ubd_attr_group, + NULL, +}; + +#define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE)) + +static const struct blk_mq_ops ubd_mq_ops = { + .queue_rq = ubd_queue_rq, +}; static int ubd_add(int n, char **error_out) { struct ubd *ubd_dev = &ubd_devs[n]; + struct queue_limits lim = { + .max_segments = MAX_SG, + .seg_boundary_mask = PAGE_SIZE - 1, + .features = BLK_FEAT_WRITE_CACHE, + }; + struct gendisk *disk; int err = 0; if(ubd_dev->file == NULL) goto out; + if (ubd_dev->cow.file) + lim.max_hw_sectors = 8 * sizeof(long); + if (!ubd_dev->no_trim) { + lim.max_hw_discard_sectors = UBD_MAX_REQUEST; + lim.max_write_zeroes_sectors = UBD_MAX_REQUEST; + } + err = ubd_file_size(ubd_dev, &ubd_dev->size); if(err < 0){ *error_out = "Couldn't determine size of device's file"; goto out; } + err = ubd_open_dev(ubd_dev); + if (err) { + pr_err("ubd%c: Can't open \"%s\": errno = %d\n", + 'a' + n, ubd_dev->file, -err); + goto out; + } + ubd_dev->size = ROUND_BLOCK(ubd_dev->size); - INIT_LIST_HEAD(&ubd_dev->restart); - sg_init_table(ubd_dev->sg, MAX_SG); + ubd_dev->tag_set.ops = &ubd_mq_ops; + ubd_dev->tag_set.queue_depth = 64; + ubd_dev->tag_set.numa_node = NUMA_NO_NODE; + ubd_dev->tag_set.driver_data = ubd_dev; + ubd_dev->tag_set.nr_hw_queues = 1; - err = -ENOMEM; - ubd_dev->queue = blk_init_queue(do_ubd_request, &ubd_dev->lock); - if (ubd_dev->queue == NULL) { - *error_out = "Failed to initialize device queue"; - goto out; - } - ubd_dev->queue->queuedata = ubd_dev; + err = blk_mq_alloc_tag_set(&ubd_dev->tag_set); + if (err) + goto out_close; - blk_queue_max_segments(ubd_dev->queue, MAX_SG); - err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); - if(err){ - *error_out = "Failed to register device"; - goto out_cleanup; + disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); + goto out_cleanup_tags; } - if (fake_major != UBD_MAJOR) - ubd_disk_register(fake_major, ubd_dev->size, n, - &fake_gendisk[n]); + disk->major = UBD_MAJOR; + disk->first_minor = n << UBD_SHIFT; + disk->minors = 1 << UBD_SHIFT; + disk->fops = &ubd_blops; + set_capacity(disk, ubd_dev->size / 512); + sprintf(disk->disk_name, "ubd%c", 'a' + n); + disk->private_data = ubd_dev; + set_disk_ro(disk, !ubd_dev->openflags.w); - /* - * Perhaps this should also be under the "if (fake_major)" above - * using the fake_disk->disk_name - */ - if (fake_ide) - make_ide_entries(ubd_gendisk[n]->disk_name); + ubd_dev->pdev.id = n; + ubd_dev->pdev.name = DRIVER_NAME; + ubd_dev->pdev.dev.release = ubd_device_release; + dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev); + platform_device_register(&ubd_dev->pdev); - err = 0; + err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups); + if (err) + goto out_cleanup_disk; + + ubd_dev->disk = disk; + + return 0; + +out_cleanup_disk: + put_disk(disk); +out_cleanup_tags: + blk_mq_free_tag_set(&ubd_dev->tag_set); +out_close: + ubd_close_dev(ubd_dev); out: return err; - -out_cleanup: - blk_cleanup_queue(ubd_dev->queue); - goto out; } static int ubd_config(char *str, char **error_out) @@ -975,7 +992,6 @@ static int ubd_id(char **str, int *start_out, int *end_out) static int ubd_remove(int n, char **error_out) { - struct gendisk *disk = ubd_gendisk[n]; struct ubd *ubd_dev; int err = -ENODEV; @@ -986,21 +1002,15 @@ static int ubd_remove(int n, char **error_out) if(ubd_dev->file == NULL) goto out; - /* you cannot remove a open disk */ - err = -EBUSY; - if(ubd_dev->count > 0) - goto out; - - ubd_gendisk[n] = NULL; - if(disk != NULL){ - del_gendisk(disk); - put_disk(disk); - } + if (ubd_dev->disk) { + /* you cannot remove a open disk */ + err = -EBUSY; + if (disk_openers(ubd_dev->disk)) + goto out; - if(fake_gendisk[n] != NULL){ - del_gendisk(fake_gendisk[n]); - put_disk(fake_gendisk[n]); - fake_gendisk[n] = NULL; + del_gendisk(ubd_dev->disk); + ubd_close_dev(ubd_dev); + put_disk(ubd_dev->disk); } err = 0; @@ -1059,12 +1069,26 @@ static int __init ubd_init(void) if (register_blkdev(UBD_MAJOR, "ubd")) return -1; - if (fake_major != UBD_MAJOR) { - char name[sizeof("ubd_nnn\0")]; + irq_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE, + sizeof(struct io_thread_req *), + GFP_KERNEL + ); + irq_remainder = 0; - snprintf(name, sizeof(name), "ubd_%d", fake_major); - if (register_blkdev(fake_major, "ubd")) - return -1; + if (irq_req_buffer == NULL) { + printk(KERN_ERR "Failed to initialize ubd buffering\n"); + return -ENOMEM; + } + io_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE, + sizeof(struct io_thread_req *), + GFP_KERNEL + ); + + io_remainder = 0; + + if (io_req_buffer == NULL) { + printk(KERN_ERR "Failed to initialize ubd buffering\n"); + return -ENOMEM; } platform_driver_register(&ubd_driver); mutex_lock(&ubd_lock); @@ -1080,8 +1104,8 @@ static int __init ubd_init(void) late_initcall(ubd_init); -static int __init ubd_driver_init(void){ - unsigned long stack; +static int __init ubd_driver_init(void) +{ int err; /* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/ @@ -1090,73 +1114,31 @@ static int __init ubd_driver_init(void){ /* Letting ubd=sync be like using ubd#s= instead of ubd#= is * enough. So use anyway the io thread. */ } - stack = alloc_stack(0, 0); - io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), - &thread_fd); - if(io_pid < 0){ + err = start_io_thread(&io_td, &thread_fd); + if (err < 0) { printk(KERN_ERR "ubd : Failed to start I/O thread (errno = %d) - " - "falling back to synchronous I/O\n", -io_pid); - io_pid = -1; + "falling back to synchronous I/O\n", -err); return 0; } err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, 0, "ubd", ubd_devs); - if(err != 0) + if(err < 0) printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err); return 0; } device_initcall(ubd_driver_init); -static int ubd_open(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - struct ubd *ubd_dev = disk->private_data; - int err = 0; - - mutex_lock(&ubd_mutex); - if(ubd_dev->count == 0){ - err = ubd_open_dev(ubd_dev); - if(err){ - printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", - disk->disk_name, ubd_dev->file, -err); - goto out; - } - } - ubd_dev->count++; - set_disk_ro(disk, !ubd_dev->openflags.w); - - /* This should no more be needed. And it didn't work anyway to exclude - * read-write remounting of filesystems.*/ - /*if((mode & FMODE_WRITE) && !ubd_dev->openflags.w){ - if(--ubd_dev->count == 0) ubd_close_dev(ubd_dev); - err = -EROFS; - }*/ -out: - mutex_unlock(&ubd_mutex); - return err; -} - -static void ubd_release(struct gendisk *disk, fmode_t mode) -{ - struct ubd *ubd_dev = disk->private_data; - - mutex_lock(&ubd_mutex); - if(--ubd_dev->count == 0) - ubd_close_dev(ubd_dev); - mutex_unlock(&ubd_mutex); -} - static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, __u64 *cow_offset, unsigned long *bitmap, __u64 bitmap_offset, unsigned long *bitmap_words, __u64 bitmap_len) { - __u64 sector = io_offset >> 9; + __u64 sector = io_offset >> SECTOR_SHIFT; int i, update_bitmap = 0; - for(i = 0; i < length >> 9; i++){ + for (i = 0; i < length >> SECTOR_SHIFT; i++) { if(cow_mask != NULL) ubd_set_bit(i, (unsigned char *) cow_mask); if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) @@ -1187,115 +1169,164 @@ static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, *cow_offset += bitmap_offset; } -static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, +static void cowify_req(struct io_thread_req *req, struct io_desc *segment, + unsigned long offset, unsigned long *bitmap, __u64 bitmap_offset, __u64 bitmap_len) { - __u64 sector = req->offset >> 9; + __u64 sector = offset >> SECTOR_SHIFT; int i; - if(req->length > (sizeof(req->sector_mask) * 8) << 9) + if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT) panic("Operation too long"); - if(req->op == UBD_READ) { - for(i = 0; i < req->length >> 9; i++){ + if (req_op(req->req) == REQ_OP_READ) { + for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) { if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) ubd_set_bit(i, (unsigned char *) - &req->sector_mask); + &segment->sector_mask); } + } else { + cowify_bitmap(offset, segment->length, &segment->sector_mask, + &segment->cow_offset, bitmap, bitmap_offset, + segment->bitmap_words, bitmap_len); } - else cowify_bitmap(req->offset, req->length, &req->sector_mask, - &req->cow_offset, bitmap, bitmap_offset, - req->bitmap_words, bitmap_len); } -/* Called with dev->lock held */ -static void prepare_request(struct request *req, struct io_thread_req *io_req, - unsigned long long offset, int page_offset, - int len, struct page *page) +static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, + struct request *req) { - struct gendisk *disk = req->rq_disk; - struct ubd *ubd_dev = disk->private_data; + struct bio_vec bvec; + struct req_iterator iter; + int i = 0; + unsigned long byte_offset = io_req->offset; + enum req_op op = req_op(req); + + if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) { + io_req->io_desc[0].buffer = NULL; + io_req->io_desc[0].length = blk_rq_bytes(req); + } else { + rq_for_each_segment(bvec, req, iter) { + BUG_ON(i >= io_req->desc_cnt); + + io_req->io_desc[i].buffer = bvec_virt(&bvec); + io_req->io_desc[i].length = bvec.bv_len; + i++; + } + } + + if (dev->cow.file) { + for (i = 0; i < io_req->desc_cnt; i++) { + cowify_req(io_req, &io_req->io_desc[i], byte_offset, + dev->cow.bitmap, dev->cow.bitmap_offset, + dev->cow.bitmap_len); + byte_offset += io_req->io_desc[i].length; + } + + } +} + +static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req, + int desc_cnt) +{ + struct io_thread_req *io_req; + int i; + + io_req = kmalloc(sizeof(*io_req) + + (desc_cnt * sizeof(struct io_desc)), + GFP_ATOMIC); + if (!io_req) + return NULL; io_req->req = req; - io_req->fds[0] = (ubd_dev->cow.file != NULL) ? ubd_dev->cow.fd : - ubd_dev->fd; - io_req->fds[1] = ubd_dev->fd; - io_req->cow_offset = -1; - io_req->offset = offset; - io_req->length = len; + if (dev->cow.file) + io_req->fds[0] = dev->cow.fd; + else + io_req->fds[0] = dev->fd; io_req->error = 0; - io_req->sector_mask = 0; - - io_req->op = (rq_data_dir(req) == READ) ? UBD_READ : UBD_WRITE; + io_req->sectorsize = SECTOR_SIZE; + io_req->fds[1] = dev->fd; + io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT; io_req->offsets[0] = 0; - io_req->offsets[1] = ubd_dev->cow.data_offset; - io_req->buffer = page_address(page) + page_offset; - io_req->sectorsize = 1 << 9; + io_req->offsets[1] = dev->cow.data_offset; - if(ubd_dev->cow.file != NULL) - cowify_req(io_req, ubd_dev->cow.bitmap, - ubd_dev->cow.bitmap_offset, ubd_dev->cow.bitmap_len); + for (i = 0 ; i < desc_cnt; i++) { + io_req->io_desc[i].sector_mask = 0; + io_req->io_desc[i].cow_offset = -1; + } + return io_req; } -/* Called with dev->lock held */ -static void do_ubd_request(struct request_queue *q) +static int ubd_submit_request(struct ubd *dev, struct request *req) { + int segs = 0; struct io_thread_req *io_req; - struct request *req; - int n; + int ret; + enum req_op op = req_op(req); - while(1){ - struct ubd *dev = q->queuedata; - if(dev->end_sg == 0){ - struct request *req = blk_fetch_request(q); - if(req == NULL) - return; + if (op == REQ_OP_FLUSH) + segs = 0; + else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) + segs = 1; + else + segs = blk_rq_nr_phys_segments(req); - dev->request = req; - dev->rq_pos = blk_rq_pos(req); - dev->start_sg = 0; - dev->end_sg = blk_rq_map_sg(q, req, dev->sg); - } + io_req = ubd_alloc_req(dev, req, segs); + if (!io_req) + return -ENOMEM; - req = dev->request; - while(dev->start_sg < dev->end_sg){ - struct scatterlist *sg = &dev->sg[dev->start_sg]; + io_req->desc_cnt = segs; + if (segs) + ubd_map_req(dev, io_req, req); - io_req = kmalloc(sizeof(struct io_thread_req), - GFP_ATOMIC); - if(io_req == NULL){ - if(list_empty(&dev->restart)) - list_add(&dev->restart, &restart); - return; - } - prepare_request(req, io_req, - (unsigned long long)dev->rq_pos << 9, - sg->offset, sg->length, sg_page(sg)); - - n = os_write_file(thread_fd, &io_req, - sizeof(struct io_thread_req *)); - if(n != sizeof(struct io_thread_req *)){ - if(n != -EAGAIN) - printk("write to io thread failed, " - "errno = %d\n", -n); - else if(list_empty(&dev->restart)) - list_add(&dev->restart, &restart); - kfree(io_req); - return; - } + ret = os_write_file(thread_fd, &io_req, sizeof(io_req)); + if (ret != sizeof(io_req)) { + if (ret != -EAGAIN) + pr_err("write to io thread failed: %d\n", -ret); + kfree(io_req); + } + return ret; +} - dev->rq_pos += sg->length >> 9; - dev->start_sg++; - } - dev->end_sg = 0; - dev->request = NULL; +static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ubd *ubd_dev = hctx->queue->queuedata; + struct request *req = bd->rq; + int ret = 0, res = BLK_STS_OK; + + blk_mq_start_request(req); + + spin_lock_irq(&ubd_dev->lock); + + switch (req_op(req)) { + case REQ_OP_FLUSH: + case REQ_OP_READ: + case REQ_OP_WRITE: + case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: + ret = ubd_submit_request(ubd_dev, req); + break; + default: + WARN_ON_ONCE(1); + res = BLK_STS_NOTSUPP; } + + spin_unlock_irq(&ubd_dev->lock); + + if (ret < 0) { + if (ret == -ENOMEM) + res = BLK_STS_RESOURCE; + else + res = BLK_STS_DEV_RESOURCE; + } + + return res; } -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct ubd *ubd_dev = bdev->bd_disk->private_data; + struct ubd *ubd_dev = disk->private_data; geo->heads = 128; geo->sectors = 32; @@ -1303,7 +1334,7 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static int ubd_ioctl(struct block_device *bdev, fmode_t mode, +static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct ubd *ubd_dev = bdev->bd_disk->private_data; @@ -1335,87 +1366,124 @@ static int ubd_ioctl(struct block_device *bdev, fmode_t mode, return -EINVAL; } -static int update_bitmap(struct io_thread_req *req) +static int map_error(int error_code) { - int n; + switch (error_code) { + case 0: + return BLK_STS_OK; + case ENOSYS: + case EOPNOTSUPP: + return BLK_STS_NOTSUPP; + case ENOSPC: + return BLK_STS_NOSPC; + } + return BLK_STS_IOERR; +} - if(req->cow_offset == -1) - return 0; +/* + * Everything from here onwards *IS NOT PART OF THE KERNEL* + * + * The following functions are part of UML hypervisor code. + * All functions from here onwards are executed as a helper + * thread and are not allowed to execute any kernel functions. + * + * Any communication must occur strictly via shared memory and IPC. + * + * Do not add printks, locks, kernel memory operations, etc - it + * will result in unpredictable behaviour and/or crashes. + */ - n = os_seek_file(req->fds[1], req->cow_offset); - if(n < 0){ - printk("do_io - bitmap lseek failed : err = %d\n", -n); - return 1; - } +static int update_bitmap(struct io_thread_req *req, struct io_desc *segment) +{ + int n; - n = os_write_file(req->fds[1], &req->bitmap_words, - sizeof(req->bitmap_words)); - if(n != sizeof(req->bitmap_words)){ - printk("do_io - bitmap update failed, err = %d fd = %d\n", -n, - req->fds[1]); - return 1; - } + if (segment->cow_offset == -1) + return map_error(0); - return 0; + n = os_pwrite_file(req->fds[1], &segment->bitmap_words, + sizeof(segment->bitmap_words), segment->cow_offset); + if (n != sizeof(segment->bitmap_words)) + return map_error(-n); + + return map_error(0); } -static void do_io(struct io_thread_req *req) +static void do_io(struct io_thread_req *req, struct io_desc *desc) { - char *buf; + char *buf = NULL; unsigned long len; int n, nsectors, start, end, bit; - int err; __u64 off; - nsectors = req->length / req->sectorsize; + /* FLUSH is really a special case, we cannot "case" it with others */ + + if (req_op(req->req) == REQ_OP_FLUSH) { + /* fds[0] is always either the rw image or our cow file */ + req->error = map_error(-os_sync_file(req->fds[0])); + return; + } + + nsectors = desc->length / req->sectorsize; start = 0; do { - bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask); + bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask); end = start; while((end < nsectors) && - (ubd_test_bit(end, (unsigned char *) - &req->sector_mask) == bit)) + (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit)) end++; off = req->offset + req->offsets[bit] + start * req->sectorsize; len = (end - start) * req->sectorsize; - buf = &req->buffer[start * req->sectorsize]; + if (desc->buffer != NULL) + buf = &desc->buffer[start * req->sectorsize]; - err = os_seek_file(req->fds[bit], off); - if(err < 0){ - printk("do_io - lseek failed : err = %d\n", -err); - req->error = 1; - return; - } - if(req->op == UBD_READ){ + switch (req_op(req->req)) { + case REQ_OP_READ: n = 0; do { buf = &buf[n]; len -= n; - n = os_read_file(req->fds[bit], buf, len); + n = os_pread_file(req->fds[bit], buf, len, off); if (n < 0) { - printk("do_io - read failed, err = %d " - "fd = %d\n", -n, req->fds[bit]); - req->error = 1; + req->error = map_error(-n); return; } } while((n < len) && (n != 0)); if (n < len) memset(&buf[n], 0, len - n); - } else { - n = os_write_file(req->fds[bit], buf, len); + break; + case REQ_OP_WRITE: + n = os_pwrite_file(req->fds[bit], buf, len, off); if(n != len){ - printk("do_io - write failed err = %d " - "fd = %d\n", -n, req->fds[bit]); - req->error = 1; + req->error = map_error(-n); return; } + break; + case REQ_OP_DISCARD: + n = os_falloc_punch(req->fds[bit], off, len); + if (n) { + req->error = map_error(-n); + return; + } + break; + case REQ_OP_WRITE_ZEROES: + n = os_falloc_zeroes(req->fds[bit], off, len); + if (n) { + req->error = map_error(-n); + return; + } + break; + default: + WARN_ON_ONCE(1); + req->error = BLK_STS_NOTSUPP; + return; } start = end; } while(start < nsectors); - req->error = update_bitmap(req); + req->offset += len; + req->error = update_bitmap(req, desc); } /* Changed in start_io_thread, which is serialized by being called only @@ -1424,35 +1492,53 @@ static void do_io(struct io_thread_req *req) int kernel_fd = -1; /* Only changed by the io thread. XXX: currently unused. */ -static int io_count = 0; +static int io_count; -int io_thread(void *arg) +void *io_thread(void *arg) { - struct io_thread_req *req; - int n; + int n, count, written, res; + + os_fix_helper_thread_signals(); - ignore_sigwinch_sig(); while(1){ - n = os_read_file(kernel_fd, &req, - sizeof(struct io_thread_req *)); - if(n != sizeof(struct io_thread_req *)){ - if(n < 0) - printk("io_thread - read failed, fd = %d, " - "err = %d\n", kernel_fd, -n); - else { - printk("io_thread - short read, fd = %d, " - "length = %d\n", kernel_fd, n); - } + n = bulk_req_safe_read( + kernel_fd, + io_req_buffer, + &io_remainder, + &io_remainder_size, + UBD_REQ_BUFFER_SIZE + ); + if (n <= 0) { + if (n == -EAGAIN) + ubd_read_poll(-1); + continue; } - io_count++; - do_io(req); - n = os_write_file(kernel_fd, &req, - sizeof(struct io_thread_req *)); - if(n != sizeof(struct io_thread_req *)) - printk("io_thread - write failed, fd = %d, err = %d\n", - kernel_fd, -n); + + for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { + struct io_thread_req *req = (*io_req_buffer)[count]; + int i; + + io_count++; + for (i = 0; !req->error && i < req->desc_cnt; i++) + do_io(req, &(req->io_desc[i])); + + } + + written = 0; + + do { + res = os_write_file(kernel_fd, + ((char *) io_req_buffer) + written, + n - written); + if (res >= 0) { + written += res; + } + if (written < n) { + ubd_write_poll(-1); + } + } while (written < n); } - return 0; + return NULL; } diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c index a703e45d8aac..8e8a8bf518b6 100644 --- a/arch/um/drivers/ubd_user.c +++ b/arch/um/drivers/ubd_user.c @@ -1,7 +1,8 @@ -/* +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2016 Anton Ivanov (aivanov@brocade.com) * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) * Copyright (C) 2001 Ridgerun,Inc (glonnon@ridgerun.com) - * Licensed under the GPL */ #include <stddef.h> @@ -20,15 +21,13 @@ #include "ubd.h" #include <os.h> +#include <poll.h> -void ignore_sigwinch_sig(void) -{ - signal(SIGWINCH, SIG_IGN); -} +static struct pollfd kernel_pollfd; -int start_io_thread(unsigned long sp, int *fd_out) +int start_io_thread(struct os_helper_thread **td_out, int *fd_out) { - int pid, fds[2], err; + int fds[2], err; err = os_pipe(fds, 1, 1); if(err < 0){ @@ -37,22 +36,25 @@ int start_io_thread(unsigned long sp, int *fd_out) } kernel_fd = fds[0]; + kernel_pollfd.fd = kernel_fd; + kernel_pollfd.events = POLLIN; *fd_out = fds[1]; err = os_set_fd_block(*fd_out, 0); + err |= os_set_fd_block(kernel_fd, 0); if (err) { printk("start_io_thread - failed to set nonblocking I/O.\n"); goto out_close; } - pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM, NULL); - if(pid < 0){ - err = -errno; - printk("start_io_thread - clone failed : errno = %d\n", errno); + err = os_run_helper_thread(td_out, io_thread, NULL); + if (err < 0) { + printk("%s - failed to run helper thread, err = %d\n", + __func__, -err); goto out_close; } - return(pid); + return 0; out_close: os_close_file(fds[0]); @@ -62,3 +64,15 @@ int start_io_thread(unsigned long sp, int *fd_out) out: return err; } + +int ubd_read_poll(int timeout) +{ + kernel_pollfd.events = POLLIN; + return poll(&kernel_pollfd, 1, timeout); +} +int ubd_write_poll(int timeout) +{ + kernel_pollfd.events = POLLOUT; + return poll(&kernel_pollfd, 1, timeout); +} + diff --git a/arch/um/drivers/umcast.h b/arch/um/drivers/umcast.h deleted file mode 100644 index c190c6440911..000000000000 --- a/arch/um/drivers/umcast.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __DRIVERS_UMCAST_H -#define __DRIVERS_UMCAST_H - -#include <net_user.h> - -struct umcast_data { - char *addr; - unsigned short lport; - unsigned short rport; - void *listen_addr; - void *remote_addr; - int ttl; - int unicast; - void *dev; -}; - -extern const struct net_user_info umcast_user_info; - -extern int umcast_user_write(int fd, void *buf, int len, - struct umcast_data *pri); - -#endif diff --git a/arch/um/drivers/umcast_kern.c b/arch/um/drivers/umcast_kern.c deleted file mode 100644 index f5ba6e377913..000000000000 --- a/arch/um/drivers/umcast_kern.c +++ /dev/null @@ -1,188 +0,0 @@ -/* - * user-mode-linux networking multicast transport - * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org> - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * - * based on the existing uml-networking code, which is - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * - * Licensed under the GPL. - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include "umcast.h" -#include <net_kern.h> - -struct umcast_init { - char *addr; - int lport; - int rport; - int ttl; - bool unicast; -}; - -static void umcast_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct umcast_data *dpri; - struct umcast_init *init = data; - - pri = netdev_priv(dev); - dpri = (struct umcast_data *) pri->user; - dpri->addr = init->addr; - dpri->lport = init->lport; - dpri->rport = init->rport; - dpri->unicast = init->unicast; - dpri->ttl = init->ttl; - dpri->dev = dev; - - if (dpri->unicast) { - printk(KERN_INFO "ucast backend address: %s:%u listen port: " - "%u\n", dpri->addr, dpri->rport, dpri->lport); - } else { - printk(KERN_INFO "mcast backend multicast address: %s:%u, " - "TTL:%u\n", dpri->addr, dpri->lport, dpri->ttl); - } -} - -static int umcast_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int umcast_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return umcast_user_write(fd, skb->data, skb->len, - (struct umcast_data *) &lp->user); -} - -static const struct net_kern_info umcast_kern_info = { - .init = umcast_init, - .protocol = eth_protocol, - .read = umcast_read, - .write = umcast_write, -}; - -static int mcast_setup(char *str, char **mac_out, void *data) -{ - struct umcast_init *init = data; - char *port_str = NULL, *ttl_str = NULL, *remain; - char *last; - - *init = ((struct umcast_init) - { .addr = "239.192.168.1", - .lport = 1102, - .ttl = 1 }); - - remain = split_if_spec(str, mac_out, &init->addr, &port_str, &ttl_str, - NULL); - if (remain != NULL) { - printk(KERN_ERR "mcast_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (port_str != NULL) { - init->lport = simple_strtoul(port_str, &last, 10); - if ((*last != '\0') || (last == port_str)) { - printk(KERN_ERR "mcast_setup - Bad port : '%s'\n", - port_str); - return 0; - } - } - - if (ttl_str != NULL) { - init->ttl = simple_strtoul(ttl_str, &last, 10); - if ((*last != '\0') || (last == ttl_str)) { - printk(KERN_ERR "mcast_setup - Bad ttl : '%s'\n", - ttl_str); - return 0; - } - } - - init->unicast = false; - init->rport = init->lport; - - printk(KERN_INFO "Configured mcast device: %s:%u-%u\n", init->addr, - init->lport, init->ttl); - - return 1; -} - -static int ucast_setup(char *str, char **mac_out, void *data) -{ - struct umcast_init *init = data; - char *lport_str = NULL, *rport_str = NULL, *remain; - char *last; - - *init = ((struct umcast_init) - { .addr = "", - .lport = 1102, - .rport = 1102 }); - - remain = split_if_spec(str, mac_out, &init->addr, - &lport_str, &rport_str, NULL); - if (remain != NULL) { - printk(KERN_ERR "ucast_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (lport_str != NULL) { - init->lport = simple_strtoul(lport_str, &last, 10); - if ((*last != '\0') || (last == lport_str)) { - printk(KERN_ERR "ucast_setup - Bad listen port : " - "'%s'\n", lport_str); - return 0; - } - } - - if (rport_str != NULL) { - init->rport = simple_strtoul(rport_str, &last, 10); - if ((*last != '\0') || (last == rport_str)) { - printk(KERN_ERR "ucast_setup - Bad remote port : " - "'%s'\n", rport_str); - return 0; - } - } - - init->unicast = true; - - printk(KERN_INFO "Configured ucast device: :%u -> %s:%u\n", - init->lport, init->addr, init->rport); - - return 1; -} - -static struct transport mcast_transport = { - .list = LIST_HEAD_INIT(mcast_transport.list), - .name = "mcast", - .setup = mcast_setup, - .user = &umcast_user_info, - .kern = &umcast_kern_info, - .private_size = sizeof(struct umcast_data), - .setup_size = sizeof(struct umcast_init), -}; - -static struct transport ucast_transport = { - .list = LIST_HEAD_INIT(ucast_transport.list), - .name = "ucast", - .setup = ucast_setup, - .user = &umcast_user_info, - .kern = &umcast_kern_info, - .private_size = sizeof(struct umcast_data), - .setup_size = sizeof(struct umcast_init), -}; - -static int register_umcast(void) -{ - register_transport(&mcast_transport); - register_transport(&ucast_transport); - return 0; -} - -late_initcall(register_umcast); diff --git a/arch/um/drivers/umcast_user.c b/arch/um/drivers/umcast_user.c deleted file mode 100644 index 6074184bb51b..000000000000 --- a/arch/um/drivers/umcast_user.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * user-mode-linux networking multicast transport - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org> - * - * based on the existing uml-networking code, which is - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * - * Licensed under the GPL. - * - */ - -#include <unistd.h> -#include <errno.h> -#include <netinet/in.h> -#include "umcast.h" -#include <net_user.h> -#include <um_malloc.h> - -static struct sockaddr_in *new_addr(char *addr, unsigned short port) -{ - struct sockaddr_in *sin; - - sin = uml_kmalloc(sizeof(struct sockaddr_in), UM_GFP_KERNEL); - if (sin == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_in " - "failed\n"); - return NULL; - } - sin->sin_family = AF_INET; - if (addr) - sin->sin_addr.s_addr = in_aton(addr); - else - sin->sin_addr.s_addr = INADDR_ANY; - sin->sin_port = htons(port); - return sin; -} - -static int umcast_user_init(void *data, void *dev) -{ - struct umcast_data *pri = data; - - pri->remote_addr = new_addr(pri->addr, pri->rport); - if (pri->unicast) - pri->listen_addr = new_addr(NULL, pri->lport); - else - pri->listen_addr = pri->remote_addr; - pri->dev = dev; - return 0; -} - -static void umcast_remove(void *data) -{ - struct umcast_data *pri = data; - - kfree(pri->listen_addr); - if (pri->unicast) - kfree(pri->remote_addr); - pri->listen_addr = pri->remote_addr = NULL; -} - -static int umcast_open(void *data) -{ - struct umcast_data *pri = data; - struct sockaddr_in *lsin = pri->listen_addr; - struct sockaddr_in *rsin = pri->remote_addr; - struct ip_mreq mreq; - int fd, yes = 1, err = -EINVAL; - - - if ((!pri->unicast && lsin->sin_addr.s_addr == 0) || - (rsin->sin_addr.s_addr == 0) || - (lsin->sin_port == 0) || (rsin->sin_port == 0)) - goto out; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - - if (fd < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open : data socket failed, " - "errno = %d\n", errno); - goto out; - } - - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: SO_REUSEADDR failed, " - "errno = %d\n", errno); - goto out_close; - } - - if (!pri->unicast) { - /* set ttl according to config */ - if (setsockopt(fd, SOL_IP, IP_MULTICAST_TTL, &pri->ttl, - sizeof(pri->ttl)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_TTL " - "failed, error = %d\n", errno); - goto out_close; - } - - /* set LOOP, so data does get fed back to local sockets */ - if (setsockopt(fd, SOL_IP, IP_MULTICAST_LOOP, - &yes, sizeof(yes)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_LOOP " - "failed, error = %d\n", errno); - goto out_close; - } - } - - /* bind socket to the address */ - if (bind(fd, (struct sockaddr *) lsin, sizeof(*lsin)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open : data bind failed, " - "errno = %d\n", errno); - goto out_close; - } - - if (!pri->unicast) { - /* subscribe to the multicast group */ - mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr; - mreq.imr_interface.s_addr = 0; - if (setsockopt(fd, SOL_IP, IP_ADD_MEMBERSHIP, - &mreq, sizeof(mreq)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_ADD_MEMBERSHIP " - "failed, error = %d\n", errno); - printk(UM_KERN_ERR "There appears not to be a " - "multicast-capable network interface on the " - "host.\n"); - printk(UM_KERN_ERR "eth0 should be configured in order " - "to use the multicast transport.\n"); - goto out_close; - } - } - - return fd; - - out_close: - close(fd); - out: - return err; -} - -static void umcast_close(int fd, void *data) -{ - struct umcast_data *pri = data; - - if (!pri->unicast) { - struct ip_mreq mreq; - struct sockaddr_in *lsin = pri->listen_addr; - - mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr; - mreq.imr_interface.s_addr = 0; - if (setsockopt(fd, SOL_IP, IP_DROP_MEMBERSHIP, - &mreq, sizeof(mreq)) < 0) { - printk(UM_KERN_ERR "umcast_close: IP_DROP_MEMBERSHIP " - "failed, error = %d\n", errno); - } - } - - close(fd); -} - -int umcast_user_write(int fd, void *buf, int len, struct umcast_data *pri) -{ - struct sockaddr_in *data_addr = pri->remote_addr; - - return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)); -} - -const struct net_user_info umcast_user_info = { - .init = umcast_user_init, - .open = umcast_open, - .close = umcast_close, - .remove = umcast_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/vde.h b/arch/um/drivers/vde.h deleted file mode 100644 index fc3a05902ba1..000000000000 --- a/arch/um/drivers/vde.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - * Licensed under the GPL. - */ - -#ifndef __UM_VDE_H__ -#define __UM_VDE_H__ - -struct vde_data { - char *vde_switch; - char *descr; - void *args; - void *conn; - void *dev; -}; - -struct vde_init { - char *vde_switch; - char *descr; - int port; - char *group; - int mode; -}; - -extern const struct net_user_info vde_user_info; - -extern void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init); - -extern int vde_user_read(void *conn, void *buf, int len); -extern int vde_user_write(void *conn, void *buf, int len); - -#endif diff --git a/arch/um/drivers/vde_kern.c b/arch/um/drivers/vde_kern.c deleted file mode 100644 index 6a365fadc7c4..000000000000 --- a/arch/um/drivers/vde_kern.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - * Licensed under the GPL. - * - * Transport usage: - * ethN=vde,<vde_switch>,<mac addr>,<port>,<group>,<mode>,<description> - * - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include <net_user.h> -#include "vde.h" - -static void vde_init(struct net_device *dev, void *data) -{ - struct vde_init *init = data; - struct uml_net_private *pri; - struct vde_data *vpri; - - pri = netdev_priv(dev); - vpri = (struct vde_data *) pri->user; - - vpri->vde_switch = init->vde_switch; - vpri->descr = init->descr ? init->descr : "UML vde_transport"; - vpri->args = NULL; - vpri->conn = NULL; - vpri->dev = dev; - - printk("vde backend - %s, ", vpri->vde_switch ? - vpri->vde_switch : "(default socket)"); - - vde_init_libstuff(vpri, init); - - printk("\n"); -} - -static int vde_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - struct vde_data *pri = (struct vde_data *) &lp->user; - - if (pri->conn != NULL) - return vde_user_read(pri->conn, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); - - printk(KERN_ERR "vde_read - we have no VDECONN to read from"); - return -EBADF; -} - -static int vde_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - struct vde_data *pri = (struct vde_data *) &lp->user; - - if (pri->conn != NULL) - return vde_user_write((void *)pri->conn, skb->data, - skb->len); - - printk(KERN_ERR "vde_write - we have no VDECONN to write to"); - return -EBADF; -} - -static const struct net_kern_info vde_kern_info = { - .init = vde_init, - .protocol = eth_protocol, - .read = vde_read, - .write = vde_write, -}; - -static int vde_setup(char *str, char **mac_out, void *data) -{ - struct vde_init *init = data; - char *remain, *port_str = NULL, *mode_str = NULL, *last; - - *init = ((struct vde_init) - { .vde_switch = NULL, - .descr = NULL, - .port = 0, - .group = NULL, - .mode = 0 }); - - remain = split_if_spec(str, &init->vde_switch, mac_out, &port_str, - &init->group, &mode_str, &init->descr, NULL); - - if (remain != NULL) - printk(KERN_WARNING "vde_setup - Ignoring extra data :" - "'%s'\n", remain); - - if (port_str != NULL) { - init->port = simple_strtoul(port_str, &last, 10); - if ((*last != '\0') || (last == port_str)) { - printk(KERN_ERR "vde_setup - Bad port : '%s'\n", - port_str); - return 0; - } - } - - if (mode_str != NULL) { - init->mode = simple_strtoul(mode_str, &last, 8); - if ((*last != '\0') || (last == mode_str)) { - printk(KERN_ERR "vde_setup - Bad mode : '%s'\n", - mode_str); - return 0; - } - } - - printk(KERN_INFO "Configured vde device: %s\n", init->vde_switch ? - init->vde_switch : "(default socket)"); - - return 1; -} - -static struct transport vde_transport = { - .list = LIST_HEAD_INIT(vde_transport.list), - .name = "vde", - .setup = vde_setup, - .user = &vde_user_info, - .kern = &vde_kern_info, - .private_size = sizeof(struct vde_data), - .setup_size = sizeof(struct vde_init), -}; - -static int register_vde(void) -{ - register_transport(&vde_transport); - return 0; -} - -late_initcall(register_vde); diff --git a/arch/um/drivers/vde_user.c b/arch/um/drivers/vde_user.c deleted file mode 100644 index 64cb630d1157..000000000000 --- a/arch/um/drivers/vde_user.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - * Licensed under the GPL. - */ - -#include <stddef.h> -#include <errno.h> -#include <libvdeplug.h> -#include <net_user.h> -#include <um_malloc.h> -#include "vde.h" - -static int vde_user_init(void *data, void *dev) -{ - struct vde_data *pri = data; - VDECONN *conn = NULL; - int err = -EINVAL; - - pri->dev = dev; - - conn = vde_open(pri->vde_switch, pri->descr, pri->args); - - if (conn == NULL) { - err = -errno; - printk(UM_KERN_ERR "vde_user_init: vde_open failed, " - "errno = %d\n", errno); - return err; - } - - printk(UM_KERN_INFO "vde backend - connection opened\n"); - - pri->conn = conn; - - return 0; -} - -static int vde_user_open(void *data) -{ - struct vde_data *pri = data; - - if (pri->conn != NULL) - return vde_datafd(pri->conn); - - printk(UM_KERN_WARNING "vde_open - we have no VDECONN to open"); - return -EINVAL; -} - -static void vde_remove(void *data) -{ - struct vde_data *pri = data; - - if (pri->conn != NULL) { - printk(UM_KERN_INFO "vde backend - closing connection\n"); - vde_close(pri->conn); - pri->conn = NULL; - kfree(pri->args); - pri->args = NULL; - return; - } - - printk(UM_KERN_WARNING "vde_remove - we have no VDECONN to remove"); -} - -const struct net_user_info vde_user_info = { - .init = vde_user_init, - .open = vde_user_open, - .close = NULL, - .remove = vde_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; - -void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init) -{ - struct vde_open_args *args; - - vpri->args = uml_kmalloc(sizeof(struct vde_open_args), UM_GFP_KERNEL); - if (vpri->args == NULL) { - printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args " - "allocation failed"); - return; - } - - args = vpri->args; - - args->port = init->port; - args->group = init->group; - args->mode = init->mode ? init->mode : 0700; - - args->port ? printk("port %d", args->port) : - printk("undefined port"); -} - -int vde_user_read(void *conn, void *buf, int len) -{ - VDECONN *vconn = conn; - int rv; - - if (vconn == NULL) - return 0; - - rv = vde_recv(vconn, buf, len, 0); - if (rv < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (rv == 0) - return -ENOTCONN; - - return rv; -} - -int vde_user_write(void *conn, void *buf, int len) -{ - VDECONN *vconn = conn; - - if (vconn == NULL) - return 0; - - return vde_send(vconn, buf, len, 0); -} - diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c new file mode 100644 index 000000000000..25d9258fa592 --- /dev/null +++ b/arch/um/drivers/vector_kern.c @@ -0,0 +1,1771 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 - 2019 Cambridge Greys Limited + * Copyright (C) 2011 - 2014 Cisco Systems Inc + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and + * James Leu (jleu@mindspring.net). + * Copyright (C) 2001 by various other people who didn't put their name here. + */ + +#define pr_fmt(fmt) "uml-vector: " fmt + +#include <linux/memblock.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/inetdevice.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/platform_device.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/firmware.h> +#include <linux/fs.h> +#include <asm/atomic.h> +#include <uapi/linux/filter.h> +#include <init.h> +#include <irq_kern.h> +#include <irq_user.h> +#include <os.h> +#include "mconsole_kern.h" +#include "vector_user.h" +#include "vector_kern.h" + +/* + * Adapted from network devices with the following major changes: + * All transports are static - simplifies the code significantly + * Multiple FDs/IRQs per device + * Vector IO optionally used for read/write, falling back to legacy + * based on configuration and/or availability + * Configuration is no longer positional - L2TPv3 and GRE require up to + * 10 parameters, passing this as positional is not fit for purpose. + * Only socket transports are supported + */ + + +#define DRIVER_NAME "uml-vector" +struct vector_cmd_line_arg { + struct list_head list; + int unit; + char *arguments; +}; + +struct vector_device { + struct list_head list; + struct net_device *dev; + struct platform_device pdev; + int unit; + int opened; +}; + +static LIST_HEAD(vec_cmd_line); + +static DEFINE_SPINLOCK(vector_devices_lock); +static LIST_HEAD(vector_devices); + +static int driver_registered; + +static void vector_eth_configure(int n, struct arglist *def); +static int vector_mmsg_rx(struct vector_private *vp, int budget); + +/* Argument accessors to set variables (and/or set default values) + * mtu, buffer sizing, default headroom, etc + */ + +#define DEFAULT_HEADROOM 2 +#define SAFETY_MARGIN 32 +#define DEFAULT_VECTOR_SIZE 64 +#define TX_SMALL_PACKET 128 +#define MAX_IOV_SIZE (MAX_SKB_FRAGS + 1) + +static const struct { + const char string[ETH_GSTRING_LEN]; +} ethtool_stats_keys[] = { + { "rx_queue_max" }, + { "rx_queue_running_average" }, + { "tx_queue_max" }, + { "tx_queue_running_average" }, + { "rx_encaps_errors" }, + { "tx_timeout_count" }, + { "tx_restart_queue" }, + { "tx_kicks" }, + { "tx_flow_control_xon" }, + { "tx_flow_control_xoff" }, + { "rx_csum_offload_good" }, + { "rx_csum_offload_errors"}, + { "sg_ok"}, + { "sg_linearized"}, +}; + +#define VECTOR_NUM_STATS ARRAY_SIZE(ethtool_stats_keys) + +static void vector_reset_stats(struct vector_private *vp) +{ + /* We reuse the existing queue locks for stats */ + + /* RX stats are modified with RX head_lock held + * in vector_poll. + */ + + spin_lock(&vp->rx_queue->head_lock); + vp->estats.rx_queue_max = 0; + vp->estats.rx_queue_running_average = 0; + vp->estats.rx_encaps_errors = 0; + vp->estats.sg_ok = 0; + vp->estats.sg_linearized = 0; + spin_unlock(&vp->rx_queue->head_lock); + + /* TX stats are modified with TX head_lock held + * in vector_send. + */ + + spin_lock(&vp->tx_queue->head_lock); + vp->estats.tx_timeout_count = 0; + vp->estats.tx_restart_queue = 0; + vp->estats.tx_kicks = 0; + vp->estats.tx_flow_control_xon = 0; + vp->estats.tx_flow_control_xoff = 0; + vp->estats.tx_queue_max = 0; + vp->estats.tx_queue_running_average = 0; + spin_unlock(&vp->tx_queue->head_lock); +} + +static int get_mtu(struct arglist *def) +{ + char *mtu = uml_vector_fetch_arg(def, "mtu"); + long result; + + if (mtu != NULL) { + if (kstrtoul(mtu, 10, &result) == 0) + if ((result < (1 << 16) - 1) && (result >= 576)) + return result; + } + return ETH_MAX_PACKET; +} + +static char *get_bpf_file(struct arglist *def) +{ + return uml_vector_fetch_arg(def, "bpffile"); +} + +static bool get_bpf_flash(struct arglist *def) +{ + char *allow = uml_vector_fetch_arg(def, "bpfflash"); + long result; + + if (allow != NULL) { + if (kstrtoul(allow, 10, &result) == 0) + return result > 0; + } + return false; +} + +static int get_depth(struct arglist *def) +{ + char *mtu = uml_vector_fetch_arg(def, "depth"); + long result; + + if (mtu != NULL) { + if (kstrtoul(mtu, 10, &result) == 0) + return result; + } + return DEFAULT_VECTOR_SIZE; +} + +static int get_headroom(struct arglist *def) +{ + char *mtu = uml_vector_fetch_arg(def, "headroom"); + long result; + + if (mtu != NULL) { + if (kstrtoul(mtu, 10, &result) == 0) + return result; + } + return DEFAULT_HEADROOM; +} + +static int get_req_size(struct arglist *def) +{ + char *gro = uml_vector_fetch_arg(def, "gro"); + long result; + + if (gro != NULL) { + if (kstrtoul(gro, 10, &result) == 0) { + if (result > 0) + return 65536; + } + } + return get_mtu(def) + ETH_HEADER_OTHER + + get_headroom(def) + SAFETY_MARGIN; +} + + +static int get_transport_options(struct arglist *def) +{ + char *transport = uml_vector_fetch_arg(def, "transport"); + char *vector = uml_vector_fetch_arg(def, "vec"); + + int vec_rx = VECTOR_RX; + int vec_tx = VECTOR_TX; + long parsed; + int result = 0; + + if (transport == NULL) + return -EINVAL; + + if (vector != NULL) { + if (kstrtoul(vector, 10, &parsed) == 0) { + if (parsed == 0) { + vec_rx = 0; + vec_tx = 0; + } + } + } + + if (get_bpf_flash(def)) + result = VECTOR_BPF_FLASH; + + if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0) + return result; + if (strncmp(transport, TRANS_HYBRID, TRANS_HYBRID_LEN) == 0) + return (result | vec_rx | VECTOR_BPF); + if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0) + return (result | vec_rx | vec_tx | VECTOR_QDISC_BYPASS); + return (result | vec_rx | vec_tx); +} + + +/* A mini-buffer for packet drop read + * All of our supported transports are datagram oriented and we always + * read using recvmsg or recvmmsg. If we pass a buffer which is smaller + * than the packet size it still counts as full packet read and will + * clean the incoming stream to keep sigio/epoll happy + */ + +#define DROP_BUFFER_SIZE 32 + +static char *drop_buffer; + + +/* + * Advance the mmsg queue head by n = advance. Resets the queue to + * maximum enqueue/dequeue-at-once capacity if possible. Called by + * dequeuers. Caller must hold the head_lock! + */ + +static int vector_advancehead(struct vector_queue *qi, int advance) +{ + qi->head = + (qi->head + advance) + % qi->max_depth; + + + atomic_sub(advance, &qi->queue_depth); + return atomic_read(&qi->queue_depth); +} + +/* Advance the queue tail by n = advance. + * This is called by enqueuers which should hold the + * head lock already + */ + +static int vector_advancetail(struct vector_queue *qi, int advance) +{ + qi->tail = + (qi->tail + advance) + % qi->max_depth; + atomic_add(advance, &qi->queue_depth); + return atomic_read(&qi->queue_depth); +} + +static int prep_msg(struct vector_private *vp, + struct sk_buff *skb, + struct iovec *iov) +{ + int iov_index = 0; + int nr_frags, frag; + skb_frag_t *skb_frag; + + nr_frags = skb_shinfo(skb)->nr_frags; + if (nr_frags > MAX_IOV_SIZE) { + if (skb_linearize(skb) != 0) + goto drop; + } + if (vp->header_size > 0) { + iov[iov_index].iov_len = vp->header_size; + vp->form_header(iov[iov_index].iov_base, skb, vp); + iov_index++; + } + iov[iov_index].iov_base = skb->data; + if (nr_frags > 0) { + iov[iov_index].iov_len = skb->len - skb->data_len; + vp->estats.sg_ok++; + } else + iov[iov_index].iov_len = skb->len; + iov_index++; + for (frag = 0; frag < nr_frags; frag++) { + skb_frag = &skb_shinfo(skb)->frags[frag]; + iov[iov_index].iov_base = skb_frag_address_safe(skb_frag); + iov[iov_index].iov_len = skb_frag_size(skb_frag); + iov_index++; + } + return iov_index; +drop: + return -1; +} +/* + * Generic vector enqueue with support for forming headers using transport + * specific callback. Allows GRE, L2TPv3, RAW and other transports + * to use a common enqueue procedure in vector mode + */ + +static int vector_enqueue(struct vector_queue *qi, struct sk_buff *skb) +{ + struct vector_private *vp = netdev_priv(qi->dev); + int queue_depth; + int packet_len; + struct mmsghdr *mmsg_vector = qi->mmsg_vector; + int iov_count; + + spin_lock(&qi->tail_lock); + queue_depth = atomic_read(&qi->queue_depth); + + if (skb) + packet_len = skb->len; + + if (queue_depth < qi->max_depth) { + + *(qi->skbuff_vector + qi->tail) = skb; + mmsg_vector += qi->tail; + iov_count = prep_msg( + vp, + skb, + mmsg_vector->msg_hdr.msg_iov + ); + if (iov_count < 1) + goto drop; + mmsg_vector->msg_hdr.msg_iovlen = iov_count; + mmsg_vector->msg_hdr.msg_name = vp->fds->remote_addr; + mmsg_vector->msg_hdr.msg_namelen = vp->fds->remote_addr_size; + wmb(); /* Make the packet visible to the NAPI poll thread */ + queue_depth = vector_advancetail(qi, 1); + } else + goto drop; + spin_unlock(&qi->tail_lock); + return queue_depth; +drop: + qi->dev->stats.tx_dropped++; + if (skb != NULL) { + packet_len = skb->len; + dev_consume_skb_any(skb); + netdev_completed_queue(qi->dev, 1, packet_len); + } + spin_unlock(&qi->tail_lock); + return queue_depth; +} + +static int consume_vector_skbs(struct vector_queue *qi, int count) +{ + struct sk_buff *skb; + int skb_index; + int bytes_compl = 0; + + for (skb_index = qi->head; skb_index < qi->head + count; skb_index++) { + skb = *(qi->skbuff_vector + skb_index); + /* mark as empty to ensure correct destruction if + * needed + */ + bytes_compl += skb->len; + *(qi->skbuff_vector + skb_index) = NULL; + dev_consume_skb_any(skb); + } + qi->dev->stats.tx_bytes += bytes_compl; + qi->dev->stats.tx_packets += count; + netdev_completed_queue(qi->dev, count, bytes_compl); + return vector_advancehead(qi, count); +} + +/* + * Generic vector dequeue via sendmmsg with support for forming headers + * using transport specific callback. Allows GRE, L2TPv3, RAW and + * other transports to use a common dequeue procedure in vector mode + */ + + +static int vector_send(struct vector_queue *qi) +{ + struct vector_private *vp = netdev_priv(qi->dev); + struct mmsghdr *send_from; + int result = 0, send_len; + + if (spin_trylock(&qi->head_lock)) { + /* update queue_depth to current value */ + while (atomic_read(&qi->queue_depth) > 0) { + /* Calculate the start of the vector */ + send_len = atomic_read(&qi->queue_depth); + send_from = qi->mmsg_vector; + send_from += qi->head; + /* Adjust vector size if wraparound */ + if (send_len + qi->head > qi->max_depth) + send_len = qi->max_depth - qi->head; + /* Try to TX as many packets as possible */ + if (send_len > 0) { + result = uml_vector_sendmmsg( + vp->fds->tx_fd, + send_from, + send_len, + 0 + ); + vp->in_write_poll = + (result != send_len); + } + /* For some of the sendmmsg error scenarios + * we may end being unsure in the TX success + * for all packets. It is safer to declare + * them all TX-ed and blame the network. + */ + if (result < 0) { + if (net_ratelimit()) + netdev_err(vp->dev, "sendmmsg err=%i\n", + result); + vp->in_error = true; + result = send_len; + } + if (result > 0) { + consume_vector_skbs(qi, result); + /* This is equivalent to an TX IRQ. + * Restart the upper layers to feed us + * more packets. + */ + if (result > vp->estats.tx_queue_max) + vp->estats.tx_queue_max = result; + vp->estats.tx_queue_running_average = + (vp->estats.tx_queue_running_average + result) >> 1; + } + netif_wake_queue(qi->dev); + /* if TX is busy, break out of the send loop, + * poll write IRQ will reschedule xmit for us. + */ + if (result != send_len) { + vp->estats.tx_restart_queue++; + break; + } + } + spin_unlock(&qi->head_lock); + } + return atomic_read(&qi->queue_depth); +} + +/* Queue destructor. Deliberately stateless so we can use + * it in queue cleanup if initialization fails. + */ + +static void destroy_queue(struct vector_queue *qi) +{ + int i; + struct iovec *iov; + struct vector_private *vp = netdev_priv(qi->dev); + struct mmsghdr *mmsg_vector; + + if (qi == NULL) + return; + /* deallocate any skbuffs - we rely on any unused to be + * set to NULL. + */ + if (qi->skbuff_vector != NULL) { + for (i = 0; i < qi->max_depth; i++) { + if (*(qi->skbuff_vector + i) != NULL) + dev_kfree_skb_any(*(qi->skbuff_vector + i)); + } + kfree(qi->skbuff_vector); + } + /* deallocate matching IOV structures including header buffs */ + if (qi->mmsg_vector != NULL) { + mmsg_vector = qi->mmsg_vector; + for (i = 0; i < qi->max_depth; i++) { + iov = mmsg_vector->msg_hdr.msg_iov; + if (iov != NULL) { + if ((vp->header_size > 0) && + (iov->iov_base != NULL)) + kfree(iov->iov_base); + kfree(iov); + } + mmsg_vector++; + } + kfree(qi->mmsg_vector); + } + kfree(qi); +} + +/* + * Queue constructor. Create a queue with a given side. + */ +static struct vector_queue *create_queue( + struct vector_private *vp, + int max_size, + int header_size, + int num_extra_frags) +{ + struct vector_queue *result; + int i; + struct iovec *iov; + struct mmsghdr *mmsg_vector; + + result = kmalloc(sizeof(struct vector_queue), GFP_KERNEL); + if (result == NULL) + return NULL; + result->max_depth = max_size; + result->dev = vp->dev; + result->mmsg_vector = kmalloc( + (sizeof(struct mmsghdr) * max_size), GFP_KERNEL); + if (result->mmsg_vector == NULL) + goto out_mmsg_fail; + result->skbuff_vector = kmalloc( + (sizeof(void *) * max_size), GFP_KERNEL); + if (result->skbuff_vector == NULL) + goto out_skb_fail; + + /* further failures can be handled safely by destroy_queue*/ + + mmsg_vector = result->mmsg_vector; + for (i = 0; i < max_size; i++) { + /* Clear all pointers - we use non-NULL as marking on + * what to free on destruction + */ + *(result->skbuff_vector + i) = NULL; + mmsg_vector->msg_hdr.msg_iov = NULL; + mmsg_vector++; + } + mmsg_vector = result->mmsg_vector; + result->max_iov_frags = num_extra_frags; + for (i = 0; i < max_size; i++) { + if (vp->header_size > 0) + iov = kmalloc_array(3 + num_extra_frags, + sizeof(struct iovec), + GFP_KERNEL + ); + else + iov = kmalloc_array(2 + num_extra_frags, + sizeof(struct iovec), + GFP_KERNEL + ); + if (iov == NULL) + goto out_fail; + mmsg_vector->msg_hdr.msg_iov = iov; + mmsg_vector->msg_hdr.msg_iovlen = 1; + mmsg_vector->msg_hdr.msg_control = NULL; + mmsg_vector->msg_hdr.msg_controllen = 0; + mmsg_vector->msg_hdr.msg_flags = MSG_DONTWAIT; + mmsg_vector->msg_hdr.msg_name = NULL; + mmsg_vector->msg_hdr.msg_namelen = 0; + if (vp->header_size > 0) { + iov->iov_base = kmalloc(header_size, GFP_KERNEL); + if (iov->iov_base == NULL) + goto out_fail; + iov->iov_len = header_size; + mmsg_vector->msg_hdr.msg_iovlen = 2; + iov++; + } + iov->iov_base = NULL; + iov->iov_len = 0; + mmsg_vector++; + } + spin_lock_init(&result->head_lock); + spin_lock_init(&result->tail_lock); + atomic_set(&result->queue_depth, 0); + result->head = 0; + result->tail = 0; + return result; +out_skb_fail: + kfree(result->mmsg_vector); +out_mmsg_fail: + kfree(result); + return NULL; +out_fail: + destroy_queue(result); + return NULL; +} + +/* + * We do not use the RX queue as a proper wraparound queue for now + * This is not necessary because the consumption via napi_gro_receive() + * happens in-line. While we can try using the return code of + * netif_rx() for flow control there are no drivers doing this today. + * For this RX specific use we ignore the tail/head locks and + * just read into a prepared queue filled with skbuffs. + */ + +static struct sk_buff *prep_skb( + struct vector_private *vp, + struct user_msghdr *msg) +{ + int linear = vp->max_packet + vp->headroom + SAFETY_MARGIN; + struct sk_buff *result; + int iov_index = 0, len; + struct iovec *iov = msg->msg_iov; + int err, nr_frags, frag; + skb_frag_t *skb_frag; + + if (vp->req_size <= linear) + len = linear; + else + len = vp->req_size; + result = alloc_skb_with_frags( + linear, + len - vp->max_packet, + 3, + &err, + GFP_ATOMIC + ); + if (vp->header_size > 0) + iov_index++; + if (result == NULL) { + iov[iov_index].iov_base = NULL; + iov[iov_index].iov_len = 0; + goto done; + } + skb_reserve(result, vp->headroom); + result->dev = vp->dev; + skb_put(result, vp->max_packet); + result->data_len = len - vp->max_packet; + result->len += len - vp->max_packet; + skb_reset_mac_header(result); + result->ip_summed = CHECKSUM_NONE; + iov[iov_index].iov_base = result->data; + iov[iov_index].iov_len = vp->max_packet; + iov_index++; + + nr_frags = skb_shinfo(result)->nr_frags; + for (frag = 0; frag < nr_frags; frag++) { + skb_frag = &skb_shinfo(result)->frags[frag]; + iov[iov_index].iov_base = skb_frag_address_safe(skb_frag); + if (iov[iov_index].iov_base != NULL) + iov[iov_index].iov_len = skb_frag_size(skb_frag); + else + iov[iov_index].iov_len = 0; + iov_index++; + } +done: + msg->msg_iovlen = iov_index; + return result; +} + + +/* Prepare queue for recvmmsg one-shot rx - fill with fresh sk_buffs */ + +static void prep_queue_for_rx(struct vector_queue *qi) +{ + struct vector_private *vp = netdev_priv(qi->dev); + struct mmsghdr *mmsg_vector = qi->mmsg_vector; + void **skbuff_vector = qi->skbuff_vector; + int i, queue_depth; + + queue_depth = atomic_read(&qi->queue_depth); + + if (queue_depth == 0) + return; + + /* RX is always emptied 100% during each cycle, so we do not + * have to do the tail wraparound math for it. + */ + + qi->head = qi->tail = 0; + + for (i = 0; i < queue_depth; i++) { + /* it is OK if allocation fails - recvmmsg with NULL data in + * iov argument still performs an RX, just drops the packet + * This allows us stop faffing around with a "drop buffer" + */ + + *skbuff_vector = prep_skb(vp, &mmsg_vector->msg_hdr); + skbuff_vector++; + mmsg_vector++; + } + atomic_set(&qi->queue_depth, 0); +} + +static struct vector_device *find_device(int n) +{ + struct vector_device *device; + struct list_head *ele; + + spin_lock(&vector_devices_lock); + list_for_each(ele, &vector_devices) { + device = list_entry(ele, struct vector_device, list); + if (device->unit == n) + goto out; + } + device = NULL; + out: + spin_unlock(&vector_devices_lock); + return device; +} + +static int vector_parse(char *str, int *index_out, char **str_out, + char **error_out) +{ + int n, err; + char *start = str; + + while ((*str != ':') && (strlen(str) > 1)) + str++; + if (*str != ':') { + *error_out = "Expected ':' after device number"; + return -EINVAL; + } + *str = '\0'; + + err = kstrtouint(start, 0, &n); + if (err < 0) { + *error_out = "Bad device number"; + return err; + } + + str++; + if (find_device(n)) { + *error_out = "Device already configured"; + return -EINVAL; + } + + *index_out = n; + *str_out = str; + return 0; +} + +static int vector_config(char *str, char **error_out) +{ + int err, n; + char *params; + struct arglist *parsed; + + err = vector_parse(str, &n, ¶ms, error_out); + if (err != 0) + return err; + + /* This string is broken up and the pieces used by the underlying + * driver. We should copy it to make sure things do not go wrong + * later. + */ + + params = kstrdup(params, GFP_KERNEL); + if (params == NULL) { + *error_out = "vector_config failed to strdup string"; + return -ENOMEM; + } + + parsed = uml_parse_vector_ifspec(params); + + if (parsed == NULL) { + *error_out = "vector_config failed to parse parameters"; + kfree(params); + return -EINVAL; + } + + vector_eth_configure(n, parsed); + return 0; +} + +static int vector_id(char **str, int *start_out, int *end_out) +{ + char *end; + int n; + + n = simple_strtoul(*str, &end, 0); + if ((*end != '\0') || (end == *str)) + return -1; + + *start_out = n; + *end_out = n; + *str = end; + return n; +} + +static int vector_remove(int n, char **error_out) +{ + struct vector_device *vec_d; + struct net_device *dev; + struct vector_private *vp; + + vec_d = find_device(n); + if (vec_d == NULL) + return -ENODEV; + dev = vec_d->dev; + vp = netdev_priv(dev); + if (vp->fds != NULL) + return -EBUSY; + unregister_netdev(dev); + platform_device_unregister(&vec_d->pdev); + return 0; +} + +/* + * There is no shared per-transport initialization code, so + * we will just initialize each interface one by one and + * add them to a list + */ + +static struct platform_driver uml_net_driver = { + .driver = { + .name = DRIVER_NAME, + }, +}; + + +static void vector_device_release(struct device *dev) +{ + struct vector_device *device = + container_of(dev, struct vector_device, pdev.dev); + struct net_device *netdev = device->dev; + + list_del(&device->list); + kfree(device); + free_netdev(netdev); +} + +/* Bog standard recv using recvmsg - not used normally unless the user + * explicitly specifies not to use recvmmsg vector RX. + */ + +static int vector_legacy_rx(struct vector_private *vp) +{ + int pkt_len; + struct user_msghdr hdr; + struct iovec iov[2 + MAX_IOV_SIZE]; /* header + data use case only */ + int iovpos = 0; + struct sk_buff *skb; + int header_check; + + hdr.msg_name = NULL; + hdr.msg_namelen = 0; + hdr.msg_iov = (struct iovec *) &iov; + hdr.msg_control = NULL; + hdr.msg_controllen = 0; + hdr.msg_flags = 0; + + if (vp->header_size > 0) { + iov[0].iov_base = vp->header_rxbuffer; + iov[0].iov_len = vp->header_size; + } + + skb = prep_skb(vp, &hdr); + + if (skb == NULL) { + /* Read a packet into drop_buffer and don't do + * anything with it. + */ + iov[iovpos].iov_base = drop_buffer; + iov[iovpos].iov_len = DROP_BUFFER_SIZE; + hdr.msg_iovlen = 1; + vp->dev->stats.rx_dropped++; + } + + pkt_len = uml_vector_recvmsg(vp->fds->rx_fd, &hdr, 0); + if (pkt_len < 0) { + vp->in_error = true; + return pkt_len; + } + + if (skb != NULL) { + if (pkt_len > vp->header_size) { + if (vp->header_size > 0) { + header_check = vp->verify_header( + vp->header_rxbuffer, skb, vp); + if (header_check < 0) { + dev_kfree_skb_irq(skb); + vp->dev->stats.rx_dropped++; + vp->estats.rx_encaps_errors++; + return 0; + } + if (header_check > 0) { + vp->estats.rx_csum_offload_good++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + } + pskb_trim(skb, pkt_len - vp->rx_header_size); + skb->protocol = eth_type_trans(skb, skb->dev); + vp->dev->stats.rx_bytes += skb->len; + vp->dev->stats.rx_packets++; + napi_gro_receive(&vp->napi, skb); + } else { + dev_kfree_skb_irq(skb); + } + } + return pkt_len; +} + +/* + * Packet at a time TX which falls back to vector TX if the + * underlying transport is busy. + */ + + + +static int writev_tx(struct vector_private *vp, struct sk_buff *skb) +{ + struct iovec iov[3 + MAX_IOV_SIZE]; + int iov_count, pkt_len = 0; + + iov[0].iov_base = vp->header_txbuffer; + iov_count = prep_msg(vp, skb, (struct iovec *) &iov); + + if (iov_count < 1) + goto drop; + + pkt_len = uml_vector_writev( + vp->fds->tx_fd, + (struct iovec *) &iov, + iov_count + ); + + if (pkt_len < 0) + goto drop; + + netif_trans_update(vp->dev); + netif_wake_queue(vp->dev); + + if (pkt_len > 0) { + vp->dev->stats.tx_bytes += skb->len; + vp->dev->stats.tx_packets++; + } else { + vp->dev->stats.tx_dropped++; + } + consume_skb(skb); + return pkt_len; +drop: + vp->dev->stats.tx_dropped++; + consume_skb(skb); + if (pkt_len < 0) + vp->in_error = true; + return pkt_len; +} + +/* + * Receive as many messages as we can in one call using the special + * mmsg vector matched to an skb vector which we prepared earlier. + */ + +static int vector_mmsg_rx(struct vector_private *vp, int budget) +{ + int packet_count, i; + struct vector_queue *qi = vp->rx_queue; + struct sk_buff *skb; + struct mmsghdr *mmsg_vector = qi->mmsg_vector; + void **skbuff_vector = qi->skbuff_vector; + int header_check; + + /* Refresh the vector and make sure it is with new skbs and the + * iovs are updated to point to them. + */ + + prep_queue_for_rx(qi); + + /* Fire the Lazy Gun - get as many packets as we can in one go. */ + + if (budget > qi->max_depth) + budget = qi->max_depth; + + packet_count = uml_vector_recvmmsg( + vp->fds->rx_fd, qi->mmsg_vector, budget, 0); + + if (packet_count < 0) + vp->in_error = true; + + if (packet_count <= 0) + return packet_count; + + /* We treat packet processing as enqueue, buffer refresh as dequeue + * The queue_depth tells us how many buffers have been used and how + * many do we need to prep the next time prep_queue_for_rx() is called. + */ + + atomic_add(packet_count, &qi->queue_depth); + + for (i = 0; i < packet_count; i++) { + skb = (*skbuff_vector); + if (mmsg_vector->msg_len > vp->header_size) { + if (vp->header_size > 0) { + header_check = vp->verify_header( + mmsg_vector->msg_hdr.msg_iov->iov_base, + skb, + vp + ); + if (header_check < 0) { + /* Overlay header failed to verify - discard. + * We can actually keep this skb and reuse it, + * but that will make the prep logic too + * complex. + */ + dev_kfree_skb_irq(skb); + vp->estats.rx_encaps_errors++; + continue; + } + if (header_check > 0) { + vp->estats.rx_csum_offload_good++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + } + pskb_trim(skb, + mmsg_vector->msg_len - vp->rx_header_size); + skb->protocol = eth_type_trans(skb, skb->dev); + /* + * We do not need to lock on updating stats here + * The interrupt loop is non-reentrant. + */ + vp->dev->stats.rx_bytes += skb->len; + vp->dev->stats.rx_packets++; + napi_gro_receive(&vp->napi, skb); + } else { + /* Overlay header too short to do anything - discard. + * We can actually keep this skb and reuse it, + * but that will make the prep logic too complex. + */ + if (skb != NULL) + dev_kfree_skb_irq(skb); + } + (*skbuff_vector) = NULL; + /* Move to the next buffer element */ + mmsg_vector++; + skbuff_vector++; + } + if (packet_count > 0) { + if (vp->estats.rx_queue_max < packet_count) + vp->estats.rx_queue_max = packet_count; + vp->estats.rx_queue_running_average = + (vp->estats.rx_queue_running_average + packet_count) >> 1; + } + return packet_count; +} + +static int vector_net_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct vector_private *vp = netdev_priv(dev); + int queue_depth = 0; + + if (vp->in_error) { + deactivate_fd(vp->fds->rx_fd, vp->rx_irq); + if ((vp->fds->rx_fd != vp->fds->tx_fd) && (vp->tx_irq != 0)) + deactivate_fd(vp->fds->tx_fd, vp->tx_irq); + return NETDEV_TX_BUSY; + } + + if ((vp->options & VECTOR_TX) == 0) { + writev_tx(vp, skb); + return NETDEV_TX_OK; + } + + /* We do BQL only in the vector path, no point doing it in + * packet at a time mode as there is no device queue + */ + + netdev_sent_queue(vp->dev, skb->len); + queue_depth = vector_enqueue(vp->tx_queue, skb); + + if (queue_depth < vp->tx_queue->max_depth && netdev_xmit_more()) { + mod_timer(&vp->tl, vp->coalesce); + return NETDEV_TX_OK; + } else { + queue_depth = vector_send(vp->tx_queue); + if (queue_depth > 0) + napi_schedule(&vp->napi); + } + + return NETDEV_TX_OK; +} + +static irqreturn_t vector_rx_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct vector_private *vp = netdev_priv(dev); + + if (!netif_running(dev)) + return IRQ_NONE; + napi_schedule(&vp->napi); + return IRQ_HANDLED; + +} + +static irqreturn_t vector_tx_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct vector_private *vp = netdev_priv(dev); + + if (!netif_running(dev)) + return IRQ_NONE; + /* We need to pay attention to it only if we got + * -EAGAIN or -ENOBUFFS from sendmmsg. Otherwise + * we ignore it. In the future, it may be worth + * it to improve the IRQ controller a bit to make + * tweaking the IRQ mask less costly + */ + + napi_schedule(&vp->napi); + return IRQ_HANDLED; + +} + +static int irq_rr; + +static int vector_net_close(struct net_device *dev) +{ + struct vector_private *vp = netdev_priv(dev); + + netif_stop_queue(dev); + timer_delete(&vp->tl); + + vp->opened = false; + + if (vp->fds == NULL) + return 0; + + /* Disable and free all IRQS */ + if (vp->rx_irq > 0) { + um_free_irq(vp->rx_irq, dev); + vp->rx_irq = 0; + } + if (vp->tx_irq > 0) { + um_free_irq(vp->tx_irq, dev); + vp->tx_irq = 0; + } + napi_disable(&vp->napi); + netif_napi_del(&vp->napi); + if (vp->fds->rx_fd > 0) { + if (vp->bpf) + uml_vector_detach_bpf(vp->fds->rx_fd, vp->bpf); + os_close_file(vp->fds->rx_fd); + vp->fds->rx_fd = -1; + } + if (vp->fds->tx_fd > 0) { + os_close_file(vp->fds->tx_fd); + vp->fds->tx_fd = -1; + } + if (vp->bpf != NULL) + kfree(vp->bpf->filter); + kfree(vp->bpf); + vp->bpf = NULL; + kfree(vp->fds->remote_addr); + kfree(vp->transport_data); + kfree(vp->header_rxbuffer); + kfree(vp->header_txbuffer); + if (vp->rx_queue != NULL) + destroy_queue(vp->rx_queue); + if (vp->tx_queue != NULL) + destroy_queue(vp->tx_queue); + kfree(vp->fds); + vp->fds = NULL; + vp->in_error = false; + return 0; +} + +static int vector_poll(struct napi_struct *napi, int budget) +{ + struct vector_private *vp = container_of(napi, struct vector_private, napi); + int work_done = 0; + int err; + bool tx_enqueued = false; + + if ((vp->options & VECTOR_TX) != 0) + tx_enqueued = (vector_send(vp->tx_queue) > 0); + spin_lock(&vp->rx_queue->head_lock); + if ((vp->options & VECTOR_RX) > 0) + err = vector_mmsg_rx(vp, budget); + else { + err = vector_legacy_rx(vp); + if (err > 0) + err = 1; + } + spin_unlock(&vp->rx_queue->head_lock); + if (err > 0) + work_done += err; + + if (tx_enqueued || err > 0) + napi_schedule(napi); + if (work_done <= budget) + napi_complete_done(napi, work_done); + return work_done; +} + +static void vector_reset_tx(struct work_struct *work) +{ + struct vector_private *vp = + container_of(work, struct vector_private, reset_tx); + netdev_reset_queue(vp->dev); + netif_start_queue(vp->dev); + netif_wake_queue(vp->dev); +} + +static int vector_net_open(struct net_device *dev) +{ + struct vector_private *vp = netdev_priv(dev); + int err = -EINVAL; + struct vector_device *vdevice; + + if (vp->opened) + return -ENXIO; + vp->opened = true; + + vp->bpf = uml_vector_user_bpf(get_bpf_file(vp->parsed)); + + vp->fds = uml_vector_user_open(vp->unit, vp->parsed); + + if (vp->fds == NULL) + goto out_close; + + if (build_transport_data(vp) < 0) + goto out_close; + + if ((vp->options & VECTOR_RX) > 0) { + vp->rx_queue = create_queue( + vp, + get_depth(vp->parsed), + vp->rx_header_size, + MAX_IOV_SIZE + ); + atomic_set(&vp->rx_queue->queue_depth, get_depth(vp->parsed)); + } else { + vp->header_rxbuffer = kmalloc( + vp->rx_header_size, + GFP_KERNEL + ); + if (vp->header_rxbuffer == NULL) + goto out_close; + } + if ((vp->options & VECTOR_TX) > 0) { + vp->tx_queue = create_queue( + vp, + get_depth(vp->parsed), + vp->header_size, + MAX_IOV_SIZE + ); + } else { + vp->header_txbuffer = kmalloc(vp->header_size, GFP_KERNEL); + if (vp->header_txbuffer == NULL) + goto out_close; + } + + netif_napi_add_weight(vp->dev, &vp->napi, vector_poll, + get_depth(vp->parsed)); + napi_enable(&vp->napi); + + /* READ IRQ */ + err = um_request_irq( + irq_rr + VECTOR_BASE_IRQ, vp->fds->rx_fd, + IRQ_READ, vector_rx_interrupt, + IRQF_SHARED, dev->name, dev); + if (err < 0) { + netdev_err(dev, "vector_open: failed to get rx irq(%d)\n", err); + err = -ENETUNREACH; + goto out_close; + } + vp->rx_irq = irq_rr + VECTOR_BASE_IRQ; + dev->irq = irq_rr + VECTOR_BASE_IRQ; + irq_rr = (irq_rr + 1) % VECTOR_IRQ_SPACE; + + /* WRITE IRQ - we need it only if we have vector TX */ + if ((vp->options & VECTOR_TX) > 0) { + err = um_request_irq( + irq_rr + VECTOR_BASE_IRQ, vp->fds->tx_fd, + IRQ_WRITE, vector_tx_interrupt, + IRQF_SHARED, dev->name, dev); + if (err < 0) { + netdev_err(dev, + "vector_open: failed to get tx irq(%d)\n", err); + err = -ENETUNREACH; + goto out_close; + } + vp->tx_irq = irq_rr + VECTOR_BASE_IRQ; + irq_rr = (irq_rr + 1) % VECTOR_IRQ_SPACE; + } + + if ((vp->options & VECTOR_QDISC_BYPASS) != 0) { + if (!uml_raw_enable_qdisc_bypass(vp->fds->rx_fd)) + vp->options |= VECTOR_BPF; + } + if (((vp->options & VECTOR_BPF) != 0) && (vp->bpf == NULL)) + vp->bpf = uml_vector_default_bpf(dev->dev_addr); + + if (vp->bpf != NULL) + uml_vector_attach_bpf(vp->fds->rx_fd, vp->bpf); + + netif_start_queue(dev); + vector_reset_stats(vp); + + /* clear buffer - it can happen that the host side of the interface + * is full when we get here. In this case, new data is never queued, + * SIGIOs never arrive, and the net never works. + */ + + napi_schedule(&vp->napi); + + vdevice = find_device(vp->unit); + vdevice->opened = 1; + + if ((vp->options & VECTOR_TX) != 0) + add_timer(&vp->tl); + return 0; +out_close: + vector_net_close(dev); + return err; +} + + +static void vector_net_set_multicast_list(struct net_device *dev) +{ + /* TODO: - we can do some BPF games here */ + return; +} + +static void vector_net_tx_timeout(struct net_device *dev, unsigned int txqueue) +{ + struct vector_private *vp = netdev_priv(dev); + + vp->estats.tx_timeout_count++; + netif_trans_update(dev); + schedule_work(&vp->reset_tx); +} + +static netdev_features_t vector_fix_features(struct net_device *dev, + netdev_features_t features) +{ + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + return features; +} + +static int vector_set_features(struct net_device *dev, + netdev_features_t features) +{ + struct vector_private *vp = netdev_priv(dev); + /* Adjust buffer sizes for GSO/GRO. Unfortunately, there is + * no way to negotiate it on raw sockets, so we can change + * only our side. + */ + if (features & NETIF_F_GRO) + /* All new frame buffers will be GRO-sized */ + vp->req_size = 65536; + else + /* All new frame buffers will be normal sized */ + vp->req_size = vp->max_packet + vp->headroom + SAFETY_MARGIN; + return 0; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void vector_net_poll_controller(struct net_device *dev) +{ + disable_irq(dev->irq); + vector_rx_interrupt(dev->irq, dev); + enable_irq(dev->irq); +} +#endif + +static void vector_net_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strscpy(info->driver, DRIVER_NAME); +} + +static int vector_net_load_bpf_flash(struct net_device *dev, + struct ethtool_flash *efl) +{ + struct vector_private *vp = netdev_priv(dev); + struct vector_device *vdevice; + const struct firmware *fw; + int result = 0; + + if (!(vp->options & VECTOR_BPF_FLASH)) { + netdev_err(dev, "loading firmware not permitted: %s\n", efl->data); + return -1; + } + + if (vp->bpf != NULL) { + if (vp->opened) + uml_vector_detach_bpf(vp->fds->rx_fd, vp->bpf); + kfree(vp->bpf->filter); + vp->bpf->filter = NULL; + } else { + vp->bpf = kmalloc(sizeof(struct sock_fprog), GFP_ATOMIC); + if (vp->bpf == NULL) { + netdev_err(dev, "failed to allocate memory for firmware\n"); + goto flash_fail; + } + } + + vdevice = find_device(vp->unit); + + if (request_firmware(&fw, efl->data, &vdevice->pdev.dev)) + goto flash_fail; + + vp->bpf->filter = kmemdup(fw->data, fw->size, GFP_ATOMIC); + if (!vp->bpf->filter) + goto free_buffer; + + vp->bpf->len = fw->size / sizeof(struct sock_filter); + release_firmware(fw); + + if (vp->opened) + result = uml_vector_attach_bpf(vp->fds->rx_fd, vp->bpf); + + return result; + +free_buffer: + release_firmware(fw); + +flash_fail: + if (vp->bpf != NULL) + kfree(vp->bpf->filter); + kfree(vp->bpf); + vp->bpf = NULL; + return -1; +} + +static void vector_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +{ + struct vector_private *vp = netdev_priv(netdev); + + ring->rx_max_pending = vp->rx_queue->max_depth; + ring->tx_max_pending = vp->tx_queue->max_depth; + ring->rx_pending = vp->rx_queue->max_depth; + ring->tx_pending = vp->tx_queue->max_depth; +} + +static void vector_get_strings(struct net_device *dev, u32 stringset, u8 *buf) +{ + switch (stringset) { + case ETH_SS_TEST: + *buf = '\0'; + break; + case ETH_SS_STATS: + memcpy(buf, ðtool_stats_keys, sizeof(ethtool_stats_keys)); + break; + default: + WARN_ON(1); + break; + } +} + +static int vector_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_TEST: + return 0; + case ETH_SS_STATS: + return VECTOR_NUM_STATS; + default: + return -EOPNOTSUPP; + } +} + +static void vector_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *estats, + u64 *tmp_stats) +{ + struct vector_private *vp = netdev_priv(dev); + + /* Stats are modified in the dequeue portions of + * rx/tx which are protected by the head locks + * grabbing these locks here ensures they are up + * to date. + */ + + spin_lock(&vp->tx_queue->head_lock); + spin_lock(&vp->rx_queue->head_lock); + memcpy(tmp_stats, &vp->estats, sizeof(struct vector_estats)); + spin_unlock(&vp->rx_queue->head_lock); + spin_unlock(&vp->tx_queue->head_lock); +} + +static int vector_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct vector_private *vp = netdev_priv(netdev); + + ec->tx_coalesce_usecs = (vp->coalesce * 1000000) / HZ; + return 0; +} + +static int vector_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct vector_private *vp = netdev_priv(netdev); + + vp->coalesce = (ec->tx_coalesce_usecs * HZ) / 1000000; + if (vp->coalesce == 0) + vp->coalesce = 1; + return 0; +} + +static const struct ethtool_ops vector_net_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS, + .get_drvinfo = vector_net_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, + .get_ringparam = vector_get_ringparam, + .get_strings = vector_get_strings, + .get_sset_count = vector_get_sset_count, + .get_ethtool_stats = vector_get_ethtool_stats, + .get_coalesce = vector_get_coalesce, + .set_coalesce = vector_set_coalesce, + .flash_device = vector_net_load_bpf_flash, +}; + + +static const struct net_device_ops vector_netdev_ops = { + .ndo_open = vector_net_open, + .ndo_stop = vector_net_close, + .ndo_start_xmit = vector_net_start_xmit, + .ndo_set_rx_mode = vector_net_set_multicast_list, + .ndo_tx_timeout = vector_net_tx_timeout, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_fix_features = vector_fix_features, + .ndo_set_features = vector_set_features, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = vector_net_poll_controller, +#endif +}; + +static void vector_timer_expire(struct timer_list *t) +{ + struct vector_private *vp = timer_container_of(vp, t, tl); + + vp->estats.tx_kicks++; + napi_schedule(&vp->napi); +} + +static void vector_setup_etheraddr(struct net_device *dev, char *str) +{ + u8 addr[ETH_ALEN]; + + if (str == NULL) + goto random; + + if (!mac_pton(str, addr)) { + netdev_err(dev, + "Failed to parse '%s' as an ethernet address\n", str); + goto random; + } + if (is_multicast_ether_addr(addr)) { + netdev_err(dev, + "Attempt to assign a multicast ethernet address to a device disallowed\n"); + goto random; + } + if (!is_valid_ether_addr(addr)) { + netdev_err(dev, + "Attempt to assign an invalid ethernet address to a device disallowed\n"); + goto random; + } + if (!is_local_ether_addr(addr)) { + netdev_warn(dev, "Warning: Assigning a globally valid ethernet address to a device\n"); + netdev_warn(dev, "You should set the 2nd rightmost bit in the first byte of the MAC,\n"); + netdev_warn(dev, "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", + addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], addr[5]); + } + eth_hw_addr_set(dev, addr); + return; + +random: + netdev_info(dev, "Choosing a random ethernet address\n"); + eth_hw_addr_random(dev); +} + +static void vector_eth_configure( + int n, + struct arglist *def + ) +{ + struct vector_device *device; + struct net_device *dev; + struct vector_private *vp; + int err; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (device == NULL) { + pr_err("Failed to allocate struct vector_device for vec%d\n", n); + return; + } + dev = alloc_etherdev(sizeof(struct vector_private)); + if (dev == NULL) { + pr_err("Failed to allocate struct net_device for vec%d\n", n); + goto out_free_device; + } + + dev->mtu = get_mtu(def); + + INIT_LIST_HEAD(&device->list); + device->unit = n; + + /* If this name ends up conflicting with an existing registered + * netdevice, that is OK, register_netdev{,ice}() will notice this + * and fail. + */ + snprintf(dev->name, sizeof(dev->name), "vec%d", n); + vector_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac")); + vp = netdev_priv(dev); + + /* sysfs register */ + if (!driver_registered) { + platform_driver_register(¨_net_driver); + driver_registered = 1; + } + device->pdev.id = n; + device->pdev.name = DRIVER_NAME; + device->pdev.dev.release = vector_device_release; + dev_set_drvdata(&device->pdev.dev, device); + if (platform_device_register(&device->pdev)) + goto out_free_netdev; + SET_NETDEV_DEV(dev, &device->pdev.dev); + + device->dev = dev; + + INIT_LIST_HEAD(&vp->list); + vp->dev = dev; + vp->unit = n; + vp->options = get_transport_options(def); + vp->parsed = def; + vp->max_packet = get_mtu(def) + ETH_HEADER_OTHER; + /* + * TODO - we need to calculate headroom so that ip header + * is 16 byte aligned all the time + */ + vp->headroom = get_headroom(def); + vp->coalesce = 2; + vp->req_size = get_req_size(def); + + dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST); + INIT_WORK(&vp->reset_tx, vector_reset_tx); + + timer_setup(&vp->tl, vector_timer_expire, 0); + + /* FIXME */ + dev->netdev_ops = &vector_netdev_ops; + dev->ethtool_ops = &vector_net_ethtool_ops; + dev->watchdog_timeo = (HZ >> 1); + /* primary IRQ - fixme */ + dev->irq = 0; /* we will adjust this once opened */ + + rtnl_lock(); + err = register_netdevice(dev); + rtnl_unlock(); + if (err) + goto out_undo_user_init; + + spin_lock(&vector_devices_lock); + list_add(&device->list, &vector_devices); + spin_unlock(&vector_devices_lock); + + return; + +out_undo_user_init: + return; +out_free_netdev: + free_netdev(dev); +out_free_device: + kfree(device); +} + + + + +/* + * Invoked late in the init + */ + +static int __init vector_init(void) +{ + struct list_head *ele; + struct vector_cmd_line_arg *def; + struct arglist *parsed; + + list_for_each(ele, &vec_cmd_line) { + def = list_entry(ele, struct vector_cmd_line_arg, list); + parsed = uml_parse_vector_ifspec(def->arguments); + if (parsed != NULL) + vector_eth_configure(def->unit, parsed); + } + return 0; +} + + +/* Invoked at initial argument parsing, only stores + * arguments until a proper vector_init is called + * later + */ + +static int __init vector_setup(char *str) +{ + char *error; + int n, err; + struct vector_cmd_line_arg *new; + + err = vector_parse(str, &n, &str, &error); + if (err) { + pr_err("Couldn't parse '%s': %s\n", str, error); + return 1; + } + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); + INIT_LIST_HEAD(&new->list); + new->unit = n; + new->arguments = str; + list_add_tail(&new->list, &vec_cmd_line); + return 1; +} + +__setup("vec", vector_setup); +__uml_help(vector_setup, +"vec[0-9]+:<option>=<value>,<option>=<value>\n" +" Configure a vector io network device.\n\n" +); + +late_initcall(vector_init); + +static struct mc_device vector_mc = { + .list = LIST_HEAD_INIT(vector_mc.list), + .name = "vec", + .config = vector_config, + .get_config = NULL, + .id = vector_id, + .remove = vector_remove, +}; + +#ifdef CONFIG_INET +static int vector_inetaddr_event( + struct notifier_block *this, + unsigned long event, + void *ptr) +{ + return NOTIFY_DONE; +} + +static struct notifier_block vector_inetaddr_notifier = { + .notifier_call = vector_inetaddr_event, +}; + +static void inet_register(void) +{ + register_inetaddr_notifier(&vector_inetaddr_notifier); +} +#else +static inline void inet_register(void) +{ +} +#endif + +static int vector_net_init(void) +{ + mconsole_register_dev(&vector_mc); + inet_register(); + return 0; +} + +__initcall(vector_net_init); + + + diff --git a/arch/um/drivers/vector_kern.h b/arch/um/drivers/vector_kern.h new file mode 100644 index 000000000000..417834793658 --- /dev/null +++ b/arch/um/drivers/vector_kern.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#ifndef __UM_VECTOR_KERN_H +#define __UM_VECTOR_KERN_H + +#include <linux/netdevice.h> +#include <linux/platform_device.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <linux/workqueue.h> +#include <linux/interrupt.h> +#include <asm/atomic.h> + +#include "vector_user.h" + +/* Queue structure specially adapted for multiple enqueue/dequeue + * in a mmsgrecv/mmsgsend context + */ + +/* Dequeue method */ + +#define QUEUE_SENDMSG 0 +#define QUEUE_SENDMMSG 1 + +#define VECTOR_RX 1 +#define VECTOR_TX (1 << 1) +#define VECTOR_BPF (1 << 2) +#define VECTOR_QDISC_BYPASS (1 << 3) +#define VECTOR_BPF_FLASH (1 << 4) + +#define ETH_MAX_PACKET 1500 +#define ETH_HEADER_OTHER 32 /* just in case someone decides to go mad on QnQ */ + +#define MAX_FILTER_PROG (2 << 16) + +struct vector_queue { + struct mmsghdr *mmsg_vector; + void **skbuff_vector; + /* backlink to device which owns us */ + struct net_device *dev; + spinlock_t head_lock; + spinlock_t tail_lock; + atomic_t queue_depth; + int head, tail, max_depth, max_iov_frags; + short options; +}; + +struct vector_estats { + uint64_t rx_queue_max; + uint64_t rx_queue_running_average; + uint64_t tx_queue_max; + uint64_t tx_queue_running_average; + uint64_t rx_encaps_errors; + uint64_t tx_timeout_count; + uint64_t tx_restart_queue; + uint64_t tx_kicks; + uint64_t tx_flow_control_xon; + uint64_t tx_flow_control_xoff; + uint64_t rx_csum_offload_good; + uint64_t rx_csum_offload_errors; + uint64_t sg_ok; + uint64_t sg_linearized; +}; + +#define VERIFY_HEADER_NOK -1 +#define VERIFY_HEADER_OK 0 +#define VERIFY_CSUM_OK 1 + +struct vector_private { + struct list_head list; + struct net_device *dev; + struct napi_struct napi ____cacheline_aligned; + + int unit; + + /* Timeout timer in TX */ + + struct timer_list tl; + + /* Scheduled "remove device" work */ + struct work_struct reset_tx; + struct vector_fds *fds; + + struct vector_queue *rx_queue; + struct vector_queue *tx_queue; + + int rx_irq; + int tx_irq; + + struct arglist *parsed; + + void *transport_data; /* transport specific params if needed */ + + int max_packet; + int req_size; /* different from max packet - used for TSO */ + int headroom; + + int options; + + /* remote address if any - some transports will leave this as null */ + + int header_size; + int rx_header_size; + int coalesce; + + void *header_rxbuffer; + void *header_txbuffer; + + int (*form_header)(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp); + int (*verify_header)(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp); + + spinlock_t stats_lock; + + bool rexmit_scheduled; + bool opened; + bool in_write_poll; + bool in_error; + + /* guest allowed to use ethtool flash to load bpf */ + bool bpf_via_flash; + + /* ethtool stats */ + + struct vector_estats estats; + struct sock_fprog *bpf; + + char user[]; +}; + +extern int build_transport_data(struct vector_private *vp); + +#endif diff --git a/arch/um/drivers/vector_transports.c b/arch/um/drivers/vector_transports.c new file mode 100644 index 000000000000..0794d23f07cb --- /dev/null +++ b/arch/um/drivers/vector_transports.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 - Cambridge Greys Limited + * Copyright (C) 2011 - 2014 Cisco Systems Inc + */ + +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/virtio_net.h> +#include <linux/virtio_net.h> +#include <linux/virtio_byteorder.h> +#include <linux/netdev_features.h> +#include "vector_user.h" +#include "vector_kern.h" + +#define GOOD_LINEAR 512 +#define GSO_ERROR "Incoming GSO frames and GRO disabled on the interface" + +struct gre_minimal_header { + uint16_t header; + uint16_t arptype; +}; + + +struct uml_gre_data { + uint32_t rx_key; + uint32_t tx_key; + uint32_t sequence; + + bool ipv6; + bool has_sequence; + bool pin_sequence; + bool checksum; + bool key; + struct gre_minimal_header expected_header; + + uint32_t checksum_offset; + uint32_t key_offset; + uint32_t sequence_offset; + +}; + +struct uml_l2tpv3_data { + uint64_t rx_cookie; + uint64_t tx_cookie; + uint64_t rx_session; + uint64_t tx_session; + uint32_t counter; + + bool udp; + bool ipv6; + bool has_counter; + bool pin_counter; + bool cookie; + bool cookie_is_64; + + uint32_t cookie_offset; + uint32_t session_offset; + uint32_t counter_offset; +}; + +static int l2tpv3_form_header(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp) +{ + struct uml_l2tpv3_data *td = vp->transport_data; + uint32_t *counter; + + if (td->udp) + *(uint32_t *) header = cpu_to_be32(L2TPV3_DATA_PACKET); + (*(uint32_t *) (header + td->session_offset)) = td->tx_session; + + if (td->cookie) { + if (td->cookie_is_64) + (*(uint64_t *)(header + td->cookie_offset)) = + td->tx_cookie; + else + (*(uint32_t *)(header + td->cookie_offset)) = + td->tx_cookie; + } + if (td->has_counter) { + counter = (uint32_t *)(header + td->counter_offset); + if (td->pin_counter) { + *counter = 0; + } else { + td->counter++; + *counter = cpu_to_be32(td->counter); + } + } + return 0; +} + +static int gre_form_header(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp) +{ + struct uml_gre_data *td = vp->transport_data; + uint32_t *sequence; + *((uint32_t *) header) = *((uint32_t *) &td->expected_header); + if (td->key) + (*(uint32_t *) (header + td->key_offset)) = td->tx_key; + if (td->has_sequence) { + sequence = (uint32_t *)(header + td->sequence_offset); + if (td->pin_sequence) + *sequence = 0; + else + *sequence = cpu_to_be32(++td->sequence); + } + return 0; +} + +static int raw_form_header(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp) +{ + struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header; + + virtio_net_hdr_from_skb( + skb, + vheader, + virtio_legacy_is_little_endian(), + false, + 0 + ); + + return 0; +} + +static int l2tpv3_verify_header( + uint8_t *header, struct sk_buff *skb, struct vector_private *vp) +{ + struct uml_l2tpv3_data *td = vp->transport_data; + uint32_t *session; + uint64_t cookie; + + if ((!td->udp) && (!td->ipv6)) + header += sizeof(struct iphdr) /* fix for ipv4 raw */; + + /* we do not do a strict check for "data" packets as per + * the RFC spec because the pure IP spec does not have + * that anyway. + */ + + if (td->cookie) { + if (td->cookie_is_64) + cookie = *(uint64_t *)(header + td->cookie_offset); + else + cookie = *(uint32_t *)(header + td->cookie_offset); + if (cookie != td->rx_cookie) { + if (net_ratelimit()) + netdev_err(vp->dev, "uml_l2tpv3: unknown cookie id"); + return -1; + } + } + session = (uint32_t *) (header + td->session_offset); + if (*session != td->rx_session) { + if (net_ratelimit()) + netdev_err(vp->dev, "uml_l2tpv3: session mismatch"); + return -1; + } + return 0; +} + +static int gre_verify_header( + uint8_t *header, struct sk_buff *skb, struct vector_private *vp) +{ + + uint32_t key; + struct uml_gre_data *td = vp->transport_data; + + if (!td->ipv6) + header += sizeof(struct iphdr) /* fix for ipv4 raw */; + + if (*((uint32_t *) header) != *((uint32_t *) &td->expected_header)) { + if (net_ratelimit()) + netdev_err(vp->dev, "header type disagreement, expecting %0x, got %0x", + *((uint32_t *) &td->expected_header), + *((uint32_t *) header) + ); + return -1; + } + + if (td->key) { + key = (*(uint32_t *)(header + td->key_offset)); + if (key != td->rx_key) { + if (net_ratelimit()) + netdev_err(vp->dev, "unknown key id %0x, expecting %0x", + key, td->rx_key); + return -1; + } + } + return 0; +} + +static int raw_verify_header( + uint8_t *header, struct sk_buff *skb, struct vector_private *vp) +{ + struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header; + + if ((vheader->gso_type != VIRTIO_NET_HDR_GSO_NONE) && + (vp->req_size != 65536)) { + if (net_ratelimit()) + netdev_err( + vp->dev, + GSO_ERROR + ); + } + if ((vheader->flags & VIRTIO_NET_HDR_F_DATA_VALID) > 0) + return 1; + + virtio_net_hdr_to_skb(skb, vheader, virtio_legacy_is_little_endian()); + return 0; +} + +static bool get_uint_param( + struct arglist *def, char *param, unsigned int *result) +{ + char *arg = uml_vector_fetch_arg(def, param); + + if (arg != NULL) { + if (kstrtoint(arg, 0, result) == 0) + return true; + } + return false; +} + +static bool get_ulong_param( + struct arglist *def, char *param, unsigned long *result) +{ + char *arg = uml_vector_fetch_arg(def, param); + + if (arg != NULL) { + if (kstrtoul(arg, 0, result) == 0) + return true; + return true; + } + return false; +} + +static int build_gre_transport_data(struct vector_private *vp) +{ + struct uml_gre_data *td; + int temp_int; + int temp_rx; + int temp_tx; + + vp->transport_data = kmalloc(sizeof(struct uml_gre_data), GFP_KERNEL); + if (vp->transport_data == NULL) + return -ENOMEM; + td = vp->transport_data; + td->sequence = 0; + + td->expected_header.arptype = GRE_IRB; + td->expected_header.header = 0; + + vp->form_header = &gre_form_header; + vp->verify_header = &gre_verify_header; + vp->header_size = 4; + td->key_offset = 4; + td->sequence_offset = 4; + td->checksum_offset = 4; + + td->ipv6 = false; + if (get_uint_param(vp->parsed, "v6", &temp_int)) { + if (temp_int > 0) + td->ipv6 = true; + } + td->key = false; + if (get_uint_param(vp->parsed, "rx_key", &temp_rx)) { + if (get_uint_param(vp->parsed, "tx_key", &temp_tx)) { + td->key = true; + td->expected_header.header |= GRE_MODE_KEY; + td->rx_key = cpu_to_be32(temp_rx); + td->tx_key = cpu_to_be32(temp_tx); + vp->header_size += 4; + td->sequence_offset += 4; + } else { + return -EINVAL; + } + } + + td->sequence = false; + if (get_uint_param(vp->parsed, "sequence", &temp_int)) { + if (temp_int > 0) { + vp->header_size += 4; + td->has_sequence = true; + td->expected_header.header |= GRE_MODE_SEQUENCE; + if (get_uint_param( + vp->parsed, "pin_sequence", &temp_int)) { + if (temp_int > 0) + td->pin_sequence = true; + } + } + } + vp->rx_header_size = vp->header_size; + if (!td->ipv6) + vp->rx_header_size += sizeof(struct iphdr); + return 0; +} + +static int build_l2tpv3_transport_data(struct vector_private *vp) +{ + + struct uml_l2tpv3_data *td; + int temp_int, temp_rxs, temp_txs; + unsigned long temp_rx; + unsigned long temp_tx; + + vp->transport_data = kmalloc( + sizeof(struct uml_l2tpv3_data), GFP_KERNEL); + + if (vp->transport_data == NULL) + return -ENOMEM; + + td = vp->transport_data; + + vp->form_header = &l2tpv3_form_header; + vp->verify_header = &l2tpv3_verify_header; + td->counter = 0; + + vp->header_size = 4; + td->session_offset = 0; + td->cookie_offset = 4; + td->counter_offset = 4; + + + td->ipv6 = false; + if (get_uint_param(vp->parsed, "v6", &temp_int)) { + if (temp_int > 0) + td->ipv6 = true; + } + + if (get_uint_param(vp->parsed, "rx_session", &temp_rxs)) { + if (get_uint_param(vp->parsed, "tx_session", &temp_txs)) { + td->tx_session = cpu_to_be32(temp_txs); + td->rx_session = cpu_to_be32(temp_rxs); + } else { + return -EINVAL; + } + } else { + return -EINVAL; + } + + td->cookie_is_64 = false; + if (get_uint_param(vp->parsed, "cookie64", &temp_int)) { + if (temp_int > 0) + td->cookie_is_64 = true; + } + td->cookie = false; + if (get_ulong_param(vp->parsed, "rx_cookie", &temp_rx)) { + if (get_ulong_param(vp->parsed, "tx_cookie", &temp_tx)) { + td->cookie = true; + if (td->cookie_is_64) { + td->rx_cookie = cpu_to_be64(temp_rx); + td->tx_cookie = cpu_to_be64(temp_tx); + vp->header_size += 8; + td->counter_offset += 8; + } else { + td->rx_cookie = cpu_to_be32(temp_rx); + td->tx_cookie = cpu_to_be32(temp_tx); + vp->header_size += 4; + td->counter_offset += 4; + } + } else { + return -EINVAL; + } + } + + td->has_counter = false; + if (get_uint_param(vp->parsed, "counter", &temp_int)) { + if (temp_int > 0) { + td->has_counter = true; + vp->header_size += 4; + if (get_uint_param( + vp->parsed, "pin_counter", &temp_int)) { + if (temp_int > 0) + td->pin_counter = true; + } + } + } + + if (get_uint_param(vp->parsed, "udp", &temp_int)) { + if (temp_int > 0) { + td->udp = true; + vp->header_size += 4; + td->counter_offset += 4; + td->session_offset += 4; + td->cookie_offset += 4; + } + } + + vp->rx_header_size = vp->header_size; + if ((!td->ipv6) && (!td->udp)) + vp->rx_header_size += sizeof(struct iphdr); + + return 0; +} + +static int build_raw_transport_data(struct vector_private *vp) +{ + if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) { + if (!uml_raw_enable_vnet_headers(vp->fds->tx_fd)) + return -1; + vp->form_header = &raw_form_header; + vp->verify_header = &raw_verify_header; + vp->header_size = sizeof(struct virtio_net_hdr); + vp->rx_header_size = sizeof(struct virtio_net_hdr); + vp->dev->hw_features |= (NETIF_F_TSO | NETIF_F_GRO); + vp->dev->features |= + (NETIF_F_RXCSUM | NETIF_F_HW_CSUM | + NETIF_F_TSO | NETIF_F_GRO); + netdev_info( + vp->dev, + "raw: using vnet headers for tso and tx/rx checksum" + ); + } + return 0; +} + +static int build_hybrid_transport_data(struct vector_private *vp) +{ + if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) { + vp->form_header = &raw_form_header; + vp->verify_header = &raw_verify_header; + vp->header_size = sizeof(struct virtio_net_hdr); + vp->rx_header_size = sizeof(struct virtio_net_hdr); + vp->dev->hw_features |= + (NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO); + vp->dev->features |= + (NETIF_F_RXCSUM | NETIF_F_HW_CSUM | + NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO); + netdev_info( + vp->dev, + "tap/raw hybrid: using vnet headers for tso and tx/rx checksum" + ); + } else { + return 0; /* do not try to enable tap too if raw failed */ + } + if (uml_tap_enable_vnet_headers(vp->fds->tx_fd)) + return 0; + return -1; +} + +static int build_tap_transport_data(struct vector_private *vp) +{ + /* "Pure" tap uses the same fd for rx and tx */ + if (uml_tap_enable_vnet_headers(vp->fds->tx_fd)) { + vp->form_header = &raw_form_header; + vp->verify_header = &raw_verify_header; + vp->header_size = sizeof(struct virtio_net_hdr); + vp->rx_header_size = sizeof(struct virtio_net_hdr); + vp->dev->hw_features |= + (NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO); + vp->dev->features |= + (NETIF_F_RXCSUM | NETIF_F_HW_CSUM | + NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO); + netdev_info( + vp->dev, + "tap: using vnet headers for tso and tx/rx checksum" + ); + return 0; + } + return -1; +} + + +static int build_bess_transport_data(struct vector_private *vp) +{ + vp->form_header = NULL; + vp->verify_header = NULL; + vp->header_size = 0; + vp->rx_header_size = 0; + return 0; +} + +int build_transport_data(struct vector_private *vp) +{ + char *transport = uml_vector_fetch_arg(vp->parsed, "transport"); + + if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0) + return build_gre_transport_data(vp); + if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0) + return build_l2tpv3_transport_data(vp); + if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0) + return build_raw_transport_data(vp); + if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0) + return build_tap_transport_data(vp); + if (strncmp(transport, TRANS_HYBRID, TRANS_HYBRID_LEN) == 0) + return build_hybrid_transport_data(vp); + if (strncmp(transport, TRANS_BESS, TRANS_BESS_LEN) == 0) + return build_bess_transport_data(vp); + return 0; +} + diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c new file mode 100644 index 000000000000..2ea67e6fd067 --- /dev/null +++ b/arch/um/drivers/vector_user.c @@ -0,0 +1,941 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <stdbool.h> +#include <stdio.h> +#include <unistd.h> +#include <stdarg.h> +#include <errno.h> +#include <stddef.h> +#include <string.h> +#include <sys/ioctl.h> +#include <net/if.h> +#include <linux/if_tun.h> +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <netinet/ip.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <sys/wait.h> +#include <sys/uio.h> +#include <linux/virtio_net.h> +#include <netdb.h> +#include <stdlib.h> +#include <os.h> +#include <limits.h> +#include <um_malloc.h> +#include "vector_user.h" + +#define ID_GRE 0 +#define ID_L2TPV3 1 +#define ID_BESS 2 +#define ID_MAX 2 + +#define TOKEN_IFNAME "ifname" +#define TOKEN_SCRIPT "ifup" + +#define TRANS_RAW "raw" +#define TRANS_RAW_LEN strlen(TRANS_RAW) + +#define TRANS_FD "fd" +#define TRANS_FD_LEN strlen(TRANS_FD) + +#define TRANS_VDE "vde" +#define TRANS_VDE_LEN strlen(TRANS_VDE) + +#define VNET_HDR_FAIL "could not enable vnet headers on fd %d" +#define TUN_GET_F_FAIL "tapraw: TUNGETFEATURES failed: %s" +#define L2TPV3_BIND_FAIL "l2tpv3_open : could not bind socket err=%i" +#define UNIX_BIND_FAIL "unix_open : could not bind socket err=%i" +#define BPF_ATTACH_FAIL "Failed to attach filter size %d prog %px to %d, err %d\n" +#define BPF_DETACH_FAIL "Failed to detach filter size %d prog %px to %d, err %d\n" + +#define MAX_UN_LEN 107 + +static const char padchar[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +static const char *template = "tapXXXXXX"; + +/* This is very ugly and brute force lookup, but it is done + * only once at initialization so not worth doing hashes or + * anything more intelligent + */ + +char *uml_vector_fetch_arg(struct arglist *ifspec, char *token) +{ + int i; + + for (i = 0; i < ifspec->numargs; i++) { + if (strcmp(ifspec->tokens[i], token) == 0) + return ifspec->values[i]; + } + return NULL; + +} + +struct arglist *uml_parse_vector_ifspec(char *arg) +{ + struct arglist *result; + int pos, len; + bool parsing_token = true, next_starts = true; + + if (arg == NULL) + return NULL; + result = uml_kmalloc(sizeof(struct arglist), UM_GFP_KERNEL); + if (result == NULL) + return NULL; + result->numargs = 0; + len = strlen(arg); + for (pos = 0; pos < len; pos++) { + if (next_starts) { + if (parsing_token) { + result->tokens[result->numargs] = arg + pos; + } else { + result->values[result->numargs] = arg + pos; + result->numargs++; + } + next_starts = false; + } + if (*(arg + pos) == '=') { + if (parsing_token) + parsing_token = false; + else + goto cleanup; + next_starts = true; + (*(arg + pos)) = '\0'; + } + if (*(arg + pos) == ',') { + parsing_token = true; + next_starts = true; + (*(arg + pos)) = '\0'; + } + } + return result; +cleanup: + printk(UM_KERN_ERR "vector_setup - Couldn't parse '%s'\n", arg); + kfree(result); + return NULL; +} + +/* + * Socket/FD configuration functions. These return an structure + * of rx and tx descriptors to cover cases where these are not + * the same (f.e. read via raw socket and write via tap). + */ + +#define PATH_NET_TUN "/dev/net/tun" + + +static int create_tap_fd(char *iface) +{ + struct ifreq ifr; + int fd = -1; + int err = -ENOMEM, offload; + + fd = open(PATH_NET_TUN, O_RDWR); + if (fd < 0) { + printk(UM_KERN_ERR "uml_tap: failed to open tun device\n"); + goto tap_fd_cleanup; + } + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; + strscpy(ifr.ifr_name, iface); + + err = ioctl(fd, TUNSETIFF, (void *) &ifr); + if (err != 0) { + printk(UM_KERN_ERR "uml_tap: failed to select tap interface\n"); + goto tap_fd_cleanup; + } + + offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6; + ioctl(fd, TUNSETOFFLOAD, offload); + return fd; +tap_fd_cleanup: + if (fd >= 0) + os_close_file(fd); + return err; +} + +static int create_raw_fd(char *iface, int flags, int proto) +{ + struct ifreq ifr; + int fd = -1; + struct sockaddr_ll sock; + int err = -ENOMEM; + + fd = socket(AF_PACKET, SOCK_RAW, flags); + if (fd == -1) { + err = -errno; + goto raw_fd_cleanup; + } + memset(&ifr, 0, sizeof(ifr)); + strscpy(ifr.ifr_name, iface); + if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) { + err = -errno; + goto raw_fd_cleanup; + } + + sock.sll_family = AF_PACKET; + sock.sll_protocol = htons(proto); + sock.sll_ifindex = ifr.ifr_ifindex; + + if (bind(fd, + (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) { + err = -errno; + goto raw_fd_cleanup; + } + return fd; +raw_fd_cleanup: + printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err); + if (fd >= 0) + os_close_file(fd); + return err; +} + + +static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) +{ + int fd = -1, i; + char *iface; + struct vector_fds *result = NULL; + bool dynamic = false; + char dynamic_ifname[IFNAMSIZ]; + char *argv[] = {NULL, NULL, NULL, NULL}; + + iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); + if (iface == NULL) { + dynamic = true; + iface = dynamic_ifname; + srand(getpid()); + } + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "uml_tap: failed to allocate file descriptors\n"); + goto tap_cleanup; + } + result->rx_fd = -1; + result->tx_fd = -1; + result->remote_addr = NULL; + result->remote_addr_size = 0; + + /* TAP */ + do { + if (dynamic) { + strcpy(iface, template); + for (i = 0; i < strlen(iface); i++) { + if (iface[i] == 'X') { + iface[i] = padchar[rand() % strlen(padchar)]; + } + } + } + fd = create_tap_fd(iface); + if ((fd < 0) && (!dynamic)) { + printk(UM_KERN_ERR "uml_tap: failed to create tun interface\n"); + goto tap_cleanup; + } + result->tx_fd = fd; + result->rx_fd = fd; + } while (fd < 0); + + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); + } + + return result; +tap_cleanup: + printk(UM_KERN_ERR "user_init_tap: init failed, error %d", fd); + kfree(result); + return NULL; +} + +static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec) +{ + char *iface; + struct vector_fds *result = NULL; + char *argv[] = {NULL, NULL, NULL, NULL}; + + iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); + if (iface == NULL) { + printk(UM_KERN_ERR "uml_tap: failed to parse interface spec\n"); + goto hybrid_cleanup; + } + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "uml_tap: failed to allocate file descriptors\n"); + goto hybrid_cleanup; + } + result->rx_fd = -1; + result->tx_fd = -1; + result->remote_addr = NULL; + result->remote_addr_size = 0; + + /* TAP */ + + result->tx_fd = create_tap_fd(iface); + if (result->tx_fd < 0) { + printk(UM_KERN_ERR "uml_tap: failed to create tun interface: %i\n", result->tx_fd); + goto hybrid_cleanup; + } + + /* RAW */ + + result->rx_fd = create_raw_fd(iface, ETH_P_ALL, ETH_P_ALL); + if (result->rx_fd == -1) { + printk(UM_KERN_ERR + "uml_tap: failed to create paired raw socket: %i\n", result->rx_fd); + goto hybrid_cleanup; + } + + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); + } + return result; +hybrid_cleanup: + printk(UM_KERN_ERR "user_init_hybrid: init failed"); + kfree(result); + return NULL; +} + +static struct vector_fds *user_init_unix_fds(struct arglist *ifspec, int id) +{ + int fd = -1; + int socktype; + char *src, *dst; + struct vector_fds *result = NULL; + struct sockaddr_un *local_addr = NULL, *remote_addr = NULL; + + src = uml_vector_fetch_arg(ifspec, "src"); + dst = uml_vector_fetch_arg(ifspec, "dst"); + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "unix open:cannot allocate remote addr"); + goto unix_cleanup; + } + remote_addr = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); + if (remote_addr == NULL) { + printk(UM_KERN_ERR "unix open:cannot allocate remote addr"); + goto unix_cleanup; + } + + switch (id) { + case ID_BESS: + socktype = SOCK_SEQPACKET; + if ((src != NULL) && (strlen(src) <= MAX_UN_LEN)) { + local_addr = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); + if (local_addr == NULL) { + printk(UM_KERN_ERR "bess open:cannot allocate local addr"); + goto unix_cleanup; + } + local_addr->sun_family = AF_UNIX; + memcpy(local_addr->sun_path, src, strlen(src) + 1); + } + if ((dst == NULL) || (strlen(dst) > MAX_UN_LEN)) + goto unix_cleanup; + remote_addr->sun_family = AF_UNIX; + memcpy(remote_addr->sun_path, dst, strlen(dst) + 1); + break; + default: + printk(KERN_ERR "Unsupported unix socket type\n"); + return NULL; + } + + fd = socket(AF_UNIX, socktype, 0); + if (fd == -1) { + printk(UM_KERN_ERR + "unix open: could not open socket, error = %d", + -errno + ); + goto unix_cleanup; + } + if (local_addr != NULL) { + if (bind(fd, (struct sockaddr *) local_addr, sizeof(struct sockaddr_un))) { + printk(UM_KERN_ERR UNIX_BIND_FAIL, errno); + goto unix_cleanup; + } + } + switch (id) { + case ID_BESS: + if (connect(fd, (const struct sockaddr *) remote_addr, sizeof(struct sockaddr_un)) < 0) { + printk(UM_KERN_ERR "bess open:cannot connect to %s %i", remote_addr->sun_path, -errno); + goto unix_cleanup; + } + break; + } + result->rx_fd = fd; + result->tx_fd = fd; + result->remote_addr_size = sizeof(struct sockaddr_un); + result->remote_addr = remote_addr; + return result; +unix_cleanup: + if (fd >= 0) + os_close_file(fd); + kfree(remote_addr); + kfree(result); + return NULL; +} + +static int strtofd(const char *nptr) +{ + long fd; + char *endptr; + + if (nptr == NULL) + return -1; + + errno = 0; + fd = strtol(nptr, &endptr, 10); + if (nptr == endptr || + errno != 0 || + *endptr != '\0' || + fd < 0 || + fd > INT_MAX) { + return -1; + } + return fd; +} + +static struct vector_fds *user_init_fd_fds(struct arglist *ifspec) +{ + int fd = -1; + char *fdarg = NULL; + struct vector_fds *result = NULL; + + fdarg = uml_vector_fetch_arg(ifspec, "fd"); + fd = strtofd(fdarg); + if (fd == -1) { + printk(UM_KERN_ERR "fd open: bad or missing fd argument"); + goto fd_cleanup; + } + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "fd open: allocation failed"); + goto fd_cleanup; + } + + result->rx_fd = fd; + result->tx_fd = fd; + result->remote_addr_size = 0; + result->remote_addr = NULL; + return result; + +fd_cleanup: + if (fd >= 0) + os_close_file(fd); + kfree(result); + return NULL; +} + +/* enough char to store an int type */ +#define ENOUGH(type) ((CHAR_BIT * sizeof(type) - 1) / 3 + 2) +#define ENOUGH_OCTAL(type) ((CHAR_BIT * sizeof(type) + 2) / 3) +/* vde_plug --descr xx --port2 xx --mod2 xx --group2 xx seqpacket://NN vnl (NULL) */ +#define VDE_MAX_ARGC 12 +#define VDE_SEQPACKET_HEAD "seqpacket://" +#define VDE_SEQPACKET_HEAD_LEN (sizeof(VDE_SEQPACKET_HEAD) - 1) +#define VDE_DEFAULT_DESCRIPTION "UML" + +static struct vector_fds *user_init_vde_fds(struct arglist *ifspec) +{ + char seqpacketvnl[VDE_SEQPACKET_HEAD_LEN + ENOUGH(int) + 1]; + char *argv[VDE_MAX_ARGC] = {"vde_plug"}; + int argc = 1; + int rv; + int sv[2]; + struct vector_fds *result = NULL; + + char *vnl = uml_vector_fetch_arg(ifspec,"vnl"); + char *descr = uml_vector_fetch_arg(ifspec,"descr"); + char *port = uml_vector_fetch_arg(ifspec,"port"); + char *mode = uml_vector_fetch_arg(ifspec,"mode"); + char *group = uml_vector_fetch_arg(ifspec,"group"); + if (descr == NULL) descr = VDE_DEFAULT_DESCRIPTION; + + argv[argc++] = "--descr"; + argv[argc++] = descr; + if (port != NULL) { + argv[argc++] = "--port2"; + argv[argc++] = port; + } + if (mode != NULL) { + argv[argc++] = "--mod2"; + argv[argc++] = mode; + } + if (group != NULL) { + argv[argc++] = "--group2"; + argv[argc++] = group; + } + argv[argc++] = seqpacketvnl; + argv[argc++] = vnl; + argv[argc++] = NULL; + + rv = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv); + if (rv < 0) { + printk(UM_KERN_ERR "vde: seqpacket socketpair err %d", -errno); + return NULL; + } + rv = os_set_exec_close(sv[0]); + if (rv < 0) { + printk(UM_KERN_ERR "vde: seqpacket socketpair cloexec err %d", -errno); + goto vde_cleanup_sv; + } + snprintf(seqpacketvnl, sizeof(seqpacketvnl), VDE_SEQPACKET_HEAD "%d", sv[1]); + + run_helper(NULL, NULL, argv); + + close(sv[1]); + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "fd open: allocation failed"); + goto vde_cleanup; + } + + result->rx_fd = sv[0]; + result->tx_fd = sv[0]; + result->remote_addr_size = 0; + result->remote_addr = NULL; + return result; + +vde_cleanup_sv: + close(sv[1]); +vde_cleanup: + close(sv[0]); + return NULL; +} + +static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) +{ + int rxfd = -1, txfd = -1; + int err = -ENOMEM; + char *iface; + struct vector_fds *result = NULL; + char *argv[] = {NULL, NULL, NULL, NULL}; + + iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); + if (iface == NULL) + goto raw_cleanup; + + rxfd = create_raw_fd(iface, ETH_P_ALL, ETH_P_ALL); + if (rxfd == -1) { + err = -errno; + goto raw_cleanup; + } + txfd = create_raw_fd(iface, 0, ETH_P_IP); /* Turn off RX on this fd */ + if (txfd == -1) { + err = -errno; + goto raw_cleanup; + } + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result != NULL) { + result->rx_fd = rxfd; + result->tx_fd = txfd; + result->remote_addr = NULL; + result->remote_addr_size = 0; + } + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); + } + return result; +raw_cleanup: + printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err); + kfree(result); + return NULL; +} + + +bool uml_raw_enable_qdisc_bypass(int fd) +{ + int optval = 1; + + if (setsockopt(fd, + SOL_PACKET, PACKET_QDISC_BYPASS, + &optval, sizeof(optval)) != 0) { + return false; + } + return true; +} + +bool uml_raw_enable_vnet_headers(int fd) +{ + int optval = 1; + + if (setsockopt(fd, + SOL_PACKET, PACKET_VNET_HDR, + &optval, sizeof(optval)) != 0) { + printk(UM_KERN_INFO VNET_HDR_FAIL, fd); + return false; + } + return true; +} +bool uml_tap_enable_vnet_headers(int fd) +{ + unsigned int features; + int len = sizeof(struct virtio_net_hdr); + + if (ioctl(fd, TUNGETFEATURES, &features) == -1) { + printk(UM_KERN_INFO TUN_GET_F_FAIL, strerror(errno)); + return false; + } + if ((features & IFF_VNET_HDR) == 0) { + printk(UM_KERN_INFO "tapraw: No VNET HEADER support"); + return false; + } + ioctl(fd, TUNSETVNETHDRSZ, &len); + return true; +} + +static struct vector_fds *user_init_socket_fds(struct arglist *ifspec, int id) +{ + int err = -ENOMEM; + int fd = -1, gairet; + struct addrinfo srchints; + struct addrinfo dsthints; + bool v6, udp; + char *value; + char *src, *dst, *srcport, *dstport; + struct addrinfo *gairesult = NULL; + struct vector_fds *result = NULL; + + + value = uml_vector_fetch_arg(ifspec, "v6"); + v6 = false; + udp = false; + if (value != NULL) { + if (strtol((const char *) value, NULL, 10) > 0) + v6 = true; + } + + value = uml_vector_fetch_arg(ifspec, "udp"); + if (value != NULL) { + if (strtol((const char *) value, NULL, 10) > 0) + udp = true; + } + src = uml_vector_fetch_arg(ifspec, "src"); + dst = uml_vector_fetch_arg(ifspec, "dst"); + srcport = uml_vector_fetch_arg(ifspec, "srcport"); + dstport = uml_vector_fetch_arg(ifspec, "dstport"); + + memset(&dsthints, 0, sizeof(dsthints)); + + if (v6) + dsthints.ai_family = AF_INET6; + else + dsthints.ai_family = AF_INET; + + switch (id) { + case ID_GRE: + dsthints.ai_socktype = SOCK_RAW; + dsthints.ai_protocol = IPPROTO_GRE; + break; + case ID_L2TPV3: + if (udp) { + dsthints.ai_socktype = SOCK_DGRAM; + dsthints.ai_protocol = 0; + } else { + dsthints.ai_socktype = SOCK_RAW; + dsthints.ai_protocol = IPPROTO_L2TP; + } + break; + default: + printk(KERN_ERR "Unsupported socket type\n"); + return NULL; + } + memcpy(&srchints, &dsthints, sizeof(struct addrinfo)); + + gairet = getaddrinfo(src, srcport, &dsthints, &gairesult); + if ((gairet != 0) || (gairesult == NULL)) { + printk(UM_KERN_ERR + "socket_open : could not resolve src, error = %s", + gai_strerror(gairet) + ); + return NULL; + } + fd = socket(gairesult->ai_family, + gairesult->ai_socktype, gairesult->ai_protocol); + if (fd == -1) { + printk(UM_KERN_ERR + "socket_open : could not open socket, error = %d", + -errno + ); + goto cleanup; + } + if (bind(fd, + (struct sockaddr *) gairesult->ai_addr, + gairesult->ai_addrlen)) { + printk(UM_KERN_ERR L2TPV3_BIND_FAIL, errno); + goto cleanup; + } + + if (gairesult != NULL) + freeaddrinfo(gairesult); + + gairesult = NULL; + + gairet = getaddrinfo(dst, dstport, &dsthints, &gairesult); + if ((gairet != 0) || (gairesult == NULL)) { + printk(UM_KERN_ERR + "socket_open : could not resolve dst, error = %s", + gai_strerror(gairet) + ); + return NULL; + } + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result != NULL) { + result->rx_fd = fd; + result->tx_fd = fd; + result->remote_addr = uml_kmalloc( + gairesult->ai_addrlen, UM_GFP_KERNEL); + if (result->remote_addr == NULL) + goto cleanup; + result->remote_addr_size = gairesult->ai_addrlen; + memcpy( + result->remote_addr, + gairesult->ai_addr, + gairesult->ai_addrlen + ); + } + freeaddrinfo(gairesult); + return result; +cleanup: + if (gairesult != NULL) + freeaddrinfo(gairesult); + printk(UM_KERN_ERR "user_init_socket: init failed, error %d", err); + if (fd >= 0) + os_close_file(fd); + if (result != NULL) { + kfree(result->remote_addr); + kfree(result); + } + return NULL; +} + +struct vector_fds *uml_vector_user_open( + int unit, + struct arglist *parsed +) +{ + char *transport; + + if (parsed == NULL) { + printk(UM_KERN_ERR "no parsed config for unit %d\n", unit); + return NULL; + } + transport = uml_vector_fetch_arg(parsed, "transport"); + if (transport == NULL) { + printk(UM_KERN_ERR "missing transport for unit %d\n", unit); + return NULL; + } + if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0) + return user_init_raw_fds(parsed); + if (strncmp(transport, TRANS_HYBRID, TRANS_HYBRID_LEN) == 0) + return user_init_hybrid_fds(parsed); + if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0) + return user_init_tap_fds(parsed); + if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0) + return user_init_socket_fds(parsed, ID_GRE); + if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0) + return user_init_socket_fds(parsed, ID_L2TPV3); + if (strncmp(transport, TRANS_BESS, TRANS_BESS_LEN) == 0) + return user_init_unix_fds(parsed, ID_BESS); + if (strncmp(transport, TRANS_FD, TRANS_FD_LEN) == 0) + return user_init_fd_fds(parsed); + if (strncmp(transport, TRANS_VDE, TRANS_VDE_LEN) == 0) + return user_init_vde_fds(parsed); + return NULL; +} + + +int uml_vector_sendmsg(int fd, void *hdr, int flags) +{ + int n; + + CATCH_EINTR(n = sendmsg(fd, (struct msghdr *) hdr, flags)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_recvmsg(int fd, void *hdr, int flags) +{ + int n; + struct msghdr *msg = (struct msghdr *) hdr; + + CATCH_EINTR(n = readv(fd, msg->msg_iov, msg->msg_iovlen)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_writev(int fd, void *hdr, int iovcount) +{ + int n; + + CATCH_EINTR(n = writev(fd, (struct iovec *) hdr, iovcount)); + if ((n < 0) && ((errno == EAGAIN) || (errno == ENOBUFS))) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_sendmmsg( + int fd, + void *msgvec, + unsigned int vlen, + unsigned int flags) +{ + int n; + + CATCH_EINTR(n = sendmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags)); + if ((n < 0) && ((errno == EAGAIN) || (errno == ENOBUFS))) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_recvmmsg( + int fd, + void *msgvec, + unsigned int vlen, + unsigned int flags) +{ + int n; + + CATCH_EINTR( + n = recvmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags, 0)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} +int uml_vector_attach_bpf(int fd, void *bpf) +{ + struct sock_fprog *prog = bpf; + + int err = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, bpf, sizeof(struct sock_fprog)); + + if (err < 0) + printk(KERN_ERR BPF_ATTACH_FAIL, prog->len, prog->filter, fd, -errno); + return err; +} + +int uml_vector_detach_bpf(int fd, void *bpf) +{ + struct sock_fprog *prog = bpf; + + int err = setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER, bpf, sizeof(struct sock_fprog)); + if (err < 0) + printk(KERN_ERR BPF_DETACH_FAIL, prog->len, prog->filter, fd, -errno); + return err; +} +void *uml_vector_default_bpf(const void *mac) +{ + struct sock_filter *bpf; + uint32_t *mac1 = (uint32_t *)(mac + 2); + uint16_t *mac2 = (uint16_t *) mac; + struct sock_fprog *bpf_prog; + + bpf_prog = uml_kmalloc(sizeof(struct sock_fprog), UM_GFP_KERNEL); + if (bpf_prog) { + bpf_prog->len = DEFAULT_BPF_LEN; + bpf_prog->filter = NULL; + } else { + return NULL; + } + bpf = uml_kmalloc( + sizeof(struct sock_filter) * DEFAULT_BPF_LEN, UM_GFP_KERNEL); + if (bpf) { + bpf_prog->filter = bpf; + /* ld [8] */ + bpf[0] = (struct sock_filter){ 0x20, 0, 0, 0x00000008 }; + /* jeq #0xMAC[2-6] jt 2 jf 5*/ + bpf[1] = (struct sock_filter){ 0x15, 0, 3, ntohl(*mac1)}; + /* ldh [6] */ + bpf[2] = (struct sock_filter){ 0x28, 0, 0, 0x00000006 }; + /* jeq #0xMAC[0-1] jt 4 jf 5 */ + bpf[3] = (struct sock_filter){ 0x15, 0, 1, ntohs(*mac2)}; + /* ret #0 */ + bpf[4] = (struct sock_filter){ 0x6, 0, 0, 0x00000000 }; + /* ret #0x40000 */ + bpf[5] = (struct sock_filter){ 0x6, 0, 0, 0x00040000 }; + } else { + kfree(bpf_prog); + bpf_prog = NULL; + } + return bpf_prog; +} + +/* Note - this function requires a valid mac being passed as an arg */ + +void *uml_vector_user_bpf(char *filename) +{ + struct sock_filter *bpf; + struct sock_fprog *bpf_prog; + struct stat statbuf; + int res, ffd = -1; + + if (filename == NULL) + return NULL; + + if (stat(filename, &statbuf) < 0) { + printk(KERN_ERR "Error %d reading bpf file", -errno); + return false; + } + bpf_prog = uml_kmalloc(sizeof(struct sock_fprog), UM_GFP_KERNEL); + if (bpf_prog == NULL) { + printk(KERN_ERR "Failed to allocate bpf prog buffer"); + return NULL; + } + bpf_prog->len = statbuf.st_size / sizeof(struct sock_filter); + bpf_prog->filter = NULL; + ffd = os_open_file(filename, of_read(OPENFLAGS()), 0); + if (ffd < 0) { + printk(KERN_ERR "Error %d opening bpf file", -errno); + goto bpf_failed; + } + bpf = uml_kmalloc(statbuf.st_size, UM_GFP_KERNEL); + if (bpf == NULL) { + printk(KERN_ERR "Failed to allocate bpf buffer"); + goto bpf_failed; + } + bpf_prog->filter = bpf; + res = os_read_file(ffd, bpf, statbuf.st_size); + if (res < statbuf.st_size) { + printk(KERN_ERR "Failed to read bpf program %s, error %d", filename, res); + kfree(bpf); + goto bpf_failed; + } + os_close_file(ffd); + return bpf_prog; +bpf_failed: + if (ffd > 0) + os_close_file(ffd); + kfree(bpf_prog); + return NULL; +} diff --git a/arch/um/drivers/vector_user.h b/arch/um/drivers/vector_user.h new file mode 100644 index 000000000000..59ed5f9e6e41 --- /dev/null +++ b/arch/um/drivers/vector_user.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#ifndef __UM_VECTOR_USER_H +#define __UM_VECTOR_USER_H + +#define MAXVARGS 20 + +#define TOKEN_IFNAME "ifname" + +#define TRANS_RAW "raw" +#define TRANS_RAW_LEN strlen(TRANS_RAW) + +#define TRANS_TAP "tap" +#define TRANS_TAP_LEN strlen(TRANS_TAP) + +#define TRANS_GRE "gre" +#define TRANS_GRE_LEN strlen(TRANS_GRE) + +#define TRANS_L2TPV3 "l2tpv3" +#define TRANS_L2TPV3_LEN strlen(TRANS_L2TPV3) + +#define TRANS_HYBRID "hybrid" +#define TRANS_HYBRID_LEN strlen(TRANS_HYBRID) + +#define TRANS_BESS "bess" +#define TRANS_BESS_LEN strlen(TRANS_BESS) + +#define DEFAULT_BPF_LEN 6 + +#ifndef IPPROTO_GRE +#define IPPROTO_GRE 0x2F +#endif + +#define GRE_MODE_CHECKSUM cpu_to_be16(8 << 12) /* checksum */ +#define GRE_MODE_RESERVED cpu_to_be16(4 << 12) /* unused */ +#define GRE_MODE_KEY cpu_to_be16(2 << 12) /* KEY present */ +#define GRE_MODE_SEQUENCE cpu_to_be16(1 << 12) /* sequence */ + +#define GRE_IRB cpu_to_be16(0x6558) + +#define L2TPV3_DATA_PACKET 0x30000 + +/* IANA-assigned IP protocol ID for L2TPv3 */ + +#ifndef IPPROTO_L2TP +#define IPPROTO_L2TP 0x73 +#endif + +struct arglist { + int numargs; + char *tokens[MAXVARGS]; + char *values[MAXVARGS]; +}; + +/* Separating read and write FDs allows us to have different + * rx and tx method. Example - read tap via raw socket using + * recvmmsg, write using legacy tap write calls + */ + +struct vector_fds { + int rx_fd; + int tx_fd; + void *remote_addr; + int remote_addr_size; +}; + +#define VECTOR_READ 1 + +extern struct arglist *uml_parse_vector_ifspec(char *arg); + +extern struct vector_fds *uml_vector_user_open( + int unit, + struct arglist *parsed +); + +extern char *uml_vector_fetch_arg( + struct arglist *ifspec, + char *token +); + +extern int uml_vector_recvmsg(int fd, void *hdr, int flags); +extern int uml_vector_sendmsg(int fd, void *hdr, int flags); +extern int uml_vector_writev(int fd, void *hdr, int iovcount); +extern int uml_vector_sendmmsg( + int fd, void *msgvec, + unsigned int vlen, + unsigned int flags +); +extern int uml_vector_recvmmsg( + int fd, + void *msgvec, + unsigned int vlen, + unsigned int flags +); +extern void *uml_vector_default_bpf(const void *mac); +extern void *uml_vector_user_bpf(char *filename); +extern int uml_vector_attach_bpf(int fd, void *bpf); +extern int uml_vector_detach_bpf(int fd, void *bpf); +extern bool uml_raw_enable_qdisc_bypass(int fd); +extern bool uml_raw_enable_vnet_headers(int fd); +extern bool uml_tap_enable_vnet_headers(int fd); + + +#endif diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c new file mode 100644 index 000000000000..915812a79bfc --- /dev/null +++ b/arch/um/drivers/vfio_kern.c @@ -0,0 +1,708 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + */ + +#define pr_fmt(fmt) "vfio-uml: " fmt + +#include <linux/module.h> +#include <linux/logic_iomem.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/string.h> +#include <linux/unaligned.h> +#include <irq_kern.h> +#include <init.h> +#include <os.h> + +#include "mconsole_kern.h" +#include "virt-pci.h" +#include "vfio_user.h" + +#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev) + +struct uml_vfio_intr_ctx { + struct uml_vfio_device *dev; + int irq; +}; + +struct uml_vfio_device { + const char *name; + int group; + + struct um_pci_device pdev; + struct uml_vfio_user_device udev; + struct uml_vfio_intr_ctx *intr_ctx; + + int msix_cap; + int msix_bar; + int msix_offset; + int msix_size; + u32 *msix_data; + + struct list_head list; +}; + +struct uml_vfio_group { + int id; + int fd; + int users; + struct list_head list; +}; + +static struct { + int fd; + int users; +} uml_vfio_container = { .fd = -1 }; +static DEFINE_MUTEX(uml_vfio_container_mtx); + +static LIST_HEAD(uml_vfio_groups); +static DEFINE_MUTEX(uml_vfio_groups_mtx); + +static LIST_HEAD(uml_vfio_devices); +static DEFINE_MUTEX(uml_vfio_devices_mtx); + +static int uml_vfio_set_container(int group_fd) +{ + int err; + + guard(mutex)(¨_vfio_container_mtx); + + err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd); + if (err) + return err; + + uml_vfio_container.users++; + if (uml_vfio_container.users > 1) + return 0; + + err = uml_vfio_user_setup_iommu(uml_vfio_container.fd); + if (err) { + uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd); + uml_vfio_container.users--; + } + return err; +} + +static void uml_vfio_unset_container(int group_fd) +{ + guard(mutex)(¨_vfio_container_mtx); + + uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd); + uml_vfio_container.users--; +} + +static int uml_vfio_open_group(int group_id) +{ + struct uml_vfio_group *group; + int err; + + guard(mutex)(¨_vfio_groups_mtx); + + list_for_each_entry(group, ¨_vfio_groups, list) { + if (group->id == group_id) { + group->users++; + return group->fd; + } + } + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return -ENOMEM; + + group->fd = uml_vfio_user_open_group(group_id); + if (group->fd < 0) { + err = group->fd; + goto free_group; + } + + err = uml_vfio_set_container(group->fd); + if (err) + goto close_group; + + group->id = group_id; + group->users = 1; + + list_add(&group->list, ¨_vfio_groups); + + return group->fd; + +close_group: + os_close_file(group->fd); +free_group: + kfree(group); + return err; +} + +static int uml_vfio_release_group(int group_fd) +{ + struct uml_vfio_group *group; + + guard(mutex)(¨_vfio_groups_mtx); + + list_for_each_entry(group, ¨_vfio_groups, list) { + if (group->fd == group_fd) { + group->users--; + if (group->users == 0) { + uml_vfio_unset_container(group_fd); + os_close_file(group_fd); + list_del(&group->list); + kfree(group); + } + return 0; + } + } + + return -ENOENT; +} + +static irqreturn_t uml_vfio_interrupt(int unused, void *opaque) +{ + struct uml_vfio_intr_ctx *ctx = opaque; + struct uml_vfio_device *dev = ctx->dev; + int index = ctx - dev->intr_ctx; + int irqfd = dev->udev.irqfd[index]; + int irq = dev->msix_data[index]; + uint64_t v; + int r; + + do { + r = os_read_file(irqfd, &v, sizeof(v)); + if (r == sizeof(v)) + generic_handle_irq(irq); + } while (r == sizeof(v) || r == -EINTR); + WARN(r != -EAGAIN, "read returned %d\n", r); + + return IRQ_HANDLED; +} + +static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index) +{ + struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index]; + int err, irqfd; + + if (ctx->irq >= 0) + return 0; + + irqfd = uml_vfio_user_activate_irq(&dev->udev, index); + if (irqfd < 0) + return irqfd; + + ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ, + uml_vfio_interrupt, 0, + "vfio-uml", ctx); + if (ctx->irq < 0) { + err = ctx->irq; + goto deactivate; + } + + err = add_sigio_fd(irqfd); + if (err) + goto free_irq; + + return 0; + +free_irq: + um_free_irq(ctx->irq, ctx); + ctx->irq = -1; +deactivate: + uml_vfio_user_deactivate_irq(&dev->udev, index); + return err; +} + +static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index) +{ + struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index]; + + if (ctx->irq >= 0) { + ignore_sigio_fd(dev->udev.irqfd[index]); + um_free_irq(ctx->irq, ctx); + uml_vfio_user_deactivate_irq(&dev->udev, index); + ctx->irq = -1; + } + return 0; +} + +static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + /* + * Here, we handle only the operations we care about, + * ignoring the rest. + */ + if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) { + switch (val & ~PCI_MSIX_FLAGS_QSIZE) { + case PCI_MSIX_FLAGS_ENABLE: + case 0: + return uml_vfio_user_update_irqs(&dev->udev); + } + } + return 0; +} + +static int uml_vfio_update_msix_table(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + int index; + + /* + * Here, we handle only the operations we care about, + * ignoring the rest. + */ + offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA; + + if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0) + return 0; + + index = offset / PCI_MSIX_ENTRY_SIZE; + if (index >= dev->udev.irq_count) + return -EINVAL; + + dev->msix_data[index] = val; + + return val ? uml_vfio_activate_irq(dev, index) : + uml_vfio_deactivate_irq(dev, index); +} + +static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev, + unsigned int offset, int size) +{ + u8 data[8]; + + memset(data, 0xff, sizeof(data)); + + if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size)) + return ULONG_MAX; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev, + unsigned int offset, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + return __uml_vfio_cfgspace_read(dev, offset, size); +} + +static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + u8 data[8]; + + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size)); +} + +static void uml_vfio_cfgspace_write(struct um_pci_device *pdev, + unsigned int offset, int size, + unsigned long val) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF && + offset + size > dev->msix_cap) + WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val)); + + __uml_vfio_cfgspace_write(dev, offset, size, val); +} + +static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar, + void *buffer, unsigned int offset, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + memset(buffer, 0xff, size); + uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size); +} + +static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar, + unsigned int offset, int size) +{ + u8 data[8]; + + uml_vfio_bar_copy_from(pdev, bar, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar, + unsigned int offset, const void *buffer, + int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size); +} + +static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar, + unsigned int offset, int size, + unsigned long val) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + u8 data[8]; + + if (bar == dev->msix_bar && offset + size > dev->msix_offset && + offset < dev->msix_offset + dev->msix_size) + WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val)); + + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + uml_vfio_bar_copy_to(pdev, bar, offset, data, size); +} + +static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar, + unsigned int offset, u8 value, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + int i; + + for (i = 0; i < size; i++) + uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1); +} + +static const struct um_pci_ops uml_vfio_um_pci_ops = { + .cfgspace_read = uml_vfio_cfgspace_read, + .cfgspace_write = uml_vfio_cfgspace_write, + .bar_read = uml_vfio_bar_read, + .bar_write = uml_vfio_bar_write, + .bar_copy_from = uml_vfio_bar_copy_from, + .bar_copy_to = uml_vfio_bar_copy_to, + .bar_set = uml_vfio_bar_set, +}; + +static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap) +{ + u8 id, pos; + u16 ent; + int ttl = 48; /* PCI_FIND_CAP_TTL */ + + pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos)); + + while (pos && ttl--) { + ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent)); + + id = ent & 0xff; + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = ent >> 8; + } + + return 0; +} + +static int uml_vfio_read_msix_table(struct uml_vfio_device *dev) +{ + unsigned int off; + u16 flags; + u32 tbl; + + off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX); + if (!off) + return -ENOTSUPP; + + dev->msix_cap = off; + + tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl)); + flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags)); + + dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR; + dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET; + dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE; + + dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL); + if (!dev->msix_data) + return -ENOMEM; + + return 0; +} + +static void uml_vfio_open_device(struct uml_vfio_device *dev) +{ + struct uml_vfio_intr_ctx *ctx; + int err, group_id, i; + + group_id = uml_vfio_user_get_group_id(dev->name); + if (group_id < 0) { + pr_err("Failed to get group id (%s), error %d\n", + dev->name, group_id); + goto free_dev; + } + + dev->group = uml_vfio_open_group(group_id); + if (dev->group < 0) { + pr_err("Failed to open group %d (%s), error %d\n", + group_id, dev->name, dev->group); + goto free_dev; + } + + err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name); + if (err) { + pr_err("Failed to setup device (%s), error %d\n", + dev->name, err); + goto release_group; + } + + err = uml_vfio_read_msix_table(dev); + if (err) { + pr_err("Failed to read MSI-X table (%s), error %d\n", + dev->name, err); + goto teardown_udev; + } + + dev->intr_ctx = kmalloc_array(dev->udev.irq_count, + sizeof(struct uml_vfio_intr_ctx), + GFP_KERNEL); + if (!dev->intr_ctx) { + pr_err("Failed to allocate interrupt context (%s)\n", + dev->name); + goto free_msix; + } + + for (i = 0; i < dev->udev.irq_count; i++) { + ctx = &dev->intr_ctx[i]; + ctx->dev = dev; + ctx->irq = -1; + } + + dev->pdev.ops = ¨_vfio_um_pci_ops; + + err = um_pci_device_register(&dev->pdev); + if (err) { + pr_err("Failed to register UM PCI device (%s), error %d\n", + dev->name, err); + goto free_intr_ctx; + } + + return; + +free_intr_ctx: + kfree(dev->intr_ctx); +free_msix: + kfree(dev->msix_data); +teardown_udev: + uml_vfio_user_teardown_device(&dev->udev); +release_group: + uml_vfio_release_group(dev->group); +free_dev: + list_del(&dev->list); + kfree(dev->name); + kfree(dev); +} + +static void uml_vfio_release_device(struct uml_vfio_device *dev) +{ + int i; + + for (i = 0; i < dev->udev.irq_count; i++) + uml_vfio_deactivate_irq(dev, i); + uml_vfio_user_update_irqs(&dev->udev); + + um_pci_device_unregister(&dev->pdev); + kfree(dev->intr_ctx); + kfree(dev->msix_data); + uml_vfio_user_teardown_device(&dev->udev); + uml_vfio_release_group(dev->group); + list_del(&dev->list); + kfree(dev->name); + kfree(dev); +} + +static struct uml_vfio_device *uml_vfio_find_device(const char *device) +{ + struct uml_vfio_device *dev; + + list_for_each_entry(dev, ¨_vfio_devices, list) { + if (!strcmp(dev->name, device)) + return dev; + } + return NULL; +} + +static struct uml_vfio_device *uml_vfio_add_device(const char *device) +{ + struct uml_vfio_device *dev; + int fd; + + guard(mutex)(¨_vfio_devices_mtx); + + if (uml_vfio_container.fd < 0) { + fd = uml_vfio_user_open_container(); + if (fd < 0) + return ERR_PTR(fd); + uml_vfio_container.fd = fd; + } + + if (uml_vfio_find_device(device)) + return ERR_PTR(-EEXIST); + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->name = kstrdup(device, GFP_KERNEL); + if (!dev->name) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + list_add_tail(&dev->list, ¨_vfio_devices); + return dev; +} + +static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp) +{ + struct uml_vfio_device *dev; + + dev = uml_vfio_add_device(device); + if (IS_ERR(dev)) + return PTR_ERR(dev); + return 0; +} + +static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp) +{ + return 0; +} + +static const struct kernel_param_ops uml_vfio_cmdline_param_ops = { + .set = uml_vfio_cmdline_set, + .get = uml_vfio_cmdline_get, +}; + +device_param_cb(device, ¨_vfio_cmdline_param_ops, NULL, 0400); +__uml_help(uml_vfio_cmdline_param_ops, +"vfio_uml.device=<domain:bus:slot.function>\n" +" Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n" +" capable devices are supported, and it is assumed that drivers will\n" +" use MSI-X. This parameter can be specified multiple times to pass\n" +" through multiple PCI devices to UML.\n\n" +); + +static int uml_vfio_mc_config(char *str, char **error_out) +{ + struct uml_vfio_device *dev; + + if (*str != '=') { + *error_out = "Invalid config"; + return -EINVAL; + } + str += 1; + + dev = uml_vfio_add_device(str); + if (IS_ERR(dev)) + return PTR_ERR(dev); + uml_vfio_open_device(dev); + return 0; +} + +static int uml_vfio_mc_id(char **str, int *start_out, int *end_out) +{ + return -EOPNOTSUPP; +} + +static int uml_vfio_mc_remove(int n, char **error_out) +{ + return -EOPNOTSUPP; +} + +static struct mc_device uml_vfio_mc = { + .list = LIST_HEAD_INIT(uml_vfio_mc.list), + .name = "vfio_uml.device", + .config = uml_vfio_mc_config, + .get_config = NULL, + .id = uml_vfio_mc_id, + .remove = uml_vfio_mc_remove, +}; + +static int __init uml_vfio_init(void) +{ + struct uml_vfio_device *dev, *n; + + sigio_broken(); + + /* If the opening fails, the device will be released. */ + list_for_each_entry_safe(dev, n, ¨_vfio_devices, list) + uml_vfio_open_device(dev); + + mconsole_register_dev(¨_vfio_mc); + + return 0; +} +late_initcall(uml_vfio_init); + +static void __exit uml_vfio_exit(void) +{ + struct uml_vfio_device *dev, *n; + + list_for_each_entry_safe(dev, n, ¨_vfio_devices, list) + uml_vfio_release_device(dev); + + if (uml_vfio_container.fd >= 0) + os_close_file(uml_vfio_container.fd); +} +module_exit(uml_vfio_exit); diff --git a/arch/um/drivers/vfio_user.c b/arch/um/drivers/vfio_user.c new file mode 100644 index 000000000000..6a45d8e14582 --- /dev/null +++ b/arch/um/drivers/vfio_user.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + */ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/eventfd.h> +#include <linux/limits.h> +#include <linux/vfio.h> +#include <linux/pci_regs.h> +#include <as-layout.h> +#include <um_malloc.h> + +#include "vfio_user.h" + +int uml_vfio_user_open_container(void) +{ + int r, fd; + + fd = open("/dev/vfio/vfio", O_RDWR); + if (fd < 0) + return -errno; + + r = ioctl(fd, VFIO_GET_API_VERSION); + if (r != VFIO_API_VERSION) { + r = r < 0 ? -errno : -EINVAL; + goto error; + } + + r = ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU); + if (r <= 0) { + r = r < 0 ? -errno : -EINVAL; + goto error; + } + + return fd; + +error: + close(fd); + return r; +} + +int uml_vfio_user_setup_iommu(int container) +{ + /* + * This is a bit tricky. See the big comment in + * vhost_user_set_mem_table() in virtio_uml.c. + */ + unsigned long reserved = uml_reserved - uml_physmem; + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = uml_reserved, + .iova = reserved, + .size = physmem_size - reserved, + }; + + if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0) + return -errno; + + if (ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map) < 0) + return -errno; + + return 0; +} + +int uml_vfio_user_get_group_id(const char *device) +{ + char *path, *buf, *end; + const char *name; + int r; + + path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); + if (!path) + return -ENOMEM; + + sprintf(path, "/sys/bus/pci/devices/%s/iommu_group", device); + + buf = uml_kmalloc(PATH_MAX + 1, UM_GFP_KERNEL); + if (!buf) { + r = -ENOMEM; + goto free_path; + } + + r = readlink(path, buf, PATH_MAX); + if (r < 0) { + r = -errno; + goto free_buf; + } + buf[r] = '\0'; + + name = basename(buf); + + r = strtoul(name, &end, 10); + if (*end != '\0' || end == name) { + r = -EINVAL; + goto free_buf; + } + +free_buf: + kfree(buf); +free_path: + kfree(path); + return r; +} + +int uml_vfio_user_open_group(int group_id) +{ + char *path; + int fd; + + path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); + if (!path) + return -ENOMEM; + + sprintf(path, "/dev/vfio/%d", group_id); + + fd = open(path, O_RDWR); + if (fd < 0) { + fd = -errno; + goto out; + } + +out: + kfree(path); + return fd; +} + +int uml_vfio_user_set_container(int container, int group) +{ + if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) < 0) + return -errno; + return 0; +} + +int uml_vfio_user_unset_container(int container, int group) +{ + if (ioctl(group, VFIO_GROUP_UNSET_CONTAINER, &container) < 0) + return -errno; + return 0; +} + +static int vfio_set_irqs(int device, int start, int count, int *irqfd) +{ + struct vfio_irq_set *irq_set; + int argsz = sizeof(*irq_set) + sizeof(*irqfd) * count; + int err = 0; + + irq_set = uml_kmalloc(argsz, UM_GFP_KERNEL); + if (!irq_set) + return -ENOMEM; + + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = start; + irq_set->count = count; + memcpy(irq_set->data, irqfd, sizeof(*irqfd) * count); + + if (ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set) < 0) { + err = -errno; + goto out; + } + +out: + kfree(irq_set); + return err; +} + +int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev, + int group, const char *device) +{ + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + int err, i; + + dev->device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, device); + if (dev->device < 0) + return -errno; + + if (ioctl(dev->device, VFIO_DEVICE_GET_INFO, &device_info) < 0) { + err = -errno; + goto close_device; + } + + dev->num_regions = device_info.num_regions; + if (dev->num_regions > VFIO_PCI_CONFIG_REGION_INDEX + 1) + dev->num_regions = VFIO_PCI_CONFIG_REGION_INDEX + 1; + + dev->region = uml_kmalloc(sizeof(*dev->region) * dev->num_regions, + UM_GFP_KERNEL); + if (!dev->region) { + err = -ENOMEM; + goto close_device; + } + + for (i = 0; i < dev->num_regions; i++) { + struct vfio_region_info region = { + .argsz = sizeof(region), + .index = i, + }; + if (ioctl(dev->device, VFIO_DEVICE_GET_REGION_INFO, ®ion) < 0) { + err = -errno; + goto free_region; + } + dev->region[i].size = region.size; + dev->region[i].offset = region.offset; + } + + /* Only MSI-X is supported currently. */ + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; + if (ioctl(dev->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) { + err = -errno; + goto free_region; + } + + dev->irq_count = irq_info.count; + + dev->irqfd = uml_kmalloc(sizeof(int) * dev->irq_count, UM_GFP_KERNEL); + if (!dev->irqfd) { + err = -ENOMEM; + goto free_region; + } + + memset(dev->irqfd, -1, sizeof(int) * dev->irq_count); + + err = vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd); + if (err) + goto free_irqfd; + + return 0; + +free_irqfd: + kfree(dev->irqfd); +free_region: + kfree(dev->region); +close_device: + close(dev->device); + return err; +} + +void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev) +{ + kfree(dev->irqfd); + kfree(dev->region); + close(dev->device); +} + +int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index) +{ + int irqfd; + + irqfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (irqfd < 0) + return -errno; + + dev->irqfd[index] = irqfd; + return irqfd; +} + +void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index) +{ + close(dev->irqfd[index]); + dev->irqfd[index] = -1; +} + +int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev) +{ + return vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd); +} + +static int vfio_region_read(struct uml_vfio_user_device *dev, unsigned int index, + uint64_t offset, void *buf, uint64_t size) +{ + if (index >= dev->num_regions || offset + size > dev->region[index].size) + return -EINVAL; + + if (pread(dev->device, buf, size, dev->region[index].offset + offset) < 0) + return -errno; + + return 0; +} + +static int vfio_region_write(struct uml_vfio_user_device *dev, unsigned int index, + uint64_t offset, const void *buf, uint64_t size) +{ + if (index >= dev->num_regions || offset + size > dev->region[index].size) + return -EINVAL; + + if (pwrite(dev->device, buf, size, dev->region[index].offset + offset) < 0) + return -errno; + + return 0; +} + +int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev, + unsigned int offset, void *buf, int size) +{ + return vfio_region_read(dev, VFIO_PCI_CONFIG_REGION_INDEX, + offset, buf, size); +} + +int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev, + unsigned int offset, const void *buf, int size) +{ + return vfio_region_write(dev, VFIO_PCI_CONFIG_REGION_INDEX, + offset, buf, size); +} + +int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, void *buf, int size) +{ + return vfio_region_read(dev, bar, offset, buf, size); +} + +int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, const void *buf, int size) +{ + return vfio_region_write(dev, bar, offset, buf, size); +} diff --git a/arch/um/drivers/vfio_user.h b/arch/um/drivers/vfio_user.h new file mode 100644 index 000000000000..75535e05059b --- /dev/null +++ b/arch/um/drivers/vfio_user.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_VFIO_USER_H +#define __UM_VFIO_USER_H + +struct uml_vfio_user_device { + int device; + + struct { + uint64_t size; + uint64_t offset; + } *region; + int num_regions; + + int32_t *irqfd; + int irq_count; +}; + +int uml_vfio_user_open_container(void); +int uml_vfio_user_setup_iommu(int container); + +int uml_vfio_user_get_group_id(const char *device); +int uml_vfio_user_open_group(int group_id); +int uml_vfio_user_set_container(int container, int group); +int uml_vfio_user_unset_container(int container, int group); + +int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev, + int group, const char *device); +void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev); + +int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index); +void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index); +int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev); + +int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev, + unsigned int offset, void *buf, int size); +int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev, + unsigned int offset, const void *buf, int size); + +int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, void *buf, int size); +int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, const void *buf, int size); + +#endif /* __UM_VFIO_USER_H */ diff --git a/arch/um/drivers/vhost_user.h b/arch/um/drivers/vhost_user.h new file mode 100644 index 000000000000..fcfa3b7e021b --- /dev/null +++ b/arch/um/drivers/vhost_user.h @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Vhost-user protocol */ + +#ifndef __VHOST_USER_H__ +#define __VHOST_USER_H__ + +/* Message flags */ +#define VHOST_USER_FLAG_REPLY BIT(2) +#define VHOST_USER_FLAG_NEED_REPLY BIT(3) +/* Feature bits */ +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +/* Protocol feature bits */ +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 +#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14 +/* Vring state index masks */ +#define VHOST_USER_VRING_INDEX_MASK 0xff +#define VHOST_USER_VRING_POLL_MASK BIT(8) + +/* Supported version */ +#define VHOST_USER_VERSION 1 +/* Supported transport features */ +#define VHOST_USER_SUPPORTED_F BIT_ULL(VHOST_USER_F_PROTOCOL_FEATURES) +/* Supported protocol features */ +#define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_MQ) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) + +enum vhost_user_request { + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SEND_MTU = 20, + VHOST_USER_SET_SLAVE_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_VRING_KICK = 35, +}; + +enum vhost_user_slave_request { + VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, + VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, + VHOST_USER_SLAVE_VRING_CALL = 4, +}; + +struct vhost_user_header { + /* + * Use enum vhost_user_request for outgoing messages, + * uses enum vhost_user_slave_request for incoming ones. + */ + u32 request; + u32 flags; + u32 size; +} __packed; + +struct vhost_user_config { + u32 offset; + u32 size; + u32 flags; + u8 payload[]; /* Variable length */ +} __packed; + +struct vhost_user_vring_state { + u32 index; + u32 num; +} __packed; + +struct vhost_user_vring_addr { + u32 index; + u32 flags; + u64 desc, used, avail, log; +} __packed; + +struct vhost_user_mem_region { + u64 guest_addr; + u64 size; + u64 user_addr; + u64 mmap_offset; +} __packed; + +struct vhost_user_mem_regions { + u32 num; + u32 padding; + struct vhost_user_mem_region regions[2]; /* Currently supporting 2 */ +} __packed; + +union vhost_user_payload { + u64 integer; + struct vhost_user_config config; + struct vhost_user_vring_state vring_state; + struct vhost_user_vring_addr vring_addr; + struct vhost_user_mem_regions mem_regions; +}; + +struct vhost_user_msg { + struct vhost_user_header header; + union vhost_user_payload payload; +} __packed; + +#endif diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c new file mode 100644 index 000000000000..557d93aea00a --- /dev/null +++ b/arch/um/drivers/virt-pci.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/logic_iomem.h> +#include <linux/of_platform.h> +#include <linux/irqchip/irq-msi-lib.h> +#include <linux/irqdomain.h> +#include <linux/msi.h> +#include <linux/unaligned.h> +#include <irq_kern.h> + +#include "virt-pci.h" + +#define MAX_DEVICES 8 +#define MAX_MSI_VECTORS 32 +#define CFG_SPACE_SIZE 4096 + +struct um_pci_device_reg { + struct um_pci_device *dev; + void __iomem *iomem; +}; + +static struct pci_host_bridge *bridge; +static DEFINE_MUTEX(um_pci_mtx); +static struct um_pci_device *um_pci_platform_device; +static struct um_pci_device_reg um_pci_devices[MAX_DEVICES]; +static struct fwnode_handle *um_pci_fwnode; +static struct irq_domain *um_pci_inner_domain; +static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; + +static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, + int size) +{ + struct um_pci_device_reg *reg = priv; + struct um_pci_device *dev = reg->dev; + + if (!dev) + return ULONG_MAX; + + switch (size) { + case 1: + case 2: + case 4: +#ifdef CONFIG_64BIT + case 8: +#endif + break; + default: + WARN(1, "invalid config space read size %d\n", size); + return ULONG_MAX; + } + + return dev->ops->cfgspace_read(dev, offset, size); +} + +static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, + unsigned long val) +{ + struct um_pci_device_reg *reg = priv; + struct um_pci_device *dev = reg->dev; + + if (!dev) + return; + + switch (size) { + case 1: + case 2: + case 4: +#ifdef CONFIG_64BIT + case 8: +#endif + break; + default: + WARN(1, "invalid config space write size %d\n", size); + return; + } + + dev->ops->cfgspace_write(dev, offset, size, val); +} + +static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { + .read = um_pci_cfgspace_read, + .write = um_pci_cfgspace_write, +}; + +static unsigned long um_pci_bar_read(void *priv, unsigned int offset, + int size) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; + + switch (size) { + case 1: + case 2: + case 4: +#ifdef CONFIG_64BIT + case 8: +#endif + break; + default: + WARN(1, "invalid bar read size %d\n", size); + return ULONG_MAX; + } + + return dev->ops->bar_read(dev, bar, offset, size); +} + +static void um_pci_bar_write(void *priv, unsigned int offset, int size, + unsigned long val) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; + + switch (size) { + case 1: + case 2: + case 4: +#ifdef CONFIG_64BIT + case 8: +#endif + break; + default: + WARN(1, "invalid bar write size %d\n", size); + return; + } + + dev->ops->bar_write(dev, bar, offset, size, val); +} + +static void um_pci_bar_copy_from(void *priv, void *buffer, + unsigned int offset, int size) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; + + dev->ops->bar_copy_from(dev, bar, buffer, offset, size); +} + +static void um_pci_bar_copy_to(void *priv, unsigned int offset, + const void *buffer, int size) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; + + dev->ops->bar_copy_to(dev, bar, offset, buffer, size); +} + +static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; + + dev->ops->bar_set(dev, bar, offset, value, size); +} + +static const struct logic_iomem_ops um_pci_device_bar_ops = { + .read = um_pci_bar_read, + .write = um_pci_bar_write, + .set = um_pci_bar_set, + .copy_from = um_pci_bar_copy_from, + .copy_to = um_pci_bar_copy_to, +}; + +static void __iomem *um_pci_map_bus(struct pci_bus *bus, unsigned int devfn, + int where) +{ + struct um_pci_device_reg *dev; + unsigned int busn = bus->number; + + if (busn > 0) + return NULL; + + /* not allowing functions for now ... */ + if (devfn % 8) + return NULL; + + if (devfn / 8 >= ARRAY_SIZE(um_pci_devices)) + return NULL; + + dev = &um_pci_devices[devfn / 8]; + if (!dev) + return NULL; + + return (void __iomem *)((unsigned long)dev->iomem + where); +} + +static struct pci_ops um_pci_ops = { + .map_bus = um_pci_map_bus, + .read = pci_generic_config_read, + .write = pci_generic_config_write, +}; + +static void um_pci_rescan(void) +{ + pci_lock_rescan_remove(); + pci_rescan_bus(bridge->bus); + pci_unlock_rescan_remove(); +} + +#ifdef CONFIG_OF +/* Copied from arch/x86/kernel/devicetree.c */ +struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) +{ + struct device_node *np; + + for_each_node_by_type(np, "pci") { + const void *prop; + unsigned int bus_min; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + if (bus->number == bus_min) + return np; + } + return NULL; +} +#endif + +static struct resource virt_cfgspace_resource = { + .name = "PCI config space", + .start = 0xf0000000 - MAX_DEVICES * CFG_SPACE_SIZE, + .end = 0xf0000000 - 1, + .flags = IORESOURCE_MEM, +}; + +static long um_pci_map_cfgspace(unsigned long offset, size_t size, + const struct logic_iomem_ops **ops, + void **priv) +{ + if (WARN_ON(size > CFG_SPACE_SIZE || offset % CFG_SPACE_SIZE)) + return -EINVAL; + + if (offset / CFG_SPACE_SIZE < MAX_DEVICES) { + *ops = &um_pci_device_cfgspace_ops; + *priv = &um_pci_devices[offset / CFG_SPACE_SIZE]; + return 0; + } + + WARN(1, "cannot map offset 0x%lx/0x%zx\n", offset, size); + return -ENOENT; +} + +static const struct logic_iomem_region_ops um_pci_cfgspace_ops = { + .map = um_pci_map_cfgspace, +}; + +static struct resource virt_iomem_resource = { + .name = "PCI iomem", + .start = 0xf0000000, + .end = 0xffffffff, + .flags = IORESOURCE_MEM, +}; + +struct um_pci_map_iomem_data { + unsigned long offset; + size_t size; + const struct logic_iomem_ops **ops; + void **priv; + long ret; +}; + +static int um_pci_map_iomem_walk(struct pci_dev *pdev, void *_data) +{ + struct um_pci_map_iomem_data *data = _data; + struct um_pci_device_reg *reg = &um_pci_devices[pdev->devfn / 8]; + struct um_pci_device *dev; + int i; + + if (!reg->dev) + return 0; + + for (i = 0; i < ARRAY_SIZE(dev->resptr); i++) { + struct resource *r = &pdev->resource[i]; + + if ((r->flags & IORESOURCE_TYPE_BITS) != IORESOURCE_MEM) + continue; + + /* + * must be the whole or part of the resource, + * not allowed to only overlap + */ + if (data->offset < r->start || data->offset > r->end) + continue; + if (data->offset + data->size - 1 > r->end) + continue; + + dev = reg->dev; + *data->ops = &um_pci_device_bar_ops; + dev->resptr[i] = i; + *data->priv = &dev->resptr[i]; + data->ret = data->offset - r->start; + + /* no need to continue */ + return 1; + } + + return 0; +} + +static long um_pci_map_iomem(unsigned long offset, size_t size, + const struct logic_iomem_ops **ops, + void **priv) +{ + struct um_pci_map_iomem_data data = { + /* we want the full address here */ + .offset = offset + virt_iomem_resource.start, + .size = size, + .ops = ops, + .priv = priv, + .ret = -ENOENT, + }; + + pci_walk_bus(bridge->bus, um_pci_map_iomem_walk, &data); + return data.ret; +} + +static const struct logic_iomem_region_ops um_pci_iomem_ops = { + .map = um_pci_map_iomem, +}; + +static void um_pci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + /* + * This is a very low address and not actually valid 'physical' memory + * in UML, so we can simply map MSI(-X) vectors to there, it cannot be + * legitimately written to by the device in any other way. + * We use the (virtual) IRQ number here as the message to simplify the + * code that receives the message, where for now we simply trust the + * device to send the correct message. + */ + msg->address_hi = 0; + msg->address_lo = 0xa0000; + msg->data = data->irq; +} + +static struct irq_chip um_pci_msi_bottom_irq_chip = { + .name = "UM virtual MSI", + .irq_compose_msi_msg = um_pci_compose_msi_msg, +}; + +static int um_pci_inner_domain_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *args) +{ + unsigned long bit; + + WARN_ON(nr_irqs != 1); + + mutex_lock(&um_pci_mtx); + bit = find_first_zero_bit(um_pci_msi_used, MAX_MSI_VECTORS); + if (bit >= MAX_MSI_VECTORS) { + mutex_unlock(&um_pci_mtx); + return -ENOSPC; + } + + set_bit(bit, um_pci_msi_used); + mutex_unlock(&um_pci_mtx); + + irq_domain_set_info(domain, virq, bit, &um_pci_msi_bottom_irq_chip, + domain->host_data, handle_simple_irq, + NULL, NULL); + + return 0; +} + +static void um_pci_inner_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + + mutex_lock(&um_pci_mtx); + + if (!test_bit(d->hwirq, um_pci_msi_used)) + pr_err("trying to free unused MSI#%lu\n", d->hwirq); + else + __clear_bit(d->hwirq, um_pci_msi_used); + + mutex_unlock(&um_pci_mtx); +} + +static const struct irq_domain_ops um_pci_inner_domain_ops = { + .select = msi_lib_irq_domain_select, + .alloc = um_pci_inner_domain_alloc, + .free = um_pci_inner_domain_free, +}; + +#define UM_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_NO_AFFINITY) +#define UM_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_PCI_MSIX) + +static const struct msi_parent_ops um_pci_msi_parent_ops = { + .required_flags = UM_PCI_MSI_FLAGS_REQUIRED, + .supported_flags = UM_PCI_MSI_FLAGS_SUPPORTED, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "UM-virtual-", + .init_dev_msi_info = msi_lib_init_dev_msi_info, +}; + +static struct resource busn_resource = { + .name = "PCI busn", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUS, +}; + +static int um_pci_map_irq(const struct pci_dev *pdev, u8 slot, u8 pin) +{ + struct um_pci_device_reg *reg = &um_pci_devices[pdev->devfn / 8]; + + if (WARN_ON(!reg->dev)) + return -EINVAL; + + /* Yes, we map all pins to the same IRQ ... doesn't matter for now. */ + return reg->dev->irq; +} + +void *pci_root_bus_fwnode(struct pci_bus *bus) +{ + return um_pci_fwnode; +} + +static long um_pci_map_platform(unsigned long offset, size_t size, + const struct logic_iomem_ops **ops, + void **priv) +{ + if (!um_pci_platform_device) + return -ENOENT; + + *ops = &um_pci_device_bar_ops; + *priv = &um_pci_platform_device->resptr[0]; + + return offset; +} + +static const struct logic_iomem_region_ops um_pci_platform_ops = { + .map = um_pci_map_platform, +}; + +static struct resource virt_platform_resource = { + .name = "platform", + .start = 0x10000000, + .end = 0x1fffffff, + .flags = IORESOURCE_MEM, +}; + +int um_pci_device_register(struct um_pci_device *dev) +{ + int i, free = -1; + int err = 0; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev) + continue; + free = i; + break; + } + + if (free < 0) { + err = -ENOSPC; + goto out; + } + + dev->irq = irq_alloc_desc(numa_node_id()); + if (dev->irq < 0) { + err = dev->irq; + goto out; + } + + um_pci_devices[free].dev = dev; + +out: + mutex_unlock(&um_pci_mtx); + if (!err) + um_pci_rescan(); + return err; +} + +void um_pci_device_unregister(struct um_pci_device *dev) +{ + int i; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev != dev) + continue; + um_pci_devices[i].dev = NULL; + irq_free_desc(dev->irq); + break; + } + mutex_unlock(&um_pci_mtx); + + if (i < MAX_DEVICES) { + struct pci_dev *pci_dev; + + pci_dev = pci_get_slot(bridge->bus, i); + if (pci_dev) + pci_stop_and_remove_bus_device_locked(pci_dev); + } +} + +int um_pci_platform_device_register(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device) + return -EBUSY; + um_pci_platform_device = dev; + return 0; +} + +void um_pci_platform_device_unregister(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device == dev) + um_pci_platform_device = NULL; +} + +static int __init um_pci_init(void) +{ + int err, i; + + WARN_ON(logic_iomem_add_region(&virt_cfgspace_resource, + &um_pci_cfgspace_ops)); + WARN_ON(logic_iomem_add_region(&virt_iomem_resource, + &um_pci_iomem_ops)); + WARN_ON(logic_iomem_add_region(&virt_platform_resource, + &um_pci_platform_ops)); + + bridge = pci_alloc_host_bridge(0); + if (!bridge) { + err = -ENOMEM; + goto free; + } + + um_pci_fwnode = irq_domain_alloc_named_fwnode("um-pci"); + if (!um_pci_fwnode) { + err = -ENOMEM; + goto free; + } + + struct irq_domain_info info = { + .fwnode = um_pci_fwnode, + .ops = &um_pci_inner_domain_ops, + .size = MAX_MSI_VECTORS, + }; + + um_pci_inner_domain = msi_create_parent_irq_domain(&info, &um_pci_msi_parent_ops); + if (!um_pci_inner_domain) { + err = -ENOMEM; + goto free; + } + + pci_add_resource(&bridge->windows, &virt_iomem_resource); + pci_add_resource(&bridge->windows, &busn_resource); + bridge->ops = &um_pci_ops; + bridge->map_irq = um_pci_map_irq; + + for (i = 0; i < MAX_DEVICES; i++) { + resource_size_t start; + + start = virt_cfgspace_resource.start + i * CFG_SPACE_SIZE; + um_pci_devices[i].iomem = ioremap(start, CFG_SPACE_SIZE); + if (WARN(!um_pci_devices[i].iomem, "failed to map %d\n", i)) { + err = -ENOMEM; + goto free; + } + } + + err = pci_host_probe(bridge); + if (err) + goto free; + + return 0; + +free: + if (um_pci_inner_domain) + irq_domain_remove(um_pci_inner_domain); + if (um_pci_fwnode) + irq_domain_free_fwnode(um_pci_fwnode); + if (bridge) { + pci_free_resource_list(&bridge->windows); + pci_free_host_bridge(bridge); + } + return err; +} +device_initcall(um_pci_init); + +static void __exit um_pci_exit(void) +{ + irq_domain_remove(um_pci_inner_domain); + pci_free_resource_list(&bridge->windows); + pci_free_host_bridge(bridge); +} +module_exit(um_pci_exit); diff --git a/arch/um/drivers/virt-pci.h b/arch/um/drivers/virt-pci.h new file mode 100644 index 000000000000..b20d1475d1eb --- /dev/null +++ b/arch/um/drivers/virt-pci.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_VIRT_PCI_H +#define __UM_VIRT_PCI_H + +#include <linux/pci.h> + +struct um_pci_device { + const struct um_pci_ops *ops; + + /* for now just standard BARs */ + u8 resptr[PCI_STD_NUM_BARS]; + + int irq; +}; + +struct um_pci_ops { + unsigned long (*cfgspace_read)(struct um_pci_device *dev, + unsigned int offset, int size); + void (*cfgspace_write)(struct um_pci_device *dev, unsigned int offset, + int size, unsigned long val); + + unsigned long (*bar_read)(struct um_pci_device *dev, int bar, + unsigned int offset, int size); + void (*bar_write)(struct um_pci_device *dev, int bar, + unsigned int offset, int size, unsigned long val); + + void (*bar_copy_from)(struct um_pci_device *dev, int bar, void *buffer, + unsigned int offset, int size); + void (*bar_copy_to)(struct um_pci_device *dev, int bar, + unsigned int offset, const void *buffer, int size); + void (*bar_set)(struct um_pci_device *dev, int bar, + unsigned int offset, u8 value, int size); +}; + +int um_pci_device_register(struct um_pci_device *dev); +void um_pci_device_unregister(struct um_pci_device *dev); + +int um_pci_platform_device_register(struct um_pci_device *dev); +void um_pci_platform_device_unregister(struct um_pci_device *dev); + +#endif /* __UM_VIRT_PCI_H */ diff --git a/arch/um/drivers/virtio_pcidev.c b/arch/um/drivers/virtio_pcidev.c new file mode 100644 index 000000000000..f9b4b6f7582c --- /dev/null +++ b/arch/um/drivers/virtio_pcidev.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/logic_iomem.h> +#include <linux/of_platform.h> +#include <linux/irqdomain.h> +#include <linux/virtio_pcidev.h> +#include <linux/virtio-uml.h> +#include <linux/delay.h> +#include <linux/msi.h> +#include <linux/unaligned.h> +#include <irq_kern.h> + +#include "virt-pci.h" + +#define to_virtio_pcidev(_pdev) \ + container_of(_pdev, struct virtio_pcidev_device, pdev) + +/* for MSI-X we have a 32-bit payload */ +#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) +#define NUM_IRQ_MSGS 10 + +struct virtio_pcidev_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; + +struct virtio_pcidev_device { + struct um_pci_device pdev; + struct virtio_device *vdev; + + struct virtqueue *cmd_vq, *irq_vq; + +#define VIRTIO_PCIDEV_WRITE_BUFS 20 + struct virtio_pcidev_message_buffer bufs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + void *extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + DECLARE_BITMAP(used_bufs, VIRTIO_PCIDEV_WRITE_BUFS); + +#define VIRTIO_PCIDEV_STAT_WAITING 0 + unsigned long status; + + bool platform; +}; + +static unsigned int virtio_pcidev_max_delay_us = 40000; +module_param_named(max_delay_us, virtio_pcidev_max_delay_us, uint, 0644); + +static int virtio_pcidev_get_buf(struct virtio_pcidev_device *dev, bool *posted) +{ + int i; + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (!test_and_set_bit(i, dev->used_bufs)) + return i; + } + + *posted = false; + return VIRTIO_PCIDEV_WRITE_BUFS; +} + +static void virtio_pcidev_free_buf(struct virtio_pcidev_device *dev, void *buf) +{ + int i; + + if (buf == &dev->bufs[VIRTIO_PCIDEV_WRITE_BUFS]) { + kfree(dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS]); + dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS] = NULL; + return; + } + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (buf == &dev->bufs[i]) { + kfree(dev->extra_ptrs[i]); + dev->extra_ptrs[i] = NULL; + WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); + return; + } + } + + WARN_ON(1); +} + +static int virtio_pcidev_send_cmd(struct virtio_pcidev_device *dev, + struct virtio_pcidev_msg *cmd, + unsigned int cmd_size, + const void *extra, unsigned int extra_size, + void *out, unsigned int out_size) +{ + struct scatterlist out_sg, extra_sg, in_sg; + struct scatterlist *sgs_list[] = { + [0] = &out_sg, + [1] = extra ? &extra_sg : &in_sg, + [2] = extra ? &in_sg : NULL, + }; + struct virtio_pcidev_message_buffer *buf; + int delay_count = 0; + bool bounce_out; + int ret, len; + int buf_idx; + bool posted; + + if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) + return -EINVAL; + + switch (cmd->op) { + case VIRTIO_PCIDEV_OP_CFG_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_MEMSET: + /* in PCI, writes are posted, so don't wait */ + posted = !out; + WARN_ON(!posted); + break; + default: + posted = false; + break; + } + + bounce_out = !posted && cmd_size <= sizeof(*cmd) && + out && out_size <= sizeof(buf->data); + + buf_idx = virtio_pcidev_get_buf(dev, &posted); + buf = &dev->bufs[buf_idx]; + memcpy(buf, cmd, cmd_size); + + if (posted && extra && extra_size > sizeof(buf) - cmd_size) { + dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, + GFP_ATOMIC); + + if (!dev->extra_ptrs[buf_idx]) { + virtio_pcidev_free_buf(dev, buf); + return -ENOMEM; + } + extra = dev->extra_ptrs[buf_idx]; + } else if (extra && extra_size <= sizeof(buf) - cmd_size) { + memcpy((u8 *)buf + cmd_size, extra, extra_size); + cmd_size += extra_size; + extra_size = 0; + extra = NULL; + cmd = (void *)buf; + } else { + cmd = (void *)buf; + } + + sg_init_one(&out_sg, cmd, cmd_size); + if (extra) + sg_init_one(&extra_sg, extra, extra_size); + /* allow stack for small buffers */ + if (bounce_out) + sg_init_one(&in_sg, buf->data, out_size); + else if (out) + sg_init_one(&in_sg, out, out_size); + + /* add to internal virtio queue */ + ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, + extra ? 2 : 1, + out ? 1 : 0, + cmd, GFP_ATOMIC); + if (ret) { + virtio_pcidev_free_buf(dev, buf); + return ret; + } + + if (posted) { + virtqueue_kick(dev->cmd_vq); + return 0; + } + + /* kick and poll for getting a response on the queue */ + set_bit(VIRTIO_PCIDEV_STAT_WAITING, &dev->status); + virtqueue_kick(dev->cmd_vq); + ret = 0; + + while (1) { + void *completed = virtqueue_get_buf(dev->cmd_vq, &len); + + if (completed == buf) + break; + + if (completed) + virtio_pcidev_free_buf(dev, completed); + + if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || + ++delay_count > virtio_pcidev_max_delay_us, + "um virt-pci delay: %d", delay_count)) { + ret = -EIO; + break; + } + udelay(1); + } + clear_bit(VIRTIO_PCIDEV_STAT_WAITING, &dev->status); + + if (bounce_out) + memcpy(out, buf->data, out_size); + + virtio_pcidev_free_buf(dev, buf); + + return ret; +} + +static unsigned long virtio_pcidev_cfgspace_read(struct um_pci_device *pdev, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_READ, + .size = size, + .addr = offset, + }; + /* max 8, we might not use it all */ + u8 data[8]; + + memset(data, 0xff, sizeof(data)); + + /* size has been checked in um_pci_cfgspace_read() */ + if (virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) + return ULONG_MAX; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_cfgspace_write(struct um_pci_device *pdev, + unsigned int offset, int size, + unsigned long val) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + /* maximum size - we may only use parts of it */ + u8 data[8]; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .size = size, + .addr = offset, + }, + }; + + /* size has been checked in um_pci_cfgspace_write() */ + switch (size) { + case 1: + msg.data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)msg.data); + break; + case 4: + put_unaligned_le32(val, (void *)msg.data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)msg.data); + break; +#endif + } + + WARN_ON(virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); +} + +static void virtio_pcidev_bar_copy_from(struct um_pci_device *pdev, + int bar, void *buffer, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_READ, + .bar = bar, + .size = size, + .addr = offset, + }; + + memset(buffer, 0xff, size); + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); +} + +static unsigned long virtio_pcidev_bar_read(struct um_pci_device *pdev, int bar, + unsigned int offset, int size) +{ + /* 8 is maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_read() */ + virtio_pcidev_bar_copy_from(pdev, bar, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_bar_copy_to(struct um_pci_device *pdev, + int bar, unsigned int offset, + const void *buffer, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }; + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); +} + +static void virtio_pcidev_bar_write(struct um_pci_device *pdev, int bar, + unsigned int offset, int size, + unsigned long val) +{ + /* maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_write() */ + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + virtio_pcidev_bar_copy_to(pdev, bar, offset, data, size); +} + +static void virtio_pcidev_bar_set(struct um_pci_device *pdev, int bar, + unsigned int offset, u8 value, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + u8 data; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }, + .data = value, + }; + + virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); +} + +static const struct um_pci_ops virtio_pcidev_um_pci_ops = { + .cfgspace_read = virtio_pcidev_cfgspace_read, + .cfgspace_write = virtio_pcidev_cfgspace_write, + .bar_read = virtio_pcidev_bar_read, + .bar_write = virtio_pcidev_bar_write, + .bar_copy_from = virtio_pcidev_bar_copy_from, + .bar_copy_to = virtio_pcidev_bar_copy_to, + .bar_set = virtio_pcidev_bar_set, +}; + +static void virtio_pcidev_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); + if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) + kfree(buf); + else if (kick) + virtqueue_kick(vq); +} + +static void virtio_pcidev_handle_irq_message(struct virtqueue *vq, + struct virtio_pcidev_msg *msg) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + + if (!dev->pdev.irq) + return; + + /* we should properly chain interrupts, but on ARCH=um we don't care */ + + switch (msg->op) { + case VIRTIO_PCIDEV_OP_INT: + generic_handle_irq(dev->pdev.irq); + break; + case VIRTIO_PCIDEV_OP_MSI: + /* our MSI message is just the interrupt number */ + if (msg->size == sizeof(u32)) + generic_handle_irq(le32_to_cpup((void *)msg->data)); + else + generic_handle_irq(le16_to_cpup((void *)msg->data)); + break; + case VIRTIO_PCIDEV_OP_PME: + /* nothing to do - we already woke up due to the message */ + break; + default: + dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); + break; + } +} + +static void virtio_pcidev_cmd_vq_cb(struct virtqueue *vq) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + void *cmd; + int len; + + if (test_bit(VIRTIO_PCIDEV_STAT_WAITING, &dev->status)) + return; + + while ((cmd = virtqueue_get_buf(vq, &len))) + virtio_pcidev_free_buf(dev, cmd); +} + +static void virtio_pcidev_irq_vq_cb(struct virtqueue *vq) +{ + struct virtio_pcidev_msg *msg; + int len; + + while ((msg = virtqueue_get_buf(vq, &len))) { + if (len >= sizeof(*msg)) + virtio_pcidev_handle_irq_message(vq, msg); + + /* recycle the message buffer */ + virtio_pcidev_irq_vq_addbuf(vq, msg, true); + } +} + +static int virtio_pcidev_init_vqs(struct virtio_pcidev_device *dev) +{ + struct virtqueue_info vqs_info[] = { + { "cmd", virtio_pcidev_cmd_vq_cb }, + { "irq", virtio_pcidev_irq_vq_cb }, + }; + struct virtqueue *vqs[2]; + int err, i; + + err = virtio_find_vqs(dev->vdev, 2, vqs, vqs_info, NULL); + if (err) + return err; + + dev->cmd_vq = vqs[0]; + dev->irq_vq = vqs[1]; + + virtio_device_ready(dev->vdev); + + for (i = 0; i < NUM_IRQ_MSGS; i++) { + void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); + + if (msg) + virtio_pcidev_irq_vq_addbuf(dev->irq_vq, msg, false); + } + + virtqueue_kick(dev->irq_vq); + + return 0; +} + +static void __virtio_pcidev_virtio_platform_remove(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + um_pci_platform_device_unregister(&dev->pdev); + + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static int virtio_pcidev_virtio_platform_probe(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + int err; + + dev->platform = true; + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto err_free; + + err = um_pci_platform_device_register(&dev->pdev); + if (err) + goto err_reset; + + err = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); + if (err) + goto err_unregister; + + return 0; + +err_unregister: + um_pci_platform_device_unregister(&dev->pdev); +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static int virtio_pcidev_virtio_probe(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->vdev = vdev; + vdev->priv = dev; + + dev->pdev.ops = &virtio_pcidev_um_pci_ops; + + if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) + return virtio_pcidev_virtio_platform_probe(vdev, dev); + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto err_free; + + err = um_pci_device_register(&dev->pdev); + if (err) + goto err_reset; + + device_set_wakeup_enable(&vdev->dev, true); + + /* + * In order to do suspend-resume properly, don't allow VQs + * to be suspended. + */ + virtio_uml_set_no_vq_suspend(vdev, true); + + return 0; + +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static void virtio_pcidev_virtio_remove(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev = vdev->priv; + + if (dev->platform) { + of_platform_depopulate(&vdev->dev); + __virtio_pcidev_virtio_platform_remove(vdev, dev); + return; + } + + device_set_wakeup_enable(&vdev->dev, false); + + um_pci_device_unregister(&dev->pdev); + + /* Stop all virtqueues */ + virtio_reset_device(vdev); + dev->cmd_vq = NULL; + dev->irq_vq = NULL; + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static void virtio_pcidev_virtio_shutdown(struct virtio_device *vdev) +{ + /* nothing to do, we just don't want queue shutdown */ +} + +static struct virtio_device_id id_table[] = { + { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; +MODULE_DEVICE_TABLE(virtio, id_table); + +static struct virtio_driver virtio_pcidev_virtio_driver = { + .driver.name = "virtio-pci", + .id_table = id_table, + .probe = virtio_pcidev_virtio_probe, + .remove = virtio_pcidev_virtio_remove, + .shutdown = virtio_pcidev_virtio_shutdown, +}; + +static int __init virtio_pcidev_init(void) +{ + if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, + "No virtio device ID configured for PCI - no PCI support\n")) + return 0; + + return register_virtio_driver(&virtio_pcidev_virtio_driver); +} +late_initcall(virtio_pcidev_init); + +static void __exit virtio_pcidev_exit(void) +{ + unregister_virtio_driver(&virtio_pcidev_virtio_driver); +} +module_exit(virtio_pcidev_exit); diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c new file mode 100644 index 000000000000..6cf1152a1a4e --- /dev/null +++ b/arch/um/drivers/virtio_uml.c @@ -0,0 +1,1495 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio vhost-user driver + * + * Copyright(c) 2019 Intel Corporation + * + * This driver allows virtio devices to be used over a vhost-user socket. + * + * Guest devices can be instantiated by kernel module or command line + * parameters. One device will be created for each parameter. Syntax: + * + * virtio_uml.device=<socket>:<virtio_id>[:<platform_id>] + * where: + * <socket> := vhost-user socket path to connect + * <virtio_id> := virtio device id (as in virtio_ids.h) + * <platform_id> := (optional) platform device id + * + * example: + * virtio_uml.device=/var/uml.socket:1 + * + * Based on Virtio MMIO driver by Pawel Moll, copyright 2011-2014, ARM Ltd. + */ +#include <linux/module.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/string_choices.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/time-internal.h> +#include <linux/virtio-uml.h> +#include <shared/as-layout.h> +#include <irq_kern.h> +#include <init.h> +#include <os.h> +#include "vhost_user.h" + +#define MAX_SUPPORTED_QUEUE_SIZE 256 + +#define to_virtio_uml_device(_vdev) \ + container_of(_vdev, struct virtio_uml_device, vdev) + +struct virtio_uml_platform_data { + u32 virtio_device_id; + const char *socket_path; + struct work_struct conn_broken_wk; + struct platform_device *pdev; +}; + +struct virtio_uml_device { + struct virtio_device vdev; + struct platform_device *pdev; + struct virtio_uml_platform_data *pdata; + + raw_spinlock_t sock_lock; + int sock, req_fd, irq; + u64 features; + u64 protocol_features; + u64 max_vqs; + u8 status; + u8 registered:1; + u8 suspended:1; + u8 no_vq_suspend:1; + + u8 config_changed_irq:1; + uint64_t vq_irq_vq_map; + int recv_rc; +}; + +struct virtio_uml_vq_info { + int kick_fd, call_fd; + char name[32]; + bool suspended; +}; + +#define vu_err(vu_dev, ...) dev_err(&(vu_dev)->pdev->dev, ##__VA_ARGS__) + +/* Vhost-user protocol */ + +static int full_sendmsg_fds(int fd, const void *buf, unsigned int len, + const int *fds, unsigned int fds_num) +{ + int rc; + + do { + rc = os_sendmsg_fds(fd, buf, len, fds, fds_num); + if (rc > 0) { + buf += rc; + len -= rc; + fds = NULL; + fds_num = 0; + } + } while (len && (rc >= 0 || rc == -EINTR)); + + if (rc < 0) + return rc; + return 0; +} + +static int full_read(int fd, void *buf, int len, bool abortable) +{ + int rc; + + if (!len) + return 0; + + do { + rc = os_read_file(fd, buf, len); + if (rc > 0) { + buf += rc; + len -= rc; + } + } while (len && (rc > 0 || rc == -EINTR || (!abortable && rc == -EAGAIN))); + + if (rc < 0) + return rc; + if (rc == 0) + return -ECONNRESET; + return 0; +} + +static int vhost_user_recv_header(int fd, struct vhost_user_msg *msg) +{ + return full_read(fd, msg, sizeof(msg->header), true); +} + +static int vhost_user_recv(struct virtio_uml_device *vu_dev, + int fd, struct vhost_user_msg *msg, + size_t max_payload_size, bool wait) +{ + size_t size; + int rc; + + /* + * In virtio time-travel mode, we're handling all the vhost-user + * FDs by polling them whenever appropriate. However, we may get + * into a situation where we're sending out an interrupt message + * to a device (e.g. a net device) and need to handle a simulation + * time message while doing so, e.g. one that tells us to update + * our idea of how long we can run without scheduling. + * + * Thus, we need to not just read() from the given fd, but need + * to also handle messages for the simulation time - this function + * does that for us while waiting for the given fd to be readable. + */ + if (wait) + time_travel_wait_readable(fd); + + rc = vhost_user_recv_header(fd, msg); + + if (rc) + return rc; + size = msg->header.size; + if (size > max_payload_size) + return -EPROTO; + return full_read(fd, &msg->payload, size, false); +} + +static void vhost_user_check_reset(struct virtio_uml_device *vu_dev, + int rc) +{ + struct virtio_uml_platform_data *pdata = vu_dev->pdata; + + if (rc != -ECONNRESET) + return; + + if (!vu_dev->registered) + return; + + vu_dev->registered = 0; + + schedule_work(&pdata->conn_broken_wk); +} + +static int vhost_user_recv_resp(struct virtio_uml_device *vu_dev, + struct vhost_user_msg *msg, + size_t max_payload_size) +{ + int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, + max_payload_size, true); + + if (rc) { + vhost_user_check_reset(vu_dev, rc); + return rc; + } + + if (msg->header.flags != (VHOST_USER_FLAG_REPLY | VHOST_USER_VERSION)) + return -EPROTO; + + return 0; +} + +static int vhost_user_recv_u64(struct virtio_uml_device *vu_dev, + u64 *value) +{ + struct vhost_user_msg msg; + int rc = vhost_user_recv_resp(vu_dev, &msg, + sizeof(msg.payload.integer)); + + if (rc) + return rc; + if (msg.header.size != sizeof(msg.payload.integer)) + return -EPROTO; + *value = msg.payload.integer; + return 0; +} + +static int vhost_user_recv_req(struct virtio_uml_device *vu_dev, + struct vhost_user_msg *msg, + size_t max_payload_size) +{ + int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, + max_payload_size, false); + + if (rc) + return rc; + + if ((msg->header.flags & ~VHOST_USER_FLAG_NEED_REPLY) != + VHOST_USER_VERSION) + return -EPROTO; + + return 0; +} + +static int vhost_user_send(struct virtio_uml_device *vu_dev, + bool need_response, struct vhost_user_msg *msg, + int *fds, size_t num_fds) +{ + size_t size = sizeof(msg->header) + msg->header.size; + unsigned long flags; + bool request_ack; + int rc; + + msg->header.flags |= VHOST_USER_VERSION; + + /* + * The need_response flag indicates that we already need a response, + * e.g. to read the features. In these cases, don't request an ACK as + * it is meaningless. Also request an ACK only if supported. + */ + request_ack = !need_response; + if (!(vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK))) + request_ack = false; + + if (request_ack) + msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY; + + raw_spin_lock_irqsave(&vu_dev->sock_lock, flags); + rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds); + if (rc < 0) + goto out; + + if (request_ack) { + uint64_t status; + + rc = vhost_user_recv_u64(vu_dev, &status); + if (rc) + goto out; + + if (status) { + vu_err(vu_dev, "slave reports error: %llu\n", status); + rc = -EIO; + goto out; + } + } + +out: + raw_spin_unlock_irqrestore(&vu_dev->sock_lock, flags); + return rc; +} + +static int vhost_user_send_no_payload(struct virtio_uml_device *vu_dev, + bool need_response, u32 request) +{ + struct vhost_user_msg msg = { + .header.request = request, + }; + + return vhost_user_send(vu_dev, need_response, &msg, NULL, 0); +} + +static int vhost_user_send_no_payload_fd(struct virtio_uml_device *vu_dev, + u32 request, int fd) +{ + struct vhost_user_msg msg = { + .header.request = request, + }; + + return vhost_user_send(vu_dev, false, &msg, &fd, 1); +} + +static int vhost_user_send_u64(struct virtio_uml_device *vu_dev, + u32 request, u64 value) +{ + struct vhost_user_msg msg = { + .header.request = request, + .header.size = sizeof(msg.payload.integer), + .payload.integer = value, + }; + + return vhost_user_send(vu_dev, false, &msg, NULL, 0); +} + +static int vhost_user_set_owner(struct virtio_uml_device *vu_dev) +{ + return vhost_user_send_no_payload(vu_dev, false, VHOST_USER_SET_OWNER); +} + +static int vhost_user_get_features(struct virtio_uml_device *vu_dev, + u64 *features) +{ + int rc = vhost_user_send_no_payload(vu_dev, true, + VHOST_USER_GET_FEATURES); + + if (rc) + return rc; + return vhost_user_recv_u64(vu_dev, features); +} + +static int vhost_user_set_features(struct virtio_uml_device *vu_dev, + u64 features) +{ + return vhost_user_send_u64(vu_dev, VHOST_USER_SET_FEATURES, features); +} + +static int vhost_user_get_protocol_features(struct virtio_uml_device *vu_dev, + u64 *protocol_features) +{ + int rc = vhost_user_send_no_payload(vu_dev, true, + VHOST_USER_GET_PROTOCOL_FEATURES); + + if (rc) + return rc; + return vhost_user_recv_u64(vu_dev, protocol_features); +} + +static int vhost_user_set_protocol_features(struct virtio_uml_device *vu_dev, + u64 protocol_features) +{ + return vhost_user_send_u64(vu_dev, VHOST_USER_SET_PROTOCOL_FEATURES, + protocol_features); +} + +static int vhost_user_get_queue_num(struct virtio_uml_device *vu_dev, + u64 *queue_num) +{ + int rc = vhost_user_send_no_payload(vu_dev, true, + VHOST_USER_GET_QUEUE_NUM); + + if (rc) + return rc; + return vhost_user_recv_u64(vu_dev, queue_num); +} + +static void vhost_user_reply(struct virtio_uml_device *vu_dev, + struct vhost_user_msg *msg, int response) +{ + struct vhost_user_msg reply = { + .payload.integer = response, + }; + size_t size = sizeof(reply.header) + sizeof(reply.payload.integer); + int rc; + + reply.header = msg->header; + reply.header.flags &= ~VHOST_USER_FLAG_NEED_REPLY; + reply.header.flags |= VHOST_USER_FLAG_REPLY; + reply.header.size = sizeof(reply.payload.integer); + + rc = full_sendmsg_fds(vu_dev->req_fd, &reply, size, NULL, 0); + + if (rc) + vu_err(vu_dev, + "sending reply to slave request failed: %d (size %zu)\n", + rc, size); +} + +static irqreturn_t vu_req_read_message(struct virtio_uml_device *vu_dev, + struct time_travel_event *ev) +{ + struct virtqueue *vq; + int response = 1; + struct { + struct vhost_user_msg msg; + u8 extra_payload[512]; + } msg; + int rc; + irqreturn_t irq_rc = IRQ_NONE; + + while (1) { + rc = vhost_user_recv_req(vu_dev, &msg.msg, + sizeof(msg.msg.payload) + + sizeof(msg.extra_payload)); + if (rc) + break; + + switch (msg.msg.header.request) { + case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG: + vu_dev->config_changed_irq = true; + response = 0; + break; + case VHOST_USER_SLAVE_VRING_CALL: + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + if (vq->index == msg.msg.payload.vring_state.index) { + response = 0; + vu_dev->vq_irq_vq_map |= BIT_ULL(vq->index); + break; + } + } + break; + case VHOST_USER_SLAVE_IOTLB_MSG: + /* not supported - VIRTIO_F_ACCESS_PLATFORM */ + case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: + /* not supported - VHOST_USER_PROTOCOL_F_HOST_NOTIFIER */ + default: + vu_err(vu_dev, "unexpected slave request %d\n", + msg.msg.header.request); + } + + if (ev && !vu_dev->suspended) + time_travel_add_irq_event(ev); + + if (msg.msg.header.flags & VHOST_USER_FLAG_NEED_REPLY) + vhost_user_reply(vu_dev, &msg.msg, response); + irq_rc = IRQ_HANDLED; + } + /* mask EAGAIN as we try non-blocking read until socket is empty */ + vu_dev->recv_rc = (rc == -EAGAIN) ? 0 : rc; + return irq_rc; +} + +static irqreturn_t vu_req_interrupt(int irq, void *data) +{ + struct virtio_uml_device *vu_dev = data; + irqreturn_t ret = IRQ_HANDLED; + + if (!um_irq_timetravel_handler_used()) + ret = vu_req_read_message(vu_dev, NULL); + + if (vu_dev->recv_rc) { + vhost_user_check_reset(vu_dev, vu_dev->recv_rc); + } else if (vu_dev->vq_irq_vq_map) { + struct virtqueue *vq; + + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + if (vu_dev->vq_irq_vq_map & BIT_ULL(vq->index)) + vring_interrupt(0 /* ignored */, vq); + } + vu_dev->vq_irq_vq_map = 0; + } else if (vu_dev->config_changed_irq) { + virtio_config_changed(&vu_dev->vdev); + vu_dev->config_changed_irq = false; + } + + return ret; +} + +static void vu_req_interrupt_comm_handler(int irq, int fd, void *data, + struct time_travel_event *ev) +{ + vu_req_read_message(data, ev); +} + +static int vhost_user_init_slave_req(struct virtio_uml_device *vu_dev) +{ + int rc, req_fds[2]; + + /* Use a pipe for slave req fd, SIGIO is not supported for eventfd */ + rc = os_pipe(req_fds, true, true); + if (rc < 0) + return rc; + vu_dev->req_fd = req_fds[0]; + + rc = um_request_irq_tt(UM_IRQ_ALLOC, vu_dev->req_fd, IRQ_READ, + vu_req_interrupt, IRQF_SHARED, + vu_dev->pdev->name, vu_dev, + vu_req_interrupt_comm_handler); + if (rc < 0) + goto err_close; + + vu_dev->irq = rc; + + rc = vhost_user_send_no_payload_fd(vu_dev, VHOST_USER_SET_SLAVE_REQ_FD, + req_fds[1]); + if (rc) + goto err_free_irq; + + goto out; + +err_free_irq: + um_free_irq(vu_dev->irq, vu_dev); +err_close: + os_close_file(req_fds[0]); +out: + /* Close unused write end of request fds */ + os_close_file(req_fds[1]); + return rc; +} + +static int vhost_user_init(struct virtio_uml_device *vu_dev) +{ + int rc = vhost_user_set_owner(vu_dev); + + if (rc) + return rc; + rc = vhost_user_get_features(vu_dev, &vu_dev->features); + if (rc) + return rc; + + if (vu_dev->features & BIT_ULL(VHOST_USER_F_PROTOCOL_FEATURES)) { + rc = vhost_user_get_protocol_features(vu_dev, + &vu_dev->protocol_features); + if (rc) + return rc; + vu_dev->protocol_features &= VHOST_USER_SUPPORTED_PROTOCOL_F; + rc = vhost_user_set_protocol_features(vu_dev, + vu_dev->protocol_features); + if (rc) + return rc; + } + + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { + rc = vhost_user_init_slave_req(vu_dev); + if (rc) + return rc; + } + + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_MQ)) { + rc = vhost_user_get_queue_num(vu_dev, &vu_dev->max_vqs); + if (rc) + return rc; + } else { + vu_dev->max_vqs = U64_MAX; + } + + return 0; +} + +static void vhost_user_get_config(struct virtio_uml_device *vu_dev, + u32 offset, void *buf, u32 len) +{ + u32 cfg_size = offset + len; + struct vhost_user_msg *msg; + size_t payload_size = sizeof(msg->payload.config) + cfg_size; + size_t msg_size = sizeof(msg->header) + payload_size; + int rc; + + if (!(vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG))) + return; + + msg = kzalloc(msg_size, GFP_KERNEL); + if (!msg) + return; + msg->header.request = VHOST_USER_GET_CONFIG; + msg->header.size = payload_size; + msg->payload.config.offset = 0; + msg->payload.config.size = cfg_size; + + rc = vhost_user_send(vu_dev, true, msg, NULL, 0); + if (rc) { + vu_err(vu_dev, "sending VHOST_USER_GET_CONFIG failed: %d\n", + rc); + goto free; + } + + rc = vhost_user_recv_resp(vu_dev, msg, msg_size); + if (rc) { + vu_err(vu_dev, + "receiving VHOST_USER_GET_CONFIG response failed: %d\n", + rc); + goto free; + } + + if (msg->header.size != payload_size || + msg->payload.config.size != cfg_size) { + rc = -EPROTO; + vu_err(vu_dev, + "Invalid VHOST_USER_GET_CONFIG sizes (payload %d expected %zu, config %u expected %u)\n", + msg->header.size, payload_size, + msg->payload.config.size, cfg_size); + goto free; + } + memcpy(buf, msg->payload.config.payload + offset, len); + +free: + kfree(msg); +} + +static void vhost_user_set_config(struct virtio_uml_device *vu_dev, + u32 offset, const void *buf, u32 len) +{ + struct vhost_user_msg *msg; + size_t payload_size = sizeof(msg->payload.config) + len; + size_t msg_size = sizeof(msg->header) + payload_size; + int rc; + + if (!(vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG))) + return; + + msg = kzalloc(msg_size, GFP_KERNEL); + if (!msg) + return; + msg->header.request = VHOST_USER_SET_CONFIG; + msg->header.size = payload_size; + msg->payload.config.offset = offset; + msg->payload.config.size = len; + memcpy(msg->payload.config.payload, buf, len); + + rc = vhost_user_send(vu_dev, false, msg, NULL, 0); + if (rc) + vu_err(vu_dev, "sending VHOST_USER_SET_CONFIG failed: %d\n", + rc); + + kfree(msg); +} + +static int vhost_user_init_mem_region(u64 addr, u64 size, int *fd_out, + struct vhost_user_mem_region *region_out) +{ + unsigned long long mem_offset; + int rc = phys_mapping(addr, &mem_offset); + + if (WARN(rc < 0, "phys_mapping of 0x%llx returned %d\n", addr, rc)) + return -EFAULT; + *fd_out = rc; + region_out->guest_addr = addr; + region_out->user_addr = addr; + region_out->size = size; + region_out->mmap_offset = mem_offset; + + /* Ensure mapping is valid for the entire region */ + rc = phys_mapping(addr + size - 1, &mem_offset); + if (WARN(rc != *fd_out, "phys_mapping of 0x%llx failed: %d != %d\n", + addr + size - 1, rc, *fd_out)) + return -EFAULT; + return 0; +} + +static int vhost_user_set_mem_table(struct virtio_uml_device *vu_dev) +{ + struct vhost_user_msg msg = { + .header.request = VHOST_USER_SET_MEM_TABLE, + .header.size = offsetof(typeof(msg.payload.mem_regions), regions[1]), + .payload.mem_regions.num = 1, + }; + unsigned long reserved = uml_reserved - uml_physmem; + int fds[2]; + int rc; + + /* + * This is a bit tricky, see also the comment with setup_physmem(). + * + * Essentially, setup_physmem() uses a file to mmap() our physmem, + * but the code and data we *already* have is omitted. To us, this + * is no difference, since they both become part of our address + * space and memory consumption. To somebody looking in from the + * outside, however, it is different because the part of our memory + * consumption that's already part of the binary (code/data) is not + * mapped from the file, so it's not visible to another mmap from + * the file descriptor. + * + * Thus, don't advertise this space to the vhost-user slave. This + * means that the slave will likely abort or similar when we give + * it an address from the hidden range, since it's not marked as + * a valid address, but at least that way we detect the issue and + * don't just have the slave read an all-zeroes buffer from the + * shared memory file, or write something there that we can never + * see (depending on the direction of the virtqueue traffic.) + * + * Since we usually don't want to use .text for virtio buffers, + * this effectively means that you cannot use + * 1) global variables, which are in the .bss and not in the shm + * file-backed memory + * 2) the stack in some processes, depending on where they have + * their stack (or maybe only no interrupt stack?) + * + * The stack is already not typically valid for DMA, so this isn't + * much of a restriction, but global variables might be encountered. + * + * It might be possible to fix it by copying around the data that's + * between bss_start and where we map the file now, but it's not + * something that you typically encounter with virtio drivers, so + * it didn't seem worthwhile. + */ + rc = vhost_user_init_mem_region(reserved, physmem_size - reserved, + &fds[0], + &msg.payload.mem_regions.regions[0]); + + if (rc < 0) + return rc; + + return vhost_user_send(vu_dev, false, &msg, fds, + msg.payload.mem_regions.num); +} + +static int vhost_user_set_vring_state(struct virtio_uml_device *vu_dev, + u32 request, u32 index, u32 num) +{ + struct vhost_user_msg msg = { + .header.request = request, + .header.size = sizeof(msg.payload.vring_state), + .payload.vring_state.index = index, + .payload.vring_state.num = num, + }; + + return vhost_user_send(vu_dev, false, &msg, NULL, 0); +} + +static int vhost_user_set_vring_num(struct virtio_uml_device *vu_dev, + u32 index, u32 num) +{ + return vhost_user_set_vring_state(vu_dev, VHOST_USER_SET_VRING_NUM, + index, num); +} + +static int vhost_user_set_vring_base(struct virtio_uml_device *vu_dev, + u32 index, u32 offset) +{ + return vhost_user_set_vring_state(vu_dev, VHOST_USER_SET_VRING_BASE, + index, offset); +} + +static int vhost_user_set_vring_addr(struct virtio_uml_device *vu_dev, + u32 index, u64 desc, u64 used, u64 avail, + u64 log) +{ + struct vhost_user_msg msg = { + .header.request = VHOST_USER_SET_VRING_ADDR, + .header.size = sizeof(msg.payload.vring_addr), + .payload.vring_addr.index = index, + .payload.vring_addr.desc = desc, + .payload.vring_addr.used = used, + .payload.vring_addr.avail = avail, + .payload.vring_addr.log = log, + }; + + return vhost_user_send(vu_dev, false, &msg, NULL, 0); +} + +static int vhost_user_set_vring_fd(struct virtio_uml_device *vu_dev, + u32 request, int index, int fd) +{ + struct vhost_user_msg msg = { + .header.request = request, + .header.size = sizeof(msg.payload.integer), + .payload.integer = index, + }; + + if (index & ~VHOST_USER_VRING_INDEX_MASK) + return -EINVAL; + if (fd < 0) { + msg.payload.integer |= VHOST_USER_VRING_POLL_MASK; + return vhost_user_send(vu_dev, false, &msg, NULL, 0); + } + return vhost_user_send(vu_dev, false, &msg, &fd, 1); +} + +static int vhost_user_set_vring_call(struct virtio_uml_device *vu_dev, + int index, int fd) +{ + return vhost_user_set_vring_fd(vu_dev, VHOST_USER_SET_VRING_CALL, + index, fd); +} + +static int vhost_user_set_vring_kick(struct virtio_uml_device *vu_dev, + int index, int fd) +{ + return vhost_user_set_vring_fd(vu_dev, VHOST_USER_SET_VRING_KICK, + index, fd); +} + +static int vhost_user_set_vring_enable(struct virtio_uml_device *vu_dev, + u32 index, bool enable) +{ + if (!(vu_dev->features & BIT_ULL(VHOST_USER_F_PROTOCOL_FEATURES))) + return 0; + + return vhost_user_set_vring_state(vu_dev, VHOST_USER_SET_VRING_ENABLE, + index, enable); +} + + +/* Virtio interface */ + +static bool vu_notify(struct virtqueue *vq) +{ + struct virtio_uml_vq_info *info = vq->priv; + const uint64_t n = 1; + int rc; + + if (info->suspended) + return true; + + time_travel_propagate_time(); + + if (info->kick_fd < 0) { + struct virtio_uml_device *vu_dev; + + vu_dev = to_virtio_uml_device(vq->vdev); + + return vhost_user_set_vring_state(vu_dev, VHOST_USER_VRING_KICK, + vq->index, 0) == 0; + } + + do { + rc = os_write_file(info->kick_fd, &n, sizeof(n)); + } while (rc == -EINTR); + return !WARN(rc != sizeof(n), "write returned %d\n", rc); +} + +static irqreturn_t vu_interrupt(int irq, void *opaque) +{ + struct virtqueue *vq = opaque; + struct virtio_uml_vq_info *info = vq->priv; + uint64_t n; + int rc; + irqreturn_t ret = IRQ_NONE; + + do { + rc = os_read_file(info->call_fd, &n, sizeof(n)); + if (rc == sizeof(n)) + ret |= vring_interrupt(irq, vq); + } while (rc == sizeof(n) || rc == -EINTR); + WARN(rc != -EAGAIN, "read returned %d\n", rc); + return ret; +} + + +static void vu_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + vhost_user_get_config(vu_dev, offset, buf, len); +} + +static void vu_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + vhost_user_set_config(vu_dev, offset, buf, len); +} + +static u8 vu_get_status(struct virtio_device *vdev) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + return vu_dev->status; +} + +static void vu_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + vu_dev->status = status; +} + +static void vu_reset(struct virtio_device *vdev) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + vu_dev->status = 0; +} + +static void vu_del_vq(struct virtqueue *vq) +{ + struct virtio_uml_vq_info *info = vq->priv; + + if (info->call_fd >= 0) { + struct virtio_uml_device *vu_dev; + + vu_dev = to_virtio_uml_device(vq->vdev); + + um_free_irq(vu_dev->irq, vq); + os_close_file(info->call_fd); + } + + if (info->kick_fd >= 0) + os_close_file(info->kick_fd); + + vring_del_virtqueue(vq); + kfree(info); +} + +static void vu_del_vqs(struct virtio_device *vdev) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + struct virtqueue *vq, *n; + u64 features; + + /* Note: reverse order as a workaround to a decoding bug in snabb */ + list_for_each_entry_reverse(vq, &vdev->vqs, list) + WARN_ON(vhost_user_set_vring_enable(vu_dev, vq->index, false)); + + /* Ensure previous messages have been processed */ + WARN_ON(vhost_user_get_features(vu_dev, &features)); + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + vu_del_vq(vq); +} + +static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, + struct virtqueue *vq) +{ + struct virtio_uml_vq_info *info = vq->priv; + int call_fds[2]; + int rc, irq; + + /* no call FD needed/desired in this case */ + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { + info->call_fd = -1; + return 0; + } + + /* Use a pipe for call fd, since SIGIO is not supported for eventfd */ + rc = os_pipe(call_fds, true, true); + if (rc < 0) + return rc; + + info->call_fd = call_fds[0]; + irq = um_request_irq(vu_dev->irq, info->call_fd, IRQ_READ, + vu_interrupt, IRQF_SHARED, info->name, vq); + if (irq < 0) { + rc = irq; + goto close_both; + } + + rc = vhost_user_set_vring_call(vu_dev, vq->index, call_fds[1]); + if (rc) + goto release_irq; + + vu_dev->irq = irq; + + goto out; + +release_irq: + um_free_irq(irq, vq); +close_both: + os_close_file(call_fds[0]); +out: + /* Close (unused) write end of call fds */ + os_close_file(call_fds[1]); + + return rc; +} + +static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, + unsigned index, vq_callback_t *callback, + const char *name, bool ctx) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + struct platform_device *pdev = vu_dev->pdev; + struct virtio_uml_vq_info *info; + struct virtqueue *vq; + int num = MAX_SUPPORTED_QUEUE_SIZE; + int rc; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto error_kzalloc; + } + snprintf(info->name, sizeof(info->name), "%s.%d-%s", pdev->name, + pdev->id, name); + + vq = vring_create_virtqueue(index, num, PAGE_SIZE, vdev, true, true, + ctx, vu_notify, callback, info->name); + if (!vq) { + rc = -ENOMEM; + goto error_create; + } + vq->priv = info; + vq->num_max = num; + num = virtqueue_get_vring_size(vq); + + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) { + info->kick_fd = -1; + } else { + rc = os_eventfd(0, 0); + if (rc < 0) + goto error_kick; + info->kick_fd = rc; + } + + rc = vu_setup_vq_call_fd(vu_dev, vq); + if (rc) + goto error_call; + + rc = vhost_user_set_vring_num(vu_dev, index, num); + if (rc) + goto error_setup; + + rc = vhost_user_set_vring_base(vu_dev, index, 0); + if (rc) + goto error_setup; + + rc = vhost_user_set_vring_addr(vu_dev, index, + virtqueue_get_desc_addr(vq), + virtqueue_get_used_addr(vq), + virtqueue_get_avail_addr(vq), + (u64) -1); + if (rc) + goto error_setup; + + return vq; + +error_setup: + if (info->call_fd >= 0) { + um_free_irq(vu_dev->irq, vq); + os_close_file(info->call_fd); + } +error_call: + if (info->kick_fd >= 0) + os_close_file(info->kick_fd); +error_kick: + vring_del_virtqueue(vq); +error_create: + kfree(info); +error_kzalloc: + return ERR_PTR(rc); +} + +static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + struct virtqueue_info vqs_info[], + struct irq_affinity *desc) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + int i, queue_idx = 0, rc; + struct virtqueue *vq; + + /* not supported for now */ + if (WARN(nvqs > 64 || nvqs > vu_dev->max_vqs, + "%d VQs requested, only up to 64 or %lld supported\n", + nvqs, vu_dev->max_vqs)) + return -EINVAL; + + rc = vhost_user_set_mem_table(vu_dev); + if (rc) + return rc; + + for (i = 0; i < nvqs; ++i) { + struct virtqueue_info *vqi = &vqs_info[i]; + + if (!vqi->name) { + vqs[i] = NULL; + continue; + } + + vqs[i] = vu_setup_vq(vdev, queue_idx++, vqi->callback, + vqi->name, vqi->ctx); + if (IS_ERR(vqs[i])) { + rc = PTR_ERR(vqs[i]); + goto error_setup; + } + } + + list_for_each_entry(vq, &vdev->vqs, list) { + struct virtio_uml_vq_info *info = vq->priv; + + if (info->kick_fd >= 0) { + rc = vhost_user_set_vring_kick(vu_dev, vq->index, + info->kick_fd); + if (rc) + goto error_setup; + } + + rc = vhost_user_set_vring_enable(vu_dev, vq->index, true); + if (rc) + goto error_setup; + } + + return 0; + +error_setup: + vu_del_vqs(vdev); + return rc; +} + +static u64 vu_get_features(struct virtio_device *vdev) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + return vu_dev->features; +} + +static int vu_finalize_features(struct virtio_device *vdev) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + u64 supported = vdev->features & VHOST_USER_SUPPORTED_F; + + vring_transport_features(vdev); + vu_dev->features = vdev->features | supported; + + return vhost_user_set_features(vu_dev, vu_dev->features); +} + +static const char *vu_bus_name(struct virtio_device *vdev) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + return vu_dev->pdev->name; +} + +static const struct virtio_config_ops virtio_uml_config_ops = { + .get = vu_get, + .set = vu_set, + .get_status = vu_get_status, + .set_status = vu_set_status, + .reset = vu_reset, + .find_vqs = vu_find_vqs, + .del_vqs = vu_del_vqs, + .get_features = vu_get_features, + .finalize_features = vu_finalize_features, + .bus_name = vu_bus_name, +}; + +static void virtio_uml_release_dev(struct device *d) +{ + struct virtio_device *vdev = + container_of(d, struct virtio_device, dev); + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + time_travel_propagate_time(); + + /* might not have been opened due to not negotiating the feature */ + if (vu_dev->req_fd >= 0) { + um_free_irq(vu_dev->irq, vu_dev); + os_close_file(vu_dev->req_fd); + } + + os_close_file(vu_dev->sock); + kfree(vu_dev); +} + +void virtio_uml_set_no_vq_suspend(struct virtio_device *vdev, + bool no_vq_suspend) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + if (WARN_ON(vdev->config != &virtio_uml_config_ops)) + return; + + vu_dev->no_vq_suspend = no_vq_suspend; + dev_info(&vdev->dev, "%s VQ suspend\n", str_disabled_enabled(no_vq_suspend)); +} + +static void vu_of_conn_broken(struct work_struct *wk) +{ + struct virtio_uml_platform_data *pdata; + struct virtio_uml_device *vu_dev; + + pdata = container_of(wk, struct virtio_uml_platform_data, conn_broken_wk); + + vu_dev = platform_get_drvdata(pdata->pdev); + + virtio_break_device(&vu_dev->vdev); + + /* + * We can't remove the device from the devicetree so the only thing we + * can do is warn. + */ + WARN_ON(1); +} + +/* Platform device */ + +static struct virtio_uml_platform_data * +virtio_uml_create_pdata(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct virtio_uml_platform_data *pdata; + int ret; + + if (!np) + return ERR_PTR(-EINVAL); + + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&pdata->conn_broken_wk, vu_of_conn_broken); + pdata->pdev = pdev; + + ret = of_property_read_string(np, "socket-path", &pdata->socket_path); + if (ret) + return ERR_PTR(ret); + + ret = of_property_read_u32(np, "virtio-device-id", + &pdata->virtio_device_id); + if (ret) + return ERR_PTR(ret); + + return pdata; +} + +static int virtio_uml_probe(struct platform_device *pdev) +{ + struct virtio_uml_platform_data *pdata = pdev->dev.platform_data; + struct virtio_uml_device *vu_dev; + int rc; + + if (!pdata) { + pdata = virtio_uml_create_pdata(pdev); + if (IS_ERR(pdata)) + return PTR_ERR(pdata); + } + + vu_dev = kzalloc(sizeof(*vu_dev), GFP_KERNEL); + if (!vu_dev) + return -ENOMEM; + + vu_dev->pdata = pdata; + vu_dev->vdev.dev.parent = &pdev->dev; + vu_dev->vdev.dev.release = virtio_uml_release_dev; + vu_dev->vdev.config = &virtio_uml_config_ops; + vu_dev->vdev.id.device = pdata->virtio_device_id; + vu_dev->vdev.id.vendor = VIRTIO_DEV_ANY_ID; + vu_dev->pdev = pdev; + vu_dev->req_fd = -1; + vu_dev->irq = UM_IRQ_ALLOC; + + time_travel_propagate_time(); + + do { + rc = os_connect_socket(pdata->socket_path); + } while (rc == -EINTR); + if (rc < 0) + goto error_free; + vu_dev->sock = rc; + + raw_spin_lock_init(&vu_dev->sock_lock); + + rc = vhost_user_init(vu_dev); + if (rc) + goto error_init; + + platform_set_drvdata(pdev, vu_dev); + + device_set_wakeup_capable(&vu_dev->vdev.dev, true); + + rc = register_virtio_device(&vu_dev->vdev); + if (rc) { + put_device(&vu_dev->vdev.dev); + return rc; + } + vu_dev->registered = 1; + return 0; + +error_init: + os_close_file(vu_dev->sock); +error_free: + kfree(vu_dev); + return rc; +} + +static void virtio_uml_remove(struct platform_device *pdev) +{ + struct virtio_uml_device *vu_dev = platform_get_drvdata(pdev); + + unregister_virtio_device(&vu_dev->vdev); +} + +/* Command line device list */ + +static void vu_cmdline_release_dev(struct device *d) +{ +} + +static struct device vu_cmdline_parent = { + .init_name = "virtio-uml-cmdline", + .release = vu_cmdline_release_dev, +}; + +static bool vu_cmdline_parent_registered; +static int vu_cmdline_id; + +static int vu_unregister_cmdline_device(struct device *dev, void *data) +{ + struct platform_device *pdev = to_platform_device(dev); + struct virtio_uml_platform_data *pdata = pdev->dev.platform_data; + + kfree(pdata->socket_path); + platform_device_unregister(pdev); + return 0; +} + +static void vu_conn_broken(struct work_struct *wk) +{ + struct virtio_uml_platform_data *pdata; + struct virtio_uml_device *vu_dev; + + pdata = container_of(wk, struct virtio_uml_platform_data, conn_broken_wk); + + vu_dev = platform_get_drvdata(pdata->pdev); + + virtio_break_device(&vu_dev->vdev); + + vu_unregister_cmdline_device(&pdata->pdev->dev, NULL); +} + +static int vu_cmdline_set(const char *device, const struct kernel_param *kp) +{ + const char *ids = strchr(device, ':'); + unsigned int virtio_device_id; + int processed, consumed, err; + char *socket_path; + struct virtio_uml_platform_data pdata, *ppdata; + struct platform_device *pdev; + + if (!ids || ids == device) + return -EINVAL; + + processed = sscanf(ids, ":%u%n:%d%n", + &virtio_device_id, &consumed, + &vu_cmdline_id, &consumed); + + if (processed < 1 || ids[consumed]) + return -EINVAL; + + if (!vu_cmdline_parent_registered) { + err = device_register(&vu_cmdline_parent); + if (err) { + pr_err("Failed to register parent device!\n"); + put_device(&vu_cmdline_parent); + return err; + } + vu_cmdline_parent_registered = true; + } + + socket_path = kmemdup_nul(device, ids - device, GFP_KERNEL); + if (!socket_path) + return -ENOMEM; + + pdata.virtio_device_id = (u32) virtio_device_id; + pdata.socket_path = socket_path; + + pr_info("Registering device virtio-uml.%d id=%d at %s\n", + vu_cmdline_id, virtio_device_id, socket_path); + + pdev = platform_device_register_data(&vu_cmdline_parent, "virtio-uml", + vu_cmdline_id++, &pdata, + sizeof(pdata)); + err = PTR_ERR_OR_ZERO(pdev); + if (err) + goto free; + + ppdata = pdev->dev.platform_data; + ppdata->pdev = pdev; + INIT_WORK(&ppdata->conn_broken_wk, vu_conn_broken); + + return 0; + +free: + kfree(socket_path); + return err; +} + +static int vu_cmdline_get_device(struct device *dev, void *data) +{ + struct platform_device *pdev = to_platform_device(dev); + struct virtio_uml_platform_data *pdata = pdev->dev.platform_data; + char *buffer = data; + unsigned int len = strlen(buffer); + + snprintf(buffer + len, PAGE_SIZE - len, "%s:%d:%d\n", + pdata->socket_path, pdata->virtio_device_id, pdev->id); + return 0; +} + +static int vu_cmdline_get(char *buffer, const struct kernel_param *kp) +{ + buffer[0] = '\0'; + if (vu_cmdline_parent_registered) + device_for_each_child(&vu_cmdline_parent, buffer, + vu_cmdline_get_device); + return strlen(buffer) + 1; +} + +static const struct kernel_param_ops vu_cmdline_param_ops = { + .set = vu_cmdline_set, + .get = vu_cmdline_get, +}; + +device_param_cb(device, &vu_cmdline_param_ops, NULL, S_IRUSR); +__uml_help(vu_cmdline_param_ops, +"virtio_uml.device=<socket>:<virtio_id>[:<platform_id>]\n" +" Configure a virtio device over a vhost-user socket.\n" +" See virtio_ids.h for a list of possible virtio device id values.\n" +" Optionally use a specific platform_device id.\n\n" +); + + +static void vu_unregister_cmdline_devices(void) +{ + if (vu_cmdline_parent_registered) { + device_for_each_child(&vu_cmdline_parent, NULL, + vu_unregister_cmdline_device); + device_unregister(&vu_cmdline_parent); + vu_cmdline_parent_registered = false; + } +} + +/* Platform driver */ + +static const struct of_device_id virtio_uml_match[] = { + { .compatible = "virtio,uml", }, + { } +}; +MODULE_DEVICE_TABLE(of, virtio_uml_match); + +static int virtio_uml_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct virtio_uml_device *vu_dev = platform_get_drvdata(pdev); + + if (!vu_dev->no_vq_suspend) { + struct virtqueue *vq; + + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + struct virtio_uml_vq_info *info = vq->priv; + + info->suspended = true; + vhost_user_set_vring_enable(vu_dev, vq->index, false); + } + } + + if (!device_may_wakeup(&vu_dev->vdev.dev)) { + vu_dev->suspended = true; + return 0; + } + + return irq_set_irq_wake(vu_dev->irq, 1); +} + +static int virtio_uml_resume(struct platform_device *pdev) +{ + struct virtio_uml_device *vu_dev = platform_get_drvdata(pdev); + + if (!vu_dev->no_vq_suspend) { + struct virtqueue *vq; + + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + struct virtio_uml_vq_info *info = vq->priv; + + info->suspended = false; + vhost_user_set_vring_enable(vu_dev, vq->index, true); + } + } + + vu_dev->suspended = false; + + if (!device_may_wakeup(&vu_dev->vdev.dev)) + return 0; + + return irq_set_irq_wake(vu_dev->irq, 0); +} + +static struct platform_driver virtio_uml_driver = { + .probe = virtio_uml_probe, + .remove = virtio_uml_remove, + .driver = { + .name = "virtio-uml", + .of_match_table = virtio_uml_match, + }, + .suspend = virtio_uml_suspend, + .resume = virtio_uml_resume, +}; + +static int __init virtio_uml_init(void) +{ + return platform_driver_register(&virtio_uml_driver); +} + +static void __exit virtio_uml_exit(void) +{ + platform_driver_unregister(&virtio_uml_driver); + vu_unregister_cmdline_devices(); +} + +module_init(virtio_uml_init); +module_exit(virtio_uml_exit); +__uml_exitcall(virtio_uml_exit); + +MODULE_DESCRIPTION("UML driver for vhost-user virtio devices"); +MODULE_LICENSE("GPL"); diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c index 20e30be44795..d05918e422f9 100644 --- a/arch/um/drivers/xterm.c +++ b/arch/um/drivers/xterm.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stddef.h> @@ -18,6 +18,7 @@ struct xterm_chan { int pid; int helper_pid; + int chan_fd; char *title; int device; int raw; @@ -33,6 +34,7 @@ static void *xterm_init(char *str, int device, const struct chan_opts *opts) return NULL; *data = ((struct xterm_chan) { .pid = -1, .helper_pid = -1, + .chan_fd = -1, .device = device, .title = opts->xterm_title, .raw = opts->raw } ); @@ -40,7 +42,7 @@ static void *xterm_init(char *str, int device, const struct chan_opts *opts) } /* Only changed by xterm_setup, which is a setup */ -static char *terminal_emulator = "xterm"; +static char *terminal_emulator = CONFIG_XTERM_CHAN_DEFAULT_EMULATOR; static char *title_switch = "-T"; static char *exec_switch = "-e"; @@ -77,8 +79,9 @@ __uml_setup("xterm=", xterm_setup, " respectively. The title switch must have the form '<switch> title',\n" " not '<switch>=title'. Similarly, the exec switch must have the form\n" " '<switch> command arg1 arg2 ...'.\n" -" The default values are 'xterm=xterm,-T,-e'. Values for gnome-terminal\n" -" are 'xterm=gnome-terminal,-t,-x'.\n\n" +" The default values are 'xterm=" CONFIG_XTERM_CHAN_DEFAULT_EMULATOR + ",-T,-e'.\n" +" Values for gnome-terminal are 'xterm=gnome-terminal,-t,--'.\n\n" ); static int xterm_open(int input, int output, int primary, void *d, @@ -94,12 +97,9 @@ static int xterm_open(int input, int output, int primary, void *d, if (access(argv[4], X_OK) < 0) argv[4] = "port-helper"; - /* - * Check that DISPLAY is set, this doesn't guarantee the xterm - * will work but w/o it we can be pretty sure it won't. - */ - if (getenv("DISPLAY") == NULL) { - printk(UM_KERN_ERR "xterm_open: $DISPLAY not set.\n"); + /* Ensure we are running on Xorg or Wayland. */ + if (!getenv("DISPLAY") && !getenv("WAYLAND_DISPLAY")) { + printk(UM_KERN_ERR "xterm_open : neither $DISPLAY nor $WAYLAND_DISPLAY is set.\n"); return -ENODEV; } @@ -149,10 +149,11 @@ static int xterm_open(int input, int output, int primary, void *d, goto out_kill; } + data->chan_fd = fd; new = xterm_fd(fd, &data->helper_pid); if (new < 0) { err = new; - printk(UM_KERN_ERR "xterm_open : os_rcv_fd failed, err = %d\n", + printk(UM_KERN_ERR "xterm_open : xterm_fd failed, err = %d\n", -err); goto out_kill; } @@ -206,6 +207,8 @@ static void xterm_close(int fd, void *d) os_kill_process(data->helper_pid, 0); data->helper_pid = -1; + if (data->chan_fd != -1) + os_close_file(data->chan_fd); os_close_file(fd); } diff --git a/arch/um/drivers/xterm.h b/arch/um/drivers/xterm.h index 56b9c4aba423..5968da3a6aba 100644 --- a/arch/um/drivers/xterm.h +++ b/arch/um/drivers/xterm.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __XTERM_H__ diff --git a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c index e8f9957bfbf6..3971252cb1a6 100644 --- a/arch/um/drivers/xterm_kern.c +++ b/arch/um/drivers/xterm_kern.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/slab.h> @@ -9,6 +9,7 @@ #include <asm/irq.h> #include <irq_kern.h> #include <os.h> +#include "xterm.h" struct xterm_wait { struct completion ready; @@ -20,12 +21,19 @@ struct xterm_wait { static irqreturn_t xterm_interrupt(int irq, void *data) { struct xterm_wait *xterm = data; - int fd; + int fd = -1, n_fds = 1; + ssize_t ret; - fd = os_rcv_fd(xterm->fd, &xterm->pid); - if (fd == -EAGAIN) + ret = os_rcv_fd_msg(xterm->fd, &fd, n_fds, + &xterm->pid, sizeof(xterm->pid)); + if (ret == -EAGAIN) return IRQ_NONE; + if (ret < 0) + fd = ret; + else if (ret != sizeof(xterm->pid)) + fd = -EMSGSIZE; + xterm->new_fd = fd; complete(&xterm->ready); @@ -51,7 +59,7 @@ int xterm_fd(int socket, int *pid_out) err = um_request_irq(XTERM_IRQ, socket, IRQ_READ, xterm_interrupt, IRQF_SHARED, "xterm", data); - if (err) { + if (err < 0) { printk(KERN_ERR "xterm_fd : failed to get IRQ for xterm, " "err = %d\n", err); ret = err; diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index b30f34a79882..1b9b82bbe322 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,5 +1,28 @@ -generic-y += bug.h cputime.h device.h emergency-restart.h futex.h hardirq.h -generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h -generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h -generic-y += switch_to.h clkdev.h +# SPDX-License-Identifier: GPL-2.0 +generic-y += bug.h +generic-y += compat.h +generic-y += device.h +generic-y += dma-mapping.h +generic-y += emergency-restart.h +generic-y += exec.h +generic-y += ftrace.h +generic-y += hw_irq.h +generic-y += irq_regs.h +generic-y += irq_work.h +generic-y += kdebug.h +generic-y += mcs_spinlock.h +generic-y += mmiowb.h +generic-y += module.h +generic-y += module.lds.h +generic-y += parport.h +generic-y += percpu.h +generic-y += preempt.h +generic-y += runtime-const.h +generic-y += softirq_stack.h +generic-y += switch_to.h +generic-y += topology.h generic-y += trace_clock.h +generic-y += kprobes.h +generic-y += mm_hooks.h +generic-y += vga.h +generic-y += video.h diff --git a/arch/um/include/asm/a.out-core.h b/arch/um/include/asm/a.out-core.h deleted file mode 100644 index 995643b18309..000000000000 --- a/arch/um/include/asm/a.out-core.h +++ /dev/null @@ -1,27 +0,0 @@ -/* a.out coredump register dumper - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef __UM_A_OUT_CORE_H -#define __UM_A_OUT_CORE_H - -#ifdef __KERNEL__ - -#include <linux/user.h> - -/* - * fill in the user structure for an a.out core dump - */ -static inline void aout_dump_thread(struct pt_regs *regs, struct user *u) -{ -} - -#endif /* __KERNEL__ */ -#endif /* __UM_A_OUT_CORE_H */ diff --git a/arch/um/include/asm/archrandom.h b/arch/um/include/asm/archrandom.h new file mode 100644 index 000000000000..24e16c979c51 --- /dev/null +++ b/arch/um/include/asm/archrandom.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_UM_ARCHRANDOM_H__ +#define __ASM_UM_ARCHRANDOM_H__ + +#include <linux/types.h> + +/* This is from <os.h>, but better not to #include that in a global header here. */ +ssize_t os_getrandom(void *buf, size_t len, unsigned int flags); + +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) +{ + ssize_t ret; + + ret = os_getrandom(v, max_longs * sizeof(*v), 0); + if (ret < 0) + return 0; + return ret / sizeof(*v); +} + +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) +{ + return 0; +} + +#endif diff --git a/arch/um/include/asm/asm-prototypes.h b/arch/um/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..408b31d59127 --- /dev/null +++ b/arch/um/include/asm/asm-prototypes.h @@ -0,0 +1,6 @@ +#include <asm-generic/asm-prototypes.h> +#include <asm/checksum.h> + +#ifdef CONFIG_UML_X86 +extern void cmpxchg8b_emu(void); +#endif diff --git a/arch/um/include/asm/bpf_perf_event.h b/arch/um/include/asm/bpf_perf_event.h new file mode 100644 index 000000000000..287221342d2c --- /dev/null +++ b/arch/um/include/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * asm-generic/bpf_perf_event.h is part of the uapi headers, but since + * arch/um has no uapi of its on, we can't use the "generic-y" + * Kbuild rule to generate the wrapper + */ + +#include <asm-generic/bpf_perf_event.h> diff --git a/arch/um/include/asm/bugs.h b/arch/um/include/asm/bugs.h deleted file mode 100644 index 6a72e240d5fc..000000000000 --- a/arch/um/include/asm/bugs.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __UM_BUGS_H -#define __UM_BUGS_H - -void check_bugs(void); - -#endif diff --git a/arch/um/include/asm/cache.h b/arch/um/include/asm/cache.h index 19e1bdd67416..5c156273484b 100644 --- a/arch/um/include/asm/cache.h +++ b/arch/um/include/asm/cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_CACHE_H #define __UM_CACHE_H diff --git a/arch/um/include/asm/cacheflush.h b/arch/um/include/asm/cacheflush.h new file mode 100644 index 000000000000..4c9858cd36ec --- /dev/null +++ b/arch/um/include/asm/cacheflush.h @@ -0,0 +1,9 @@ +#ifndef __UM_ASM_CACHEFLUSH_H +#define __UM_ASM_CACHEFLUSH_H + +#include <asm/tlbflush.h> +#define flush_cache_vmap flush_tlb_kernel_range +#define flush_cache_vunmap flush_tlb_kernel_range + +#include <asm-generic/cacheflush.h> +#endif /* __UM_ASM_CACHEFLUSH_H */ diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index 1dd5bd8a8c59..fd481ac371de 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm-generic/vmlinux.lds.h> .fini : { *(.fini) } =0x9090 @@ -8,14 +9,13 @@ _sdata = .; PROVIDE (sdata = .); - RODATA + RO_DATA(4096) .unprotected : { *(.unprotected) } . = ALIGN(4096); PROVIDE (_unprotected_end = .); . = ALIGN(4096); - .note : { *(.note.*) } EXCEPTION_TABLE(0) BUG_TABLE @@ -52,14 +52,6 @@ CON_INITCALL } - .uml.initcall.init : { - __uml_initcall_start = .; - *(.uml.initcall.init) - __uml_initcall_end = .; - } - - SECURITY_INIT - .exitcall : { __exitcall_begin = .; *(.exitcall.exit) @@ -81,7 +73,7 @@ .altinstr_replacement : { *(.altinstr_replacement) } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } + .exit.text : { EXIT_TEXT } .exit.data : { *(.exit.data) } .preinit_array : { @@ -91,6 +83,8 @@ } .init_array : { __init_array_start = .; + *(.kasan_init) + *(.init_array.*) *(.init_array) __init_array_end = .; } diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h new file mode 100644 index 000000000000..4354f6984271 --- /dev/null +++ b/arch/um/include/asm/cpufeature.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UM_CPUFEATURE_H +#define _ASM_UM_CPUFEATURE_H + +#include <asm/processor.h> + +#if defined(__KERNEL__) && !defined(__ASSEMBLER__) + +#include <asm/asm.h> +#include <linux/bitops.h> + +extern const char * const x86_cap_flags[NCAPINTS*32]; +extern const char * const x86_power_flags[32]; +#define X86_CAP_FMT "%s" +#define x86_cap_flag(flag) x86_cap_flags[flag] + +/* + * In order to save room, we index into this array by doing + * X86_BUG_<name> - NCAPINTS*32. + */ +extern const char * const x86_bug_flags[NBUGINTS*32]; + +#define test_cpu_cap(c, bit) \ + test_bit(bit, (unsigned long *)((c)->x86_capability)) + +/* + * There are 32 bits/features in each mask word. The high bits + * (selected with (bit>>5) give us the word number and the low 5 + * bits give us the bit/feature number inside the word. + * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can + * see if it is set in the mask word. + */ +#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ + (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) + +#define cpu_has(c, bit) \ + test_cpu_cap(c, bit) + +#define this_cpu_has(bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ + x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) + +/* + * This macro is for detection of features which need kernel + * infrastructure to be used. It may *not* directly test the CPU + * itself. Use the cpu_has() family if you want true runtime + * testing of CPU features, like in hypervisor code where you are + * supporting a possible guest feature where host support for it + * is not relevant. + */ +#define cpu_feature_enabled(bit) \ + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) + +#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) + +#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) + +extern void setup_clear_cpu_cap(unsigned int bit); + +#define setup_force_cpu_cap(bit) do { \ + set_cpu_cap(&boot_cpu_data, bit); \ + set_bit(bit, (unsigned long *)cpu_caps_set); \ +} while (0) + +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + +/* + * Static testing of CPU features. Used the same as boot_cpu_has(). It + * statically patches the target code for additional performance. Use + * static_cpu_has() only in fast paths, where every cycle counts. Which + * means that the boot_cpu_has() variant is already fast enough for the + * majority of cases and you should stick to using it as it is generally + * only two instructions: a RIP-relative MOV and a TEST. + */ +static __always_inline bool _static_cpu_has(u16 bit) +{ + asm goto("1: jmp 6f\n" + "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 4f - .\n" /* repl offset */ + " .word %P[always]\n" /* always replace */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ + ".previous\n" + ".section .altinstr_replacement,\"ax\"\n" + "4: jmp %l[t_no]\n" + "5:\n" + ".previous\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 0\n" /* no replacement */ + " .word %P[feature]\n" /* feature bit */ + " .byte 3b - 1b\n" /* src len */ + " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ + ".previous\n" + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : [feature] "i" (bit), + [always] "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); +t_yes: + return true; +t_no: + return false; +} + +#define static_cpu_has(bit) \ +( \ + __builtin_constant_p(boot_cpu_has(bit)) ? \ + boot_cpu_has(bit) : \ + _static_cpu_has(bit) \ +) + +#define cpu_has_bug(c, bit) cpu_has(c, (bit)) +#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) + +#define static_cpu_has_bug(bit) static_cpu_has((bit)) +#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) + +#define MAX_CPU_FEATURES (NCAPINTS * 32) +#define cpu_have_feature boot_cpu_has + +#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" +#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ + boot_cpu_data.x86_model + +#endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */ +#endif /* _ASM_UM_CPUFEATURE_H */ diff --git a/arch/um/include/asm/current.h b/arch/um/include/asm/current.h new file mode 100644 index 000000000000..159a29b3d4cc --- /dev/null +++ b/arch/um/include/asm/current.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include <linux/compiler.h> +#include <linux/threads.h> + +#ifndef __ASSEMBLER__ + +#include <shared/smp.h> + +struct task_struct; +extern struct task_struct *cpu_tasks[NR_CPUS]; + +static __always_inline struct task_struct *get_current(void) +{ + return cpu_tasks[uml_curr_cpu()]; +} + +#define current get_current() + +#endif /* __ASSEMBLER__ */ + +#endif /* __ASM_CURRENT_H */ diff --git a/arch/um/include/asm/delay.h b/arch/um/include/asm/delay.h new file mode 100644 index 000000000000..e79b2ab6f40c --- /dev/null +++ b/arch/um/include/asm/delay.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_DELAY_H +#define __UM_DELAY_H +#include <asm-generic/delay.h> +#include <linux/time-internal.h> + +static inline void um_ndelay(unsigned long nsecs) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { + time_travel_ndelay(nsecs); + return; + } + ndelay(nsecs); +} +#undef ndelay +#define ndelay(n) um_ndelay(n) + +static inline void um_udelay(unsigned long usecs) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { + time_travel_ndelay(1000 * usecs); + return; + } + udelay(usecs); +} +#undef udelay +#define udelay(n) um_udelay(n) +#endif /* __UM_DELAY_H */ diff --git a/arch/um/include/asm/dma.h b/arch/um/include/asm/dma.h index f88c5860520b..fdc53642c718 100644 --- a/arch/um/include/asm/dma.h +++ b/arch/um/include/asm/dma.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_DMA_H #define __UM_DMA_H diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h deleted file mode 100644 index 21a423bae5e8..000000000000 --- a/arch/um/include/asm/fixmap.h +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef __UM_FIXMAP_H -#define __UM_FIXMAP_H - -#include <asm/processor.h> -#include <asm/kmap_types.h> -#include <asm/archparam.h> -#include <asm/page.h> -#include <linux/threads.h> - -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. - * Also this lets us do fail-safe vmalloc(), we - * can guarantee that these special addresses and - * vmalloc()-ed addresses never overlap. - * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. - * - * TLB entries of such buffers will not be flushed across - * task switches. - */ - -/* - * on UP currently we will have no trace of the fixmap mechanizm, - * no page table allocations, etc. This might change in the - * future, say framebuffers for the console driver(s) could be - * fix-mapped? - */ -enum fixed_addresses { -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif - __end_of_fixed_addresses -}; - -extern void __set_fixmap (enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); - -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) -/* - * Some hardware wants to get fixmapped without caching. - */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) -/* - * used by vmalloc.c. - * - * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap, and leave one page empty - * at the top of mem.. - */ - -#define FIXADDR_TOP (TASK_SIZE - 2 * PAGE_SIZE) -#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) - -extern void __this_fixmap_does_not_exist(void); - -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without tranlation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -static inline unsigned long fix_to_virt(const unsigned int idx) -{ - /* - * this branch gets completely eliminated after inlining, - * except when someone tries to use fixaddr indices in an - * illegal way. (such as mixing up address types or using - * out-of-range indices). - * - * If it doesn't get removed, the linker will complain - * loudly with a reasonably clear error message.. - */ - if (idx >= __end_of_fixed_addresses) - __this_fixmap_does_not_exist(); - - return __fix_to_virt(idx); -} - -static inline unsigned long virt_to_fix(const unsigned long vaddr) -{ - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); - return __virt_to_fix(vaddr); -} - -#endif diff --git a/arch/um/include/asm/fpu/api.h b/arch/um/include/asm/fpu/api.h new file mode 100644 index 000000000000..3abf67c83c40 --- /dev/null +++ b/arch/um/include/asm/fpu/api.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ASM_UM_FPU_API_H +#define _ASM_UM_FPU_API_H + +#include <linux/types.h> + +/* Copyright (c) 2020 Cambridge Greys Ltd + * Copyright (c) 2020 Red Hat Inc. + * A set of "dummy" defines to allow the direct inclusion + * of x86 optimized copy, xor, etc routines into the + * UML code tree. */ + +#define kernel_fpu_begin() (void)0 +#define kernel_fpu_end() (void)0 + +static inline bool irq_fpu_usable(void) +{ + return true; +} + + +#endif diff --git a/arch/um/include/asm/futex.h b/arch/um/include/asm/futex.h new file mode 100644 index 000000000000..780aa6bfc050 --- /dev/null +++ b/arch/um/include/asm/futex.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UM_FUTEX_H +#define _ASM_UM_FUTEX_H + +#include <linux/futex.h> +#include <linux/uaccess.h> +#include <asm/errno.h> + + +int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr); +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval); + +#endif diff --git a/arch/um/include/asm/hardirq.h b/arch/um/include/asm/hardirq.h new file mode 100644 index 000000000000..8de71752a9b8 --- /dev/null +++ b/arch/um/include/asm/hardirq.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_UM_HARDIRQ_H +#define __ASM_UM_HARDIRQ_H + +#include <linux/cache.h> +#include <linux/threads.h> + +#define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 + +typedef struct { + unsigned int __softirq_pending; +#if IS_ENABLED(CONFIG_SMP) + unsigned int irq_resched_count; + unsigned int irq_call_count; +#endif +} ____cacheline_aligned irq_cpustat_t; + +DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); + +#define __ARCH_IRQ_STAT + +#define inc_irq_stat(member) this_cpu_inc(irq_stat.member) + +#include <linux/irq.h> + +static inline void ack_bad_irq(unsigned int irq) +{ + pr_crit("unexpected IRQ trap at vector %02x\n", irq); +} + +#endif /* __ASM_UM_HARDIRQ_H */ diff --git a/arch/um/include/asm/io.h b/arch/um/include/asm/io.h new file mode 100644 index 000000000000..9ea42cc746d9 --- /dev/null +++ b/arch/um/include/asm/io.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UM_IO_H +#define _ASM_UM_IO_H +#include <linux/types.h> + +/* get emulated iomem (if desired) */ +#include <asm-generic/logic_io.h> + +#ifndef ioremap +#define ioremap ioremap +static inline void __iomem *ioremap(phys_addr_t offset, size_t size) +{ + return NULL; +} +#endif /* ioremap */ + +#ifndef iounmap +#define iounmap iounmap +static inline void iounmap(void __iomem *addr) +{ +} +#endif /* iounmap */ + +#include <asm-generic/io.h> + +#endif diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h index 4a2037f8204b..36dbedd1af48 100644 --- a/arch/um/include/asm/irq.h +++ b/arch/um/include/asm/irq.h @@ -1,23 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_IRQ_H #define __UM_IRQ_H #define TIMER_IRQ 0 #define UMN_IRQ 1 -#define CONSOLE_IRQ 2 -#define CONSOLE_WRITE_IRQ 3 -#define UBD_IRQ 4 -#define UM_ETH_IRQ 5 -#define SSL_IRQ 6 -#define SSL_WRITE_IRQ 7 -#define ACCEPT_IRQ 8 -#define MCONSOLE_IRQ 9 -#define WINCH_IRQ 10 -#define SIGIO_WRITE_IRQ 11 -#define TELNETD_IRQ 12 -#define XTERM_IRQ 13 -#define RANDOM_IRQ 14 - -#define LAST_IRQ RANDOM_IRQ -#define NR_IRQS (LAST_IRQ + 1) +#define UBD_IRQ 2 +#define UM_ETH_IRQ 3 +#define ACCEPT_IRQ 4 +#define MCONSOLE_IRQ 5 +#define WINCH_IRQ 6 +#define SIGIO_WRITE_IRQ 7 +#define TELNETD_IRQ 8 +#define XTERM_IRQ 9 +#define RANDOM_IRQ 10 +#define SIGCHLD_IRQ 11 +#ifdef CONFIG_UML_NET_VECTOR + +#define VECTOR_BASE_IRQ (SIGCHLD_IRQ + 1) +#define VECTOR_IRQ_SPACE 8 + +#define UM_FIRST_DYN_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ) + +#else + +#define UM_FIRST_DYN_IRQ (SIGCHLD_IRQ + 1) + +#endif + +#define UM_LAST_SIGNAL_IRQ 64 +/* If we have (simulated) PCI MSI, allow 64 more interrupt numbers for it */ +#ifdef CONFIG_PCI_MSI +#define NR_IRQS (UM_LAST_SIGNAL_IRQ + 64) +#else +#define NR_IRQS UM_LAST_SIGNAL_IRQ +#endif /* CONFIG_PCI_MSI */ + +#include <asm-generic/irq.h> #endif diff --git a/arch/um/include/asm/irqflags.h b/arch/um/include/asm/irqflags.h index c780d8a16773..31e49e0894c5 100644 --- a/arch/um/include/asm/irqflags.h +++ b/arch/um/include/asm/irqflags.h @@ -1,42 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_IRQFLAGS_H #define __UM_IRQFLAGS_H -extern int get_signals(void); -extern int set_signals(int enable); -extern void block_signals(void); -extern void unblock_signals(void); +int um_get_signals(void); +int um_set_signals(int enable); +void block_signals(void); +void unblock_signals(void); +#define arch_local_save_flags arch_local_save_flags static inline unsigned long arch_local_save_flags(void) { - return get_signals(); + return um_get_signals(); } +#define arch_local_irq_restore arch_local_irq_restore static inline void arch_local_irq_restore(unsigned long flags) { - set_signals(flags); + um_set_signals(flags); } +#define arch_local_irq_enable arch_local_irq_enable static inline void arch_local_irq_enable(void) { unblock_signals(); } +#define arch_local_irq_disable arch_local_irq_disable static inline void arch_local_irq_disable(void) { block_signals(); } -static inline unsigned long arch_local_irq_save(void) -{ - unsigned long flags; - flags = arch_local_save_flags(); - arch_local_irq_disable(); - return flags; -} +#define ARCH_IRQ_DISABLED 0 -static inline bool arch_irqs_disabled(void) -{ - return arch_local_save_flags() == 0; -} +#include <asm-generic/irqflags.h> #endif diff --git a/arch/um/include/asm/kasan.h b/arch/um/include/asm/kasan.h new file mode 100644 index 000000000000..81bcdc0f962e --- /dev/null +++ b/arch/um/include/asm/kasan.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_UM_KASAN_H +#define __ASM_UM_KASAN_H + +#include <linux/init.h> +#include <linux/const.h> + +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) + +/* used in kasan_mem_to_shadow to divide by 8 */ +#define KASAN_SHADOW_SCALE_SHIFT 3 + +#ifdef CONFIG_X86_64 +#define KASAN_HOST_USER_SPACE_END_ADDR 0x00007fffffffffffUL +/* KASAN_SHADOW_SIZE is the size of total address space divided by 8 */ +#define KASAN_SHADOW_SIZE ((KASAN_HOST_USER_SPACE_END_ADDR + 1) >> \ + KASAN_SHADOW_SCALE_SHIFT) +#else +#error "KASAN_SHADOW_SIZE is not defined for this sub-architecture" +#endif /* CONFIG_X86_64 */ + +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET) +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) + +#ifdef CONFIG_KASAN +void kasan_init(void); +#else +static inline void kasan_init(void) { } +#endif /* CONFIG_KASAN */ + +#endif /* __ASM_UM_KASAN_H */ diff --git a/arch/um/include/asm/kmap_types.h b/arch/um/include/asm/kmap_types.h deleted file mode 100644 index 2e0a6b1d8300..000000000000 --- a/arch/um/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __UM_KMAP_TYPES_H -#define __UM_KMAP_TYPES_H - -/* No more #include "asm/arch/kmap_types.h" ! */ - -#define KM_TYPE_NR 14 - -#endif diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index da705448590f..07d48738b402 100644 --- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -1,24 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __ARCH_UM_MMU_H #define __ARCH_UM_MMU_H +#include "linux/types.h" +#include <linux/mutex.h> +#include <linux/spinlock.h> #include <mm_id.h> -#include <asm/mm_context.h> typedef struct mm_context { struct mm_id id; - struct uml_arch_mm_context arch; - struct page *stub_pages[2]; -} mm_context_t; + struct mutex turnstile; + + struct list_head list; -extern void __switch_mm(struct mm_id * mm_idp); + /* Address range in need of a TLB sync */ + spinlock_t sync_tlb_lock; + unsigned long sync_tlb_range_from; + unsigned long sync_tlb_range_to; +} mm_context_t; -/* Avoid tangled inclusion with asm/ldt.h */ -extern long init_new_ldt(struct mm_context *to_mm, struct mm_context *from_mm); -extern void free_ldt(struct mm_context *mm); +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .turnstile = __MUTEX_INITIALIZER(mm.context.turnstile), \ + .sync_tlb_lock = __SPIN_LOCK_INITIALIZER(mm.context.sync_tlb_lock), \ + } #endif diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index aa4a743dc4ab..c727e56ba116 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -1,58 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __UM_MMU_CONTEXT_H #define __UM_MMU_CONTEXT_H #include <linux/sched.h> -#include <asm/mmu.h> - -extern void uml_setup_stubs(struct mm_struct *mm); -extern void arch_exit_mmap(struct mm_struct *mm); - -#define deactivate_mm(tsk,mm) do { } while (0) - -extern void force_flush_all(void); +#include <linux/mm_types.h> +#include <linux/mmap_lock.h> -static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) -{ - /* - * This is called by fs/exec.c and sys_unshare() - * when the new ->mm is used for the first time. - */ - __switch_mm(&new->context.id); - down_write(&new->mmap_sem); - uml_setup_stubs(new); - up_write(&new->mmap_sem); -} +#include <asm/mm_hooks.h> +#include <asm/mmu.h> static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned cpu = smp_processor_id(); - - if(prev != next){ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - cpumask_set_cpu(cpu, mm_cpumask(next)); - if(next != &init_mm) - __switch_mm(&next->context.id); - } -} - -static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -{ - uml_setup_stubs(mm); -} - -static inline void enter_lazy_tlb(struct mm_struct *mm, - struct task_struct *tsk) -{ } +#define init_new_context init_new_context extern int init_new_context(struct task_struct *task, struct mm_struct *mm); +#define destroy_context destroy_context extern void destroy_context(struct mm_struct *mm); +#include <asm-generic/mmu_context.h> + #endif diff --git a/arch/um/include/asm/msi.h b/arch/um/include/asm/msi.h new file mode 100644 index 000000000000..c8c6c381f394 --- /dev/null +++ b/arch/um/include/asm/msi.h @@ -0,0 +1 @@ +#include <asm-generic/msi.h> diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 5ff53d9185f7..2d363460d896 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -1,7 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) * Copyright 2003 PathScale, Inc. - * Licensed under the GPL */ #ifndef __UM_PAGE_H @@ -9,15 +9,13 @@ #include <linux/const.h> -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#include <vdso/page.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct page; +#include <linux/pfn.h> #include <linux/types.h> #include <asm/vm-flags.h> @@ -31,56 +29,35 @@ struct page; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT) - -typedef struct { unsigned long pte_low, pte_high; } pte_t; -typedef struct { unsigned long pmd; } pmd_t; +typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pgd; } pgd_t; -#define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32)) - -#define pte_get_bits(pte, bits) ((pte).pte_low & (bits)) -#define pte_set_bits(pte, bits) ((pte).pte_low |= (bits)) -#define pte_clear_bits(pte, bits) ((pte).pte_low &= ~(bits)) -#define pte_copy(to, from) ({ (to).pte_high = (from).pte_high; \ - smp_wmb(); \ - (to).pte_low = (from).pte_low; }) -#define pte_is_zero(pte) (!((pte).pte_low & ~_PAGE_NEWPAGE) && !(pte).pte_high) -#define pte_set_val(pte, phys, prot) \ - ({ (pte).pte_high = (phys) >> 32; \ - (pte).pte_low = (phys) | pgprot_val(prot); }) +#if CONFIG_PGTABLE_LEVELS > 2 + +typedef struct { unsigned long pmd; } pmd_t; #define pmd_val(x) ((x).pmd) #define __pmd(x) ((pmd_t) { (x) } ) -typedef unsigned long long pfn_t; -typedef unsigned long long phys_t; - -#else +#if CONFIG_PGTABLE_LEVELS > 3 -typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned long pgd; } pgd_t; +typedef struct { unsigned long pud; } pud_t; +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) } ) -#ifdef CONFIG_3_LEVEL_PGTABLES -typedef struct { unsigned long pmd; } pmd_t; -#define pmd_val(x) ((x).pmd) -#define __pmd(x) ((pmd_t) { (x) } ) -#endif +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ #define pte_val(x) ((x).pte) - #define pte_get_bits(p, bits) ((p).pte & (bits)) #define pte_set_bits(p, bits) ((p).pte |= (bits)) #define pte_clear_bits(p, bits) ((p).pte &= ~(bits)) #define pte_copy(to, from) ((to).pte = (from).pte) -#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) +#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEEDSYNC)) #define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot)) -typedef unsigned long pfn_t; typedef unsigned long phys_t; -#endif - typedef struct { unsigned long pgprot; } pgprot_t; typedef struct page *pgtable_t; @@ -106,17 +83,17 @@ extern unsigned long uml_physmem; * casting is the right thing, but 32-bit UML can't have 64-bit virtual * addresses */ -#define __pa(virt) to_phys((void *) (unsigned long) (virt)) -#define __va(phys) to_virt((unsigned long) (phys)) +#define __pa(virt) uml_to_phys((void *) (unsigned long) (virt)) +#define __va(phys) uml_to_virt((unsigned long) (phys)) -#define phys_to_pfn(p) ((pfn_t) ((p) >> PAGE_SHIFT)) -#define pfn_to_phys(pfn) ((phys_t) ((pfn) << PAGE_SHIFT)) +#define phys_to_pfn(p) ((p) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) PFN_PHYS(pfn) -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ + #endif /* __UM_PAGE_H */ diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h new file mode 100644 index 000000000000..238d2e7faff8 --- /dev/null +++ b/arch/um/include/asm/pci.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_UM_PCI_H +#define __ASM_UM_PCI_H +#include <linux/types.h> +#include <asm/io.h> + +/* Generic PCI */ +#include <asm-generic/pci.h> + +#ifdef CONFIG_PCI_MSI +/* + * This is a bit of an annoying hack, and it assumes we only have + * the virt-pci (if anything). Which is true, but still. + */ +void *pci_root_bus_fwnode(struct pci_bus *bus); +#define pci_root_bus_fwnode pci_root_bus_fwnode +#endif + +#endif /* __ASM_UM_PCI_H */ diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index bf90b2aa2002..826ec44b58cd 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -1,8 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) * Copyright 2003 PathScale, Inc. * Derived from include/asm-i386/pgalloc.h and include/asm-i386/pgtable.h - * Licensed under the GPL */ #ifndef __UM_PGALLOC_H @@ -10,6 +10,8 @@ #include <linux/mm.h> +#include <asm-generic/pgalloc.h> + #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) __pa(pte))) @@ -17,45 +19,27 @@ set_pmd(pmd, __pmd(_PAGE_TABLE + \ ((unsigned long long)page_to_pfn(pte) << \ (unsigned long long) PAGE_SHIFT))) -#define pmd_pgtable(pmd) pmd_page(pmd) /* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); - -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long) pte); -} +#define __pte_free_tlb(tlb, pte, address) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} +#if CONFIG_PGTABLE_LEVELS > 2 -#define __pte_free_tlb(tlb,pte, address) \ -do { \ - pgtable_page_dtor(pte); \ - tlb_remove_page((tlb),(pte)); \ -} while (0) +#define __pmd_free_tlb(tlb, pmd, address) \ + tlb_remove_ptdesc((tlb), virt_to_ptdesc(pmd)) -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 3 -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - free_page((unsigned long)pmd); -} +#define __pud_free_tlb(tlb, pud, address) \ + tlb_remove_ptdesc((tlb), virt_to_ptdesc(pud)) -#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) #endif - -#define check_pgt_cache() do { } while (0) +#endif #endif diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index f534b73e753e..14ec16f92ce4 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -1,8 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) * Copyright 2003 PathScale, Inc. * Derived from include/asm-i386/pgtable.h - * Licensed under the GPL */ #ifndef __UM_PGTABLE_2LEVEL_H @@ -23,7 +23,6 @@ #define PTRS_PER_PTE 1024 #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) #define PTRS_PER_PGD 1024 -#define FIRST_USER_ADDRESS 0 #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \ @@ -32,22 +31,12 @@ printk("%s:%d: bad pgd %p(%08lx).\n", __FILE__, __LINE__, &(e), \ pgd_val(e)) -static inline int pgd_newpage(pgd_t pgd) { return 0; } +static inline int pgd_needsync(pgd_t pgd) { return 0; } static inline void pgd_mkuptodate(pgd_t pgd) { } #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) #define pte_pfn(x) phys_to_pfn(pte_val(x)) -#define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) -/* - * Bits 0 through 4 are taken - */ -#define PTE_FILE_MAX_BITS 27 - -#define pte_to_pgoff(pte) (pte_val(pte) >> 5) - -#define pgoff_to_pte(off) ((pte_t) { ((off) << 5) + _PAGE_FILE }) - #endif diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h deleted file mode 100644 index 0032f9212e74..000000000000 --- a/arch/um/include/asm/pgtable-3level.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2003 PathScale Inc - * Derived from include/asm-i386/pgtable.h - * Licensed under the GPL - */ - -#ifndef __UM_PGTABLE_3LEVEL_H -#define __UM_PGTABLE_3LEVEL_H - -#include <asm-generic/pgtable-nopud.h> - -/* PGDIR_SHIFT determines what a third-level page table entry can map */ - -#ifdef CONFIG_64BIT -#define PGDIR_SHIFT 30 -#else -#define PGDIR_SHIFT 31 -#endif -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* PMD_SHIFT determines the size of the area a second-level page table can - * map - */ - -#define PMD_SHIFT 21 -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* - * entries per page directory level - */ - -#define PTRS_PER_PTE 512 -#ifdef CONFIG_64BIT -#define PTRS_PER_PMD 512 -#define PTRS_PER_PGD 512 -#else -#define PTRS_PER_PMD 1024 -#define PTRS_PER_PGD 1024 -#endif - -#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ - pte_val(e)) -#define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ - pmd_val(e)) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ - pgd_val(e)) - -#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEWPAGE)) -#define pud_bad(x) ((pud_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) -#define pud_present(x) (pud_val(x) & _PAGE_PRESENT) -#define pud_populate(mm, pud, pmd) \ - set_pud(pud, __pud(_PAGE_TABLE + __pa(pmd))) - -#ifdef CONFIG_64BIT -#define set_pud(pudptr, pudval) set_64bit((u64 *) (pudptr), pud_val(pudval)) -#else -#define set_pud(pudptr, pudval) (*(pudptr) = (pudval)) -#endif - -static inline int pgd_newpage(pgd_t pgd) -{ - return(pgd_val(pgd) & _PAGE_NEWPAGE); -} - -static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; } - -#ifdef CONFIG_64BIT -#define set_pmd(pmdptr, pmdval) set_64bit((u64 *) (pmdptr), pmd_val(pmdval)) -#else -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) -#endif - -struct mm_struct; -extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address); - -static inline void pud_clear (pud_t *pud) -{ - set_pud(pud, __pud(_PAGE_NEWPAGE)); -} - -#define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PAGE_MASK)) - -/* Find an entry in the second-level page table.. */ -#define pmd_offset(pud, address) ((pmd_t *) pud_page_vaddr(*(pud)) + \ - pmd_index(address)) - -static inline unsigned long pte_pfn(pte_t pte) -{ - return phys_to_pfn(pte_val(pte)); -} - -static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot) -{ - pte_t pte; - phys_t phys = pfn_to_phys(page_nr); - - pte_set_val(pte, phys, pgprot); - return pte; -} - -static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot) -{ - return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); -} - -/* - * Bits 0 through 3 are taken in the low part of the pte, - * put the 32 bits of offset into the high part. - */ -#define PTE_FILE_MAX_BITS 32 - -#ifdef CONFIG_64BIT - -#define pte_to_pgoff(p) ((p).pte >> 32) - -#define pgoff_to_pte(off) ((pte_t) { ((off) << 32) | _PAGE_FILE }) - -#else - -#define pte_to_pgoff(pte) ((pte).pte_high) - -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) - -#endif - -#endif - diff --git a/arch/um/include/asm/pgtable-4level.h b/arch/um/include/asm/pgtable-4level.h new file mode 100644 index 000000000000..7a271b7b83d2 --- /dev/null +++ b/arch/um/include/asm/pgtable-4level.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2003 PathScale Inc + * Derived from include/asm-i386/pgtable.h + */ + +#ifndef __UM_PGTABLE_4LEVEL_H +#define __UM_PGTABLE_4LEVEL_H + +#include <asm-generic/pgtable-nop4d.h> + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ + +#define PGDIR_SHIFT 39 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +/* PUD_SHIFT determines the size of the area a third-level page table can + * map + */ + +#define PUD_SHIFT 30 +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PMD_SHIFT determines the size of the area a second-level page table can + * map + */ + +#define PMD_SHIFT 21 +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) + +/* + * entries per page directory level + */ + +#define PTRS_PER_PTE 512 +#define PTRS_PER_PMD 512 +#define PTRS_PER_PUD 512 +#define PTRS_PER_PGD 512 + +#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pte_val(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pud_val(e)) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pgd_val(e)) + +#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEEDSYNC)) +#define pud_bad(x) ((pud_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pud_present(x) (pud_val(x) & _PAGE_PRESENT) +#define pud_populate(mm, pud, pmd) \ + set_pud(pud, __pud(_PAGE_TABLE + __pa(pmd))) + +#define set_pud(pudptr, pudval) (*(pudptr) = (pudval)) + +#define p4d_none(x) (!(p4d_val(x) & ~_PAGE_NEEDSYNC)) +#define p4d_bad(x) ((p4d_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define p4d_present(x) (p4d_val(x) & _PAGE_PRESENT) +#define p4d_populate(mm, p4d, pud) \ + set_p4d(p4d, __p4d(_PAGE_TABLE + __pa(pud))) + +#define set_p4d(p4dptr, p4dval) (*(p4dptr) = (p4dval)) + + +static inline int pgd_needsync(pgd_t pgd) +{ + return pgd_val(pgd) & _PAGE_NEEDSYNC; +} + +static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEEDSYNC; } + +#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) + +static inline void pud_clear (pud_t *pud) +{ + set_pud(pud, __pud(_PAGE_NEEDSYNC)); +} + +static inline void p4d_clear (p4d_t *p4d) +{ + set_p4d(p4d, __p4d(_PAGE_NEEDSYNC)); +} + +#define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) +#define pud_pgtable(pud) ((pmd_t *) __va(pud_val(pud) & PAGE_MASK)) + +#define p4d_page(p4d) phys_to_page(p4d_val(p4d) & PAGE_MASK) +#define p4d_pgtable(p4d) ((pud_t *) __va(p4d_val(p4d) & PAGE_MASK)) + +static inline unsigned long pte_pfn(pte_t pte) +{ + return phys_to_pfn(pte_val(pte)); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); +} + +#endif diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index bf974f712af7..3b42b0f45bf6 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -1,31 +1,35 @@ -/* +/* SPDX-License-Identifier: GPL-2.0 */ +/* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Copyright 2003 PathScale, Inc. * Derived from include/asm-i386/pgtable.h - * Licensed under the GPL */ #ifndef __UM_PGTABLE_H #define __UM_PGTABLE_H -#include <asm/fixmap.h> +#include <asm/page.h> +#include <linux/mm_types.h> #define _PAGE_PRESENT 0x001 -#define _PAGE_NEWPAGE 0x002 -#define _PAGE_NEWPROT 0x004 +#define _PAGE_NEEDSYNC 0x002 #define _PAGE_RW 0x020 #define _PAGE_USER 0x040 #define _PAGE_ACCESSED 0x080 #define _PAGE_DIRTY 0x100 /* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE 0x008 /* nonlinear file mapping, saved PTE; unset:swap */ #define _PAGE_PROTNONE 0x010 /* if the user mapped it with PROT_NONE; pte_present gives true */ -#ifdef CONFIG_3_LEVEL_PGTABLES -#include <asm/pgtable-3level.h> -#else +/* We borrow bit 10 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x400 + +#if CONFIG_PGTABLE_LEVELS == 4 +#include <asm/pgtable-4level.h> +#elif CONFIG_PGTABLE_LEVELS == 2 #include <asm/pgtable-2level.h> +#else +#error "Unsupported number of page table levels" #endif extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; @@ -33,8 +37,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* zero page used for uninitialized stuff */ extern unsigned long *empty_zero_page; -#define pgtable_cache_init() do ; while (0) - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that @@ -43,19 +45,15 @@ extern unsigned long *empty_zero_page; * area for the same reason. ;) */ -extern unsigned long end_iomem; +#ifndef COMPILE_OFFSETS +#include <as-layout.h> /* for high_physmem */ +#endif #define VMALLOC_OFFSET (__va_space) -#define VMALLOC_START ((end_iomem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) -#define PKMAP_BASE ((FIXADDR_START - LAST_PKMAP * PAGE_SIZE) & PMD_MASK) -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif +#define VMALLOC_START ((high_physmem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) +#define VMALLOC_END (TASK_SIZE-2*PAGE_SIZE) #define MODULES_VADDR VMALLOC_START #define MODULES_END VMALLOC_END -#define MODULES_LEN (MODULES_VADDR - MODULES_END) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -75,23 +73,6 @@ extern unsigned long end_iomem; * Also, write permissions imply read permissions. This is the closest we can * get.. */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED /* * ZERO_PAGE is a global shared page that is always zero: used @@ -99,20 +80,24 @@ extern unsigned long end_iomem; */ #define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) -#define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) +#define pte_clear(mm, addr, xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEEDSYNC)) -#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE)) +#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEEDSYNC)) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) +#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEEDSYNC; } while (0) -#define pmd_newpage(x) (pmd_val(x) & _PAGE_NEWPAGE) -#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEWPAGE) +#define pmd_needsync(x) (pmd_val(x) & _PAGE_NEEDSYNC) +#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEEDSYNC) -#define pud_newpage(x) (pud_val(x) & _PAGE_NEWPAGE) -#define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEWPAGE) +#define pud_needsync(x) (pud_val(x) & _PAGE_NEEDSYNC) +#define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEEDSYNC) +#define p4d_needsync(x) (p4d_val(x) & _PAGE_NEEDSYNC) +#define p4d_mkuptodate(x) (p4d_val(x) &= ~_PAGE_NEEDSYNC) + +#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK) #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -135,7 +120,7 @@ static inline int pte_none(pte_t pte) * Undefined behaviour if not.. */ static inline int pte_read(pte_t pte) -{ +{ return((pte_get_bits(pte, _PAGE_USER)) && !(pte_get_bits(pte, _PAGE_PROTNONE))); } @@ -151,14 +136,6 @@ static inline int pte_write(pte_t pte) !(pte_get_bits(pte, _PAGE_PROTNONE))); } -/* - * The following only works if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) -{ - return pte_get_bits(pte, _PAGE_FILE); -} - static inline int pte_dirty(pte_t pte) { return pte_get_bits(pte, _PAGE_DIRTY); @@ -169,19 +146,9 @@ static inline int pte_young(pte_t pte) return pte_get_bits(pte, _PAGE_ACCESSED); } -static inline int pte_newpage(pte_t pte) +static inline int pte_needsync(pte_t pte) { - return pte_get_bits(pte, _PAGE_NEWPAGE); -} - -static inline int pte_newprot(pte_t pte) -{ - return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT))); -} - -static inline int pte_special(pte_t pte) -{ - return 0; + return pte_get_bits(pte, _PAGE_NEEDSYNC); } /* @@ -190,38 +157,32 @@ static inline int pte_special(pte_t pte) * ================================= */ -static inline pte_t pte_mknewprot(pte_t pte) -{ - pte_set_bits(pte, _PAGE_NEWPROT); - return(pte); -} - static inline pte_t pte_mkclean(pte_t pte) { pte_clear_bits(pte, _PAGE_DIRTY); return(pte); } -static inline pte_t pte_mkold(pte_t pte) -{ +static inline pte_t pte_mkold(pte_t pte) +{ pte_clear_bits(pte, _PAGE_ACCESSED); return(pte); } static inline pte_t pte_wrprotect(pte_t pte) -{ +{ pte_clear_bits(pte, _PAGE_RW); - return(pte_mknewprot(pte)); + return pte; } static inline pte_t pte_mkread(pte_t pte) -{ +{ pte_set_bits(pte, _PAGE_USER); - return(pte_mknewprot(pte)); + return pte; } static inline pte_t pte_mkdirty(pte_t pte) -{ +{ pte_set_bits(pte, _PAGE_DIRTY); return(pte); } @@ -232,28 +193,21 @@ static inline pte_t pte_mkyoung(pte_t pte) return(pte); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { pte_set_bits(pte, _PAGE_RW); - return(pte_mknewprot(pte)); + return pte; } -static inline pte_t pte_mkuptodate(pte_t pte) +static inline pte_t pte_mkuptodate(pte_t pte) { - pte_clear_bits(pte, _PAGE_NEWPAGE); - if(pte_present(pte)) - pte_clear_bits(pte, _PAGE_NEWPROT); - return(pte); + pte_clear_bits(pte, _PAGE_NEEDSYNC); + return pte; } -static inline pte_t pte_mknewpage(pte_t pte) -{ - pte_set_bits(pte, _PAGE_NEWPAGE); - return(pte); -} - -static inline pte_t pte_mkspecial(pte_t pte) +static inline pte_t pte_mkneedsync(pte_t pte) { + pte_set_bits(pte, _PAGE_NEEDSYNC); return(pte); } @@ -261,115 +215,124 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) { pte_copy(*pteptr, pteval); - /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so - * fix_range knows to unmap it. _PAGE_NEWPROT is specific to - * mapped pages. + /* If it's a swap entry, it needs to be marked _PAGE_NEEDSYNC so + * update_pte_range knows to unmap it. */ - *pteptr = pte_mknewpage(*pteptr); - if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); + *pteptr = pte_mkneedsync(*pteptr); +} + +#define PFN_PTE_SHIFT PAGE_SHIFT + +static inline void um_tlb_mark_sync(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + guard(spinlock_irqsave)(&mm->context.sync_tlb_lock); + + if (!mm->context.sync_tlb_range_to) { + mm->context.sync_tlb_range_from = start; + mm->context.sync_tlb_range_to = end; + } else { + if (start < mm->context.sync_tlb_range_from) + mm->context.sync_tlb_range_from = start; + if (end > mm->context.sync_tlb_range_to) + mm->context.sync_tlb_range_to = end; + } +} + +#define set_ptes set_ptes +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int nr) +{ + /* Basically the default implementation */ + size_t length = nr * PAGE_SIZE; + + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); + } + + um_tlb_mark_sync(mm, addr, addr + length); } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { - return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEWPAGE); + return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEEDSYNC); } -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ - -#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) #define __virt_to_page(virt) phys_to_page(__pa(virt)) -#define page_to_phys(page) pfn_to_phys((pfn_t) page_to_pfn(page)) #define virt_to_page(addr) __virt_to_page((const unsigned long) addr) -#define mk_pte(page, pgprot) \ - ({ pte_t pte; \ - \ - pte_set_val(pte, page_to_phys(page), (pgprot)); \ - if (pte_present(pte)) \ - pte_mknewprot(pte_mknewpage(pte)); \ - pte;}) +static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) +{ + pte_t pte; + + pte_set_val(pte, pfn_to_phys(pfn), pgprot); + + return pte; +} static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pte_set_val(pte, (pte_val(pte) & _PAGE_CHG_MASK), newprot); - return pte; + return pte; } /* - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] - * - * this macro returns the index of the entry in the pgd page which would - * control the given virtual address - */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) - -/* - * pgd_offset() returns a (pgd_t *) - * pgd_index() is used get the offset into the pgd page's array of pgd_t's; - */ -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) - -/* - * a shortcut which implies the use of the kernel's pgd, instead - * of a process's - */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) - -/* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] * * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) -#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) - -#define pmd_page_vaddr(pmd) \ - ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) - -/* - * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] - * - * this macro returns the index of the entry in the pte page which would - * control the given virtual address - */ -#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) \ - ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) -#define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_unmap(pte) do { } while (0) struct mm_struct; extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); -#define update_mmu_cache(vma,address,ptep) do ; while (0) +#define update_mmu_cache(vma,address,ptep) do {} while (0) +#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do {} while (0) -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------- offset ----------------> E < type -> 0 0 0 1 0 + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_NEEDSYNC (bit 1) is always set to 1 in set_pte(). + */ #define __swp_type(x) (((x).val >> 5) & 0x1f) #define __swp_offset(x) ((x).val >> 11) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 5) | ((offset) << 11) }) + ((swp_entry_t) { (((type) & 0x1f) << 5) | ((offset) << 11) }) #define __pte_to_swp_entry(pte) \ ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) +static inline bool pte_swp_exclusive(pte_t pte) +{ + return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE); +} -#include <asm-generic/pgtable.h> +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_set_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} -/* Clear a kernel PTE and flush it from the TLB */ -#define kpte_clear_flush(ptep, vaddr) \ -do { \ - pte_clear(&init_mm, (vaddr), (ptep)); \ - __flush_tlb_one((vaddr)); \ -} while (0) +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_clear_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} #endif diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index c03cd5a02364..7854d51b6639 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __UM_PROCESSOR_GENERIC_H @@ -11,65 +11,40 @@ struct pt_regs; struct task_struct; #include <asm/ptrace.h> -#include <registers.h> #include <sysdep/archsetjmp.h> #include <linux/prefetch.h> +#include <asm/cpufeatures.h> + struct mm_struct; struct thread_struct { - struct task_struct *saved_task; - struct pt_regs regs; - int singlestep_syscall; - void *fault_addr; - jmp_buf *fault_catcher; + struct pt_regs *segv_regs; struct task_struct *prev_sched; - unsigned long temp_stack; struct arch_thread arch; jmp_buf switch_buf; - int mm_count; struct { - int op; - union { - struct { - int pid; - } fork, exec; - struct { - int (*proc)(void *); - void *arg; - } thread; - struct { - void (*proc)(void *); - void *arg; - } cb; - } u; + struct { + int (*proc)(void *); + void *arg; + } thread; } request; + + void *segv_continue; + + /* Contains variable sized FP registers */ + struct pt_regs regs; }; #define INIT_THREAD \ { \ .regs = EMPTY_REGS, \ - .fault_addr = NULL, \ .prev_sched = NULL, \ - .temp_stack = 0, \ .arch = INIT_ARCH_THREAD, \ - .request = { 0 } \ -} - -static inline void release_thread(struct task_struct *task) -{ + .request = { } \ } -extern unsigned long thread_saved_pc(struct task_struct *t); - -static inline void mm_copy_segments(struct mm_struct *from_mm, - struct mm_struct *new_mm) -{ -} - -#define init_stack (init_thread_union.stack) - /* * User space process size: 3GB (default). */ @@ -96,23 +71,18 @@ extern void start_thread(struct pt_regs *regs, unsigned long entry, struct cpuinfo_um { unsigned long loops_per_jiffy; - int ipi_pipe[2]; + int cache_alignment; + union { + __u32 x86_capability[NCAPINTS + NBUGINTS]; + unsigned long x86_capability_alignment; + }; }; extern struct cpuinfo_um boot_cpu_data; -#define my_cpu_data cpu_data[smp_processor_id()] - -#ifdef CONFIG_SMP -extern struct cpuinfo_um cpu_data[]; -#define current_cpu_data cpu_data[smp_processor_id()] -#else -#define cpu_data (&boot_cpu_data) -#define current_cpu_data boot_cpu_data -#endif - +#define cache_line_size() (boot_cpu_data.cache_alignment) #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf) -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #endif diff --git a/arch/um/include/asm/ptrace-generic.h b/arch/um/include/asm/ptrace-generic.h index cb9b3c47ca8e..86d74f9d33cf 100644 --- a/arch/um/include/asm/ptrace-generic.h +++ b/arch/um/include/asm/ptrace-generic.h @@ -1,14 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __UM_PTRACE_GENERIC_H #define __UM_PTRACE_GENERIC_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ -#include <asm/ptrace-abi.h> #include <sysdep/ptrace.h> struct pt_regs { @@ -28,6 +27,8 @@ struct pt_regs { #define instruction_pointer(regs) PT_REGS_IP(regs) +#define PTRACE_OLDSETOPTIONS 21 + struct task_struct; extern long subarch_ptrace(struct task_struct *child, long request, @@ -35,9 +36,12 @@ extern long subarch_ptrace(struct task_struct *child, long request, extern unsigned long getreg(struct task_struct *child, int regno); extern int putreg(struct task_struct *child, int regno, unsigned long value); -extern int arch_copy_tls(struct task_struct *new); +extern int poke_user(struct task_struct *child, long addr, long data); +extern int peek_user(struct task_struct *child, long addr, long data); + +extern int arch_set_tls(struct task_struct *new, unsigned long tls); extern void clear_flushed_tls(struct task_struct *task); -extern void syscall_trace_enter(struct pt_regs *regs); +extern int syscall_trace_enter(struct pt_regs *regs); extern void syscall_trace_leave(struct pt_regs *regs); #endif diff --git a/arch/um/include/asm/sections.h b/arch/um/include/asm/sections.h new file mode 100644 index 000000000000..a3c1fb6ed6ad --- /dev/null +++ b/arch/um/include/asm/sections.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_SECTIONS_H +#define __UM_SECTIONS_H + +#include <asm-generic/sections.h> + +extern char __binary_start[]; +extern char __syscall_stub_start[], __syscall_stub_end[]; + +#endif diff --git a/arch/um/include/asm/setup.h b/arch/um/include/asm/setup.h index 99f086301f4c..80ada899f254 100644 --- a/arch/um/include/asm/setup.h +++ b/arch/um/include/asm/setup.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef SETUP_H_INCLUDED #define SETUP_H_INCLUDED diff --git a/arch/um/include/asm/smp.h b/arch/um/include/asm/smp.h index e4507938d8cf..be1743a6ff3c 100644 --- a/arch/um/include/asm/smp.h +++ b/arch/um/include/asm/smp.h @@ -1,32 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_SMP_H #define __UM_SMP_H -#ifdef CONFIG_SMP +#if IS_ENABLED(CONFIG_SMP) -#include <linux/bitops.h> -#include <asm/current.h> #include <linux/cpumask.h> +#include <shared/smp.h> -#define raw_smp_processor_id() (current_thread->cpu) +#define raw_smp_processor_id() uml_curr_cpu() -#define cpu_logical_map(n) (n) -#define cpu_number_map(n) (n) -extern int hard_smp_processor_id(void); -#define NO_PROC_ID -1 +void arch_smp_send_reschedule(int cpu); -extern int ncpus; +void arch_send_call_function_single_ipi(int cpu); +void arch_send_call_function_ipi_mask(const struct cpumask *mask); -static inline void smp_cpus_done(unsigned int maxcpus) -{ -} - -extern struct task_struct *idle_threads[NR_CPUS]; - -#else - -#define hard_smp_processor_id() 0 - -#endif +#endif /* CONFIG_SMP */ #endif diff --git a/arch/um/include/asm/stacktrace.h b/arch/um/include/asm/stacktrace.h new file mode 100644 index 000000000000..436b55952c3a --- /dev/null +++ b/arch/um/include/asm/stacktrace.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UML_STACKTRACE_H +#define _ASM_UML_STACKTRACE_H + +#include <linux/uaccess.h> +#include <linux/ptrace.h> + +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +struct stacktrace_ops { + void (*address)(void *data, unsigned long address, int reliable); +}; + +#ifdef CONFIG_FRAME_POINTER +static inline unsigned long +get_frame_pointer(struct task_struct *task, struct pt_regs *segv_regs) +{ + if (!task || task == current) + return segv_regs ? PT_REGS_BP(segv_regs) : current_bp(); + return KSTK_EBP(task); +} +#else +static inline unsigned long +get_frame_pointer(struct task_struct *task, struct pt_regs *segv_regs) +{ + return 0; +} +#endif + +static inline unsigned long +*get_stack_pointer(struct task_struct *task, struct pt_regs *segv_regs) +{ + if (!task || task == current) + return segv_regs ? (unsigned long *)PT_REGS_SP(segv_regs) : current_sp(); + return (unsigned long *)KSTK_ESP(task); +} + +void dump_trace(struct task_struct *tsk, const struct stacktrace_ops *ops, void *data); + +#endif /* _ASM_UML_STACKTRACE_H */ diff --git a/arch/um/include/asm/syscall-generic.h b/arch/um/include/asm/syscall-generic.h new file mode 100644 index 000000000000..bcd73bcfe577 --- /dev/null +++ b/arch/um/include/asm/syscall-generic.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Access to user system call parameters and results + * + * See asm-generic/syscall.h for function descriptions. + * + * Copyright (C) 2015 Mickaël Salaün <mic@digikod.net> + */ + +#ifndef __UM_SYSCALL_GENERIC_H +#define __UM_SYSCALL_GENERIC_H + +#include <asm/ptrace.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <sysdep/ptrace.h> + +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + + return PT_REGS_SYSCALL_NR(regs); +} + +static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + PT_REGS_SYSCALL_NR(regs) = nr; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + /* do nothing */ +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + const long error = regs_return_value(regs); + + return IS_ERR_VALUE(error) ? error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs_return_value(regs); +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + PT_REGS_SET_SYSCALL_RETURN(regs, (long) error ?: val); +} + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + const struct uml_pt_regs *r = ®s->regs; + + *args++ = UPT_SYSCALL_ARG1(r); + *args++ = UPT_SYSCALL_ARG2(r); + *args++ = UPT_SYSCALL_ARG3(r); + *args++ = UPT_SYSCALL_ARG4(r); + *args++ = UPT_SYSCALL_ARG5(r); + *args = UPT_SYSCALL_ARG6(r); +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + struct uml_pt_regs *r = ®s->regs; + + UPT_SYSCALL_ARG1(r) = *args++; + UPT_SYSCALL_ARG2(r) = *args++; + UPT_SYSCALL_ARG3(r) = *args++; + UPT_SYSCALL_ARG4(r) = *args++; + UPT_SYSCALL_ARG5(r) = *args++; + UPT_SYSCALL_ARG6(r) = *args; +} + +/* See arch/x86/um/asm/syscall.h for syscall_get_arch() definition. */ + +#endif /* __UM_SYSCALL_GENERIC_H */ diff --git a/arch/um/include/asm/sysrq.h b/arch/um/include/asm/sysrq.h deleted file mode 100644 index c8d332b56b98..000000000000 --- a/arch/um/include/asm/sysrq.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __UM_SYSRQ_H -#define __UM_SYSRQ_H - -struct task_struct; -extern void show_trace(struct task_struct* task, unsigned long *stack); - -#endif diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 2c8eeb2df8b4..7a6f4dc99fa1 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -1,80 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __UM_THREAD_INFO_H #define __UM_THREAD_INFO_H -#ifndef __ASSEMBLY__ +#define THREAD_SIZE_ORDER CONFIG_KERNEL_STACK_ORDER +#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) + +#ifndef __ASSEMBLER__ #include <asm/types.h> #include <asm/page.h> -#include <asm/uaccess.h> +#include <asm/segment.h> +#include <sysdep/ptrace_user.h> struct thread_info { - struct task_struct *task; /* main task structure */ - struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - mm_segment_t addr_limit; /* thread address space: - 0-0xBFFFFFFF for user - 0-0xFFFFFFFF for kernel */ - struct restart_block restart_block; - struct thread_info *real_thread; /* Points to non-IRQ stack */ }; #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ - .real_thread = NULL, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - -#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) -/* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - struct thread_info *ti; - unsigned long mask = THREAD_SIZE - 1; - void *p; - - asm volatile ("" : "=r" (p) : "0" (&ti)); - ti = (struct thread_info *) (((unsigned long)p) & ~mask); - return ti; -} - -#define THREAD_SIZE_ORDER CONFIG_KERNEL_STACK_ORDER - #endif -#define PREEMPT_ACTIVE 0x10000000 - #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_NOTIFY_SIGNAL 3 /* signal notifications exist */ #define TIF_RESTART_BLOCK 4 #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_SYSCALL_AUDIT 6 #define TIF_RESTORE_SIGMASK 7 #define TIF_NOTIFY_RESUME 8 +#define TIF_SECCOMP 9 /* secure computing */ +#define TIF_SINGLESTEP 10 /* single stepping userspace */ +#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */ + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) + +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_NOTIFY_RESUME) #endif diff --git a/arch/um/include/asm/timex.h b/arch/um/include/asm/timex.h index 0f4ada08f748..9f27176adb26 100644 --- a/arch/um/include/asm/timex.h +++ b/arch/um/include/asm/timex.h @@ -1,13 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_TIMEX_H #define __UM_TIMEX_H -typedef unsigned long cycles_t; - -static inline cycles_t get_cycles (void) -{ - return 0; -} - #define CLOCK_TICK_RATE (HZ) +#include <asm-generic/timex.h> + #endif diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 4febacd1a8a1..0422467bda5b 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -1,120 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_TLB_H #define __UM_TLB_H -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <asm/percpu.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - -/* struct mmu_gather is an opaque type used by the mm code for passing around - * any data needed by arch specific code for tlb_remove_page. - */ -struct mmu_gather { - struct mm_struct *mm; - unsigned int need_flush; /* Really unmapped some ptes? */ - unsigned long start; - unsigned long end; - unsigned int fullmm; /* non-zero means full mm flush */ -}; - -static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, - unsigned long address) -{ - if (tlb->start > address) - tlb->start = address; - if (tlb->end < address + PAGE_SIZE) - tlb->end = address + PAGE_SIZE; -} - -static inline void init_tlb_gather(struct mmu_gather *tlb) -{ - tlb->need_flush = 0; - - tlb->start = TASK_SIZE; - tlb->end = 0; - - if (tlb->fullmm) { - tlb->start = 0; - tlb->end = TASK_SIZE; - } -} - -static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) -{ - tlb->mm = mm; - tlb->fullmm = full_mm_flush; - - init_tlb_gather(tlb); -} - -extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end); - -static inline void -tlb_flush_mmu(struct mmu_gather *tlb) -{ - if (!tlb->need_flush) - return; +#include <linux/mm.h> - flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end); - init_tlb_gather(tlb); -} - -/* tlb_finish_mmu - * Called at the end of the shootdown operation to free up any resources - * that were required. - */ -static inline void -tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -/* tlb_remove_page - * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), - * while handling the additional races in SMP caused by other CPUs - * caching valid mappings in their TLBs. - */ -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - free_page_and_swap_cache(page); - return 1; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - __tlb_remove_page(tlb, page); -} - -/** - * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. - * - * Record the fact that pte's were really umapped in ->need_flush, so we can - * later optimise away the tlb invalidate. This helps when userspace is - * unmapping already-unmapped pages, which happens quite a lot. - */ -#define tlb_remove_tlb_entry(tlb, ptep, address) \ - do { \ - tlb->need_flush = 1; \ - __tlb_remove_tlb_entry(tlb, ptep, address); \ - } while (0) - -#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) - -#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) - -#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) - -#define tlb_migrate_finish(mm) do {} while (0) +#include <asm/tlbflush.h> +#include <asm/cacheflush.h> +#include <asm-generic/tlb.h> #endif diff --git a/arch/um/include/asm/tlbflush.h b/arch/um/include/asm/tlbflush.h index 614f2c091178..13a3009942be 100644 --- a/arch/um/include/asm/tlbflush.h +++ b/arch/um/include/asm/tlbflush.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __UM_TLBFLUSH_H @@ -9,23 +9,51 @@ #include <linux/mm.h> /* - * TLB flushing: + * In UML, we need to sync the TLB over by using mmap/munmap syscalls from + * the process handling the MM (which can be the kernel itself). + * + * To track updates, we can hook into set_ptes and flush_tlb_*. With set_ptes + * we catch all PTE transitions where memory that was unusable becomes usable. + * While with flush_tlb_* we can track any memory that becomes unusable and + * even if a higher layer of the page table was modified. + * + * So, we simply track updates using both methods and mark the memory area to + * be synced later on. The only special case is that flush_tlb_kern_* needs to + * be executed immediately as there is no good synchronization point in that + * case. In contrast, in the set_ptes case we can wait for the next kernel + * segfault before we do the synchornization. * - * - flush_tlb() flushes the current mm struct TLBs * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_kernel_vm() flushes the kernel vm area * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages */ +extern int um_tlb_sync(struct mm_struct *mm); + extern void flush_tlb_all(void); extern void flush_tlb_mm(struct mm_struct *mm); -extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end); -extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long address); -extern void flush_tlb_kernel_vm(void); -extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -extern void __flush_tlb_one(unsigned long addr); + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long address) +{ + um_tlb_mark_sync(vma->vm_mm, address, address + PAGE_SIZE); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + um_tlb_mark_sync(vma->vm_mm, start, end); +} + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + um_tlb_mark_sync(&init_mm, start, end); + + /* Kernel needs to be synced immediately */ + um_tlb_sync(&init_mm); +} #endif diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 3f22fbf7ca1d..0df9ea4abda8 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -1,178 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL + * Copyright (C) 2015 Richard Weinberger (richard@nod.at) */ #ifndef __UM_UACCESS_H #define __UM_UACCESS_H -/* thread_info has a mm_segment_t in it, so put the definition up here */ -typedef struct { - unsigned long seg; -} mm_segment_t; - -#include <linux/thread_info.h> -#include <linux/errno.h> -#include <asm/processor.h> #include <asm/elf.h> - -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - */ - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) -#define USER_DS MAKE_MM_SEG(TASK_SIZE) - -#define get_ds() (KERNEL_DS) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) - -#define segment_eq(a, b) ((a).seg == (b).seg) +#include <linux/unaligned.h> +#include <sysdep/faultinfo.h> #define __under_task_size(addr, size) \ (((unsigned long) (addr) < TASK_SIZE) && \ (((unsigned long) (addr) + (size)) < TASK_SIZE)) -#define __access_ok_vsyscall(type, addr, size) \ - ((type == VERIFY_READ) && \ - ((unsigned long) (addr) >= FIXADDR_USER_START) && \ - ((unsigned long) (addr) + (size) <= FIXADDR_USER_END) && \ - ((unsigned long) (addr) + (size) >= (unsigned long)(addr))) - #define __addr_range_nowrap(addr, size) \ ((unsigned long) (addr) <= ((unsigned long) (addr) + (size))) -#define access_ok(type, addr, size) \ - (__addr_range_nowrap(addr, size) && \ - (__under_task_size(addr, size) || \ - __access_ok_vsyscall(type, addr, size) || \ - segment_eq(get_fs(), KERNEL_DS))) - -extern int copy_from_user(void *to, const void __user *from, int n); -extern int copy_to_user(void __user *to, const void *from, int n); - -/* - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ - -extern int strncpy_from_user(char *dst, const char __user *src, int count); +extern unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __clear_user(void __user *mem, unsigned long len); +static inline int __access_ok(const void __user *ptr, unsigned long size); -/* - * __clear_user: - Zero a block of memory in user space, with less checking. - * @to: Destination address, in user space. - * @n: Number of bytes to zero. - * - * Zero a block of memory in user space. Caller must check - * the specified block with access_ok() before calling this function. - * - * Returns number of bytes that could not be cleared. - * On success, this will be zero. - */ -extern int __clear_user(void __user *mem, int len); - -/* - * clear_user: - Zero a block of memory in user space. - * @to: Destination address, in user space. - * @n: Number of bytes to zero. - * - * Zero a block of memory in user space. - * - * Returns number of bytes that could not be cleared. - * On success, this will be zero. - */ -extern int clear_user(void __user *mem, int len); +/* Teach asm-generic/uaccess.h that we have C functions for these. */ +#define __access_ok __access_ok +#define __clear_user __clear_user -/* - * strlen_user: - Get the size of a string in user space. - * @str: The string to measure. - * @n: The maximum valid length - * - * Get the size of a NUL-terminated string in user space. - * - * Returns the size of the string INCLUDING the terminating NUL. - * On exception, returns 0. - * If the string is too long, returns a value greater than @n. - */ -extern int strnlen_user(const void __user *str, int len); +#define INLINE_COPY_FROM_USER +#define INLINE_COPY_TO_USER -#define __copy_from_user(to, from, n) copy_from_user(to, from, n) +#include <asm-generic/uaccess.h> -#define __copy_to_user(to, from, n) copy_to_user(to, from, n) - -#define __copy_to_user_inatomic __copy_to_user -#define __copy_from_user_inatomic __copy_from_user - -#define __get_user(x, ptr) \ -({ \ - const __typeof__(*(ptr)) __user *__private_ptr = (ptr); \ - __typeof__(x) __private_val; \ - int __private_ret = -EFAULT; \ - (x) = (__typeof__(*(__private_ptr)))0; \ - if (__copy_from_user((__force void *)&__private_val, (__private_ptr),\ - sizeof(*(__private_ptr))) == 0) { \ - (x) = (__typeof__(*(__private_ptr))) __private_val; \ - __private_ret = 0; \ - } \ - __private_ret; \ -}) - -#define get_user(x, ptr) \ -({ \ - const __typeof__((*(ptr))) __user *private_ptr = (ptr); \ - (access_ok(VERIFY_READ, private_ptr, sizeof(*private_ptr)) ? \ - __get_user(x, private_ptr) : ((x) = (__typeof__(*ptr))0, -EFAULT)); \ -}) - -#define __put_user(x, ptr) \ -({ \ - __typeof__(*(ptr)) __user *__private_ptr = ptr; \ - __typeof__(*(__private_ptr)) __private_val; \ - int __private_ret = -EFAULT; \ - __private_val = (__typeof__(*(__private_ptr))) (x); \ - if (__copy_to_user((__private_ptr), &__private_val, \ - sizeof(*(__private_ptr))) == 0) { \ - __private_ret = 0; \ - } \ - __private_ret; \ -}) - -#define put_user(x, ptr) \ -({ \ - __typeof__(*(ptr)) __user *private_ptr = (ptr); \ - (access_ok(VERIFY_WRITE, private_ptr, sizeof(*private_ptr)) ? \ - __put_user(x, private_ptr) : -EFAULT); \ -}) - -#define strlen_user(str) strnlen_user(str, ~0U >> 1) - -struct exception_table_entry +static inline int __access_ok(const void __user *ptr, unsigned long size) { - unsigned long insn; - unsigned long fixup; -}; + unsigned long addr = (unsigned long)ptr; + return __addr_range_nowrap(addr, size) && __under_task_size(addr, size); +} + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) { \ + *((type *)dst) = (type) 0; \ + goto err_label; \ + } \ + *((type *)dst) = get_unaligned((type *)(src)); \ + barrier(); \ + current->thread.segv_continue = NULL; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) \ + goto err_label; \ + put_unaligned(*((type *)src), (type *)(dst)); \ + barrier(); \ + current->thread.segv_continue = NULL; \ +} while (0) #endif diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h new file mode 100644 index 000000000000..7ffa5437b761 --- /dev/null +++ b/arch/um/include/asm/unwind.h @@ -0,0 +1,8 @@ +#ifndef _ASM_UML_UNWIND_H +#define _ASM_UML_UNWIND_H + +static inline void +unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} + +#endif /* _ASM_UML_UNWIND_H */ diff --git a/arch/um/include/asm/vmalloc.h b/arch/um/include/asm/vmalloc.h new file mode 100644 index 000000000000..9a7b9ed93733 --- /dev/null +++ b/arch/um/include/asm/vmalloc.h @@ -0,0 +1,4 @@ +#ifndef _ASM_UM_VMALLOC_H +#define _ASM_UM_VMALLOC_H + +#endif /* _ASM_UM_VMALLOC_H */ diff --git a/arch/um/include/asm/vmlinux.lds.h b/arch/um/include/asm/vmlinux.lds.h new file mode 100644 index 000000000000..149494ae78ea --- /dev/null +++ b/arch/um/include/asm/vmlinux.lds.h @@ -0,0 +1,2 @@ +#include <asm/thread_info.h> +#include <asm-generic/vmlinux.lds.h> diff --git a/arch/um/include/asm/xor.h b/arch/um/include/asm/xor.h new file mode 100644 index 000000000000..647fae200c5d --- /dev/null +++ b/arch/um/include/asm/xor.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UM_XOR_H +#define _ASM_UM_XOR_H + +#ifdef CONFIG_64BIT +#undef CONFIG_X86_32 +#define TT_CPU_INF_XOR_DEFAULT (AVX_SELECT(&xor_block_sse_pf64)) +#else +#define CONFIG_X86_32 1 +#define TT_CPU_INF_XOR_DEFAULT (AVX_SELECT(&xor_block_8regs)) +#endif + +#include <asm/cpufeature.h> +#include <../../x86/include/asm/xor.h> +#include <linux/time-internal.h> + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#undef XOR_SELECT_TEMPLATE +/* pick an arbitrary one - measuring isn't possible with inf-cpu */ +#define XOR_SELECT_TEMPLATE(x) \ + (time_travel_mode == TT_MODE_INFCPU ? TT_CPU_INF_XOR_DEFAULT : x) +#endif + +#endif diff --git a/arch/um/include/linux/smp-internal.h b/arch/um/include/linux/smp-internal.h new file mode 100644 index 000000000000..1dbcbc23f9c9 --- /dev/null +++ b/arch/um/include/linux/smp-internal.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_SMP_INTERNAL_H +#define __UM_SMP_INTERNAL_H + +#if IS_ENABLED(CONFIG_SMP) + +void prefill_possible_map(void); + +#else /* !CONFIG_SMP */ + +static inline void prefill_possible_map(void) { } + +#endif /* CONFIG_SMP */ + +extern char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE); + +#endif /* __UM_SMP_INTERNAL_H */ diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h new file mode 100644 index 000000000000..c274eb5ad55e --- /dev/null +++ b/arch/um/include/linux/time-internal.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 - 2014 Cisco Systems + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#ifndef __TIMER_INTERNAL_H__ +#define __TIMER_INTERNAL_H__ +#include <linux/list.h> +#include <asm/bug.h> +#include <shared/timetravel.h> + +#define TIMER_MULTIPLIER 256 +#define TIMER_MIN_DELTA 500 + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +struct time_travel_event { + unsigned long long time; + void (*fn)(struct time_travel_event *d); + struct list_head list; + bool pending, onstack; +}; + +void time_travel_sleep(void); + +static inline void +time_travel_set_event_fn(struct time_travel_event *e, + void (*fn)(struct time_travel_event *d)) +{ + e->fn = fn; +} + +void __time_travel_propagate_time(void); + +static inline void time_travel_propagate_time(void) +{ + if (time_travel_mode == TT_MODE_EXTERNAL) + __time_travel_propagate_time(); +} + +void __time_travel_wait_readable(int fd); + +static inline void time_travel_wait_readable(int fd) +{ + if (time_travel_mode == TT_MODE_EXTERNAL) + __time_travel_wait_readable(fd); +} + +void time_travel_add_irq_event(struct time_travel_event *e); +void time_travel_add_event_rel(struct time_travel_event *e, + unsigned long long delay_ns); +bool time_travel_del_event(struct time_travel_event *e); +#else +struct time_travel_event { +}; + +static inline void time_travel_sleep(void) +{ +} + +/* this is a macro so the event/function need not exist */ +#define time_travel_set_event_fn(e, fn) do {} while (0) + +static inline void time_travel_propagate_time(void) +{ +} + +static inline void time_travel_wait_readable(int fd) +{ +} + +static inline void time_travel_add_irq_event(struct time_travel_event *e) +{ + WARN_ON(1); +} + +/* + * not inlines so the data structure need not exist, + * cause linker failures + */ +extern void time_travel_not_configured(void); +#define time_travel_add_event_rel(...) time_travel_not_configured() +#define time_travel_del_event(...) time_travel_not_configured() +#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ + +extern unsigned long tt_extra_sched_jiffies; + +/* + * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used, + * which is intentional since we really shouldn't link it in that case. + */ +void time_travel_ndelay(unsigned long nsec); + +int um_setup_timer(void); + +#endif /* __TIMER_INTERNAL_H__ */ diff --git a/arch/um/include/linux/virtio-uml.h b/arch/um/include/linux/virtio-uml.h new file mode 100644 index 000000000000..2f652fa90f04 --- /dev/null +++ b/arch/um/include/linux/virtio-uml.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ + +#ifndef __VIRTIO_UML_H__ +#define __VIRTIO_UML_H__ + +void virtio_uml_set_no_vq_suspend(struct virtio_device *vdev, + bool no_vq_suspend); + +#endif /* __VIRTIO_UML_H__ */ diff --git a/arch/um/include/shared/aio.h b/arch/um/include/shared/aio.h deleted file mode 100644 index 423bae9153f8..000000000000 --- a/arch/um/include/shared/aio.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2004 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef AIO_H__ -#define AIO_H__ - -enum aio_type { AIO_READ, AIO_WRITE, AIO_MMAP }; - -struct aio_thread_reply { - void *data; - int err; -}; - -struct aio_context { - int reply_fd; - struct aio_context *next; -}; - -#define INIT_AIO_CONTEXT { .reply_fd = -1, \ - .next = NULL } - -extern int submit_aio(enum aio_type type, int fd, char *buf, int len, - unsigned long long offset, int reply_fd, - struct aio_context *aio); - -#endif diff --git a/arch/um/include/shared/arch.h b/arch/um/include/shared/arch.h index 4f46abda060d..cc398a21ad96 100644 --- a/arch/um/include/shared/arch.h +++ b/arch/um/include/shared/arch.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __ARCH_H__ @@ -12,4 +12,6 @@ extern void arch_check_bugs(void); extern int arch_fixup(unsigned long address, struct uml_pt_regs *regs); extern void arch_examine_signal(int sig, struct uml_pt_regs *regs); +void mc_set_rip(void *_mc, void *target); + #endif diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index 694c792bab4e..02ef258e3395 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __START_H__ @@ -20,48 +20,37 @@ * 'UL' and other type specifiers unilaterally. We * use the following macros to deal with this. */ +#define STUB_START stub_start +#define STUB_CODE STUB_START +#define STUB_DATA (STUB_CODE + UM_KERN_PAGE_SIZE) +#define STUB_DATA_PAGES 2 +#define STUB_SIZE ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE) +#define STUB_END (STUB_START + STUB_SIZE) -#ifdef __ASSEMBLY__ -#define _UML_AC(X, Y) (Y) -#else -#define __UML_AC(X, Y) (X(Y)) -#define _UML_AC(X, Y) __UML_AC(X, Y) -#endif - -#define STUB_START _UML_AC(, 0x100000) -#define STUB_CODE _UML_AC((unsigned long), STUB_START) -#define STUB_DATA _UML_AC((unsigned long), STUB_CODE + UM_KERN_PAGE_SIZE) -#define STUB_END _UML_AC((unsigned long), STUB_DATA + UM_KERN_PAGE_SIZE) - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <sysdep/ptrace.h> -struct cpu_task { - int pid; - void *task; -}; +struct task_struct; +extern struct task_struct *cpu_tasks[]; -extern struct cpu_task cpu_tasks[]; +extern unsigned long long physmem_size; -extern unsigned long low_physmem; extern unsigned long high_physmem; extern unsigned long uml_physmem; extern unsigned long uml_reserved; extern unsigned long end_vm; extern unsigned long start_vm; -extern unsigned long long highmem; -extern unsigned long _stext, _etext, _sdata, _edata, __bss_start, _end; -extern unsigned long _unprotected_end; extern unsigned long brk_start; -extern unsigned long host_task_size; +extern unsigned long stub_start; -extern int linux_main(int argc, char **argv); +extern int linux_main(int argc, char **argv, char **envp); +extern void uml_finishsetup(void); struct siginfo; -extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *); +extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *, void *); #endif diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h deleted file mode 100644 index c92306809029..000000000000 --- a/arch/um/include/shared/common-offsets.h +++ /dev/null @@ -1,41 +0,0 @@ -/* for use by sys-$SUBARCH/kernel-offsets.c */ - -DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE); - -DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE); -DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK); -DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT); -DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); - -DEFINE(UM_ELF_CLASS, ELF_CLASS); -DEFINE(UM_ELFCLASS32, ELFCLASS32); -DEFINE(UM_ELFCLASS64, ELFCLASS64); - -DEFINE(UM_NR_CPUS, NR_CPUS); - -DEFINE(UM_GFP_KERNEL, GFP_KERNEL); -DEFINE(UM_GFP_ATOMIC, GFP_ATOMIC); - -/* For crypto assembler code. */ -DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); - -DEFINE(UM_THREAD_SIZE, THREAD_SIZE); - -DEFINE(UM_HZ, HZ); - -DEFINE(UM_USEC_PER_SEC, USEC_PER_SEC); -DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); -DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); - -#ifdef CONFIG_PRINTK -DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); -#endif -#ifdef CONFIG_NO_HZ_COMMON -DEFINE(UML_CONFIG_NO_HZ_COMMON, CONFIG_NO_HZ_COMMON); -#endif -#ifdef CONFIG_UML_X86 -DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); -#endif -#ifdef CONFIG_64BIT -DEFINE(UML_CONFIG_64BIT, CONFIG_64BIT); -#endif diff --git a/arch/um/include/shared/elf_user.h b/arch/um/include/shared/elf_user.h index 53516b637272..fd461ee40c05 100644 --- a/arch/um/include/shared/elf_user.h +++ b/arch/um/include/shared/elf_user.h @@ -1,7 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2004 Fujitsu Siemens Computers GmbH * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com> - * Licensed under the GPL */ #ifndef __ELF_USER_H__ diff --git a/arch/um/include/shared/frame_kern.h b/arch/um/include/shared/frame_kern.h index e584e40ee832..ed952ac661ca 100644 --- a/arch/um/include/shared/frame_kern.h +++ b/arch/um/include/shared/frame_kern.h @@ -1,19 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __FRAME_KERN_H_ #define __FRAME_KERN_H_ -extern int setup_signal_stack_sc(unsigned long stack_top, int sig, - struct k_sigaction *ka, - struct pt_regs *regs, - sigset_t *mask); -extern int setup_signal_stack_si(unsigned long stack_top, int sig, - struct k_sigaction *ka, - struct pt_regs *regs, siginfo_t *info, - sigset_t *mask); +extern int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *mask); +extern int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *mask); #endif diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h index b3906f860a87..1a659e2e8cc3 100644 --- a/arch/um/include/shared/init.h +++ b/arch/um/include/shared/init.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UML_INIT_H #define _LINUX_UML_INIT_H @@ -40,39 +41,19 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -#ifndef __KERNEL__ -#ifndef __section -# define __section(S) __attribute__ ((__section__(#S))) -#endif - -#if __GNUC__ == 3 - -#if __GNUC_MINOR__ >= 3 -# define __used __attribute__((__used__)) -#else -# define __used __attribute__((__unused__)) -#endif - -#else -#if __GNUC__ == 4 -# define __used __attribute__((__used__)) -#endif -#endif +#include <linux/compiler_types.h> -#else -#include <linux/compiler.h> -#endif /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) -#define __initdata __section(.init.data) -#define __exitdata __section(.exit.data) -#define __exit_call __used __section(.exitcall.exit) +#define __init __section(".init.text") +#define __initdata __section(".init.data") +#define __exitdata __section(".exit.data") +#define __exit_call __used __section(".exitcall.exit") #ifdef MODULE -#define __exit __section(.exit.text) +#define __exit __section(".exit.text") #else -#define __exit __used __section(.exit.text) +#define __exit __used __section(".exit.text") #endif #endif @@ -83,14 +64,10 @@ struct uml_param { int (*setup_func)(char *, int *); }; -extern initcall_t __uml_initcall_start, __uml_initcall_end; extern initcall_t __uml_postsetup_start, __uml_postsetup_end; extern const char *__uml_help_start, *__uml_help_end; #endif -#define __uml_initcall(fn) \ - static initcall_t __uml_initcall_##fn __uml_init_call = fn - #define __uml_exitcall(fn) \ static exitcall_t __uml_exitcall_##fn __uml_exit_call = fn @@ -125,13 +102,12 @@ extern struct uml_param __uml_setup_start, __uml_setup_end; * Mark functions and data as being only used at initialization * or exit time. */ -#define __uml_init_setup __used __section(.uml.setup.init) -#define __uml_setup_help __used __section(.uml.help.init) -#define __uml_init_call __used __section(.uml.initcall.init) -#define __uml_postsetup_call __used __section(.uml.postsetup.init) -#define __uml_exit_call __used __section(.uml.exitcall.exit) +#define __uml_init_setup __used __section(".uml.setup.init") +#define __uml_setup_help __used __section(".uml.help.init") +#define __uml_postsetup_call __used __section(".uml.postsetup.init") +#define __uml_exit_call __used __section(".uml.exitcall.exit") -#ifndef __KERNEL__ +#ifdef __UM_HOST__ #define __define_initcall(level,fn) \ static initcall_t __initcall_##fn __used \ @@ -144,7 +120,7 @@ extern struct uml_param __uml_setup_start, __uml_setup_end; #define __exitcall(fn) static exitcall_t __exitcall_##fn __exit_call = fn -#define __init_call __used __section(.initcall.init) +#define __init_call __used __section(".initcall.init") #endif diff --git a/arch/um/include/shared/irq_kern.h b/arch/um/include/shared/irq_kern.h index e05bd667de15..44357fa6ee29 100644 --- a/arch/um/include/shared/irq_kern.h +++ b/arch/um/include/shared/irq_kern.h @@ -1,18 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __IRQ_KERN_H__ #define __IRQ_KERN_H__ #include <linux/interrupt.h> +#include <linux/time-internal.h> #include <asm/ptrace.h> +#include "irq_user.h" -extern int um_request_irq(unsigned int irq, int fd, int type, - irq_handler_t handler, - unsigned long irqflags, const char * devname, - void *dev_id); -void um_free_irq(unsigned int irq, void *dev); +#define UM_IRQ_ALLOC -1 + +int um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id); + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +/** + * um_request_irq_tt - request an IRQ with timetravel handler + * + * @irq: the IRQ number, or %UM_IRQ_ALLOC + * @fd: The file descriptor to request an IRQ for + * @type: read or write + * @handler: the (generic style) IRQ handler + * @irqflags: Linux IRQ flags + * @devname: name for this to show + * @dev_id: data pointer to pass to the IRQ handler + * @timetravel_handler: the timetravel interrupt handler, invoked with the IRQ + * number, fd, dev_id and time-travel event pointer. + * + * Returns: The interrupt number assigned or a negative error. + * + * Note that the timetravel handler is invoked only if the time_travel_mode is + * %TT_MODE_EXTERNAL, and then it is invoked even while the system is suspended! + * This function must call time_travel_add_irq_event() for the event passed with + * an appropriate delay, before sending an ACK on the socket it was invoked for. + * + * If this was called while the system is suspended, then adding the event will + * cause the system to resume. + * + * Since this function will almost certainly have to handle the FD's condition, + * a read will consume the message, and after that it is up to the code using + * it to pass such a message to the @handler in whichever way it can. + * + * If time_travel_mode is not %TT_MODE_EXTERNAL the @timetravel_handler will + * not be invoked at all and the @handler must handle the FD becoming + * readable (or writable) instead. Use um_irq_timetravel_handler_used() to + * distinguish these cases. + * + * See virtio_uml.c for an example. + */ +int um_request_irq_tt(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)); +#else +static inline +int um_request_irq_tt(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) +{ + return um_request_irq(irq, fd, type, handler, irqflags, + devname, dev_id); +} #endif +static inline bool um_irq_timetravel_handler_used(void) +{ + return time_travel_mode == TT_MODE_EXTERNAL; +} + +void um_free_irq(int irq, void *dev_id); +void free_irqs(void); +#endif diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index df5633053957..746abc24a5d5 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __IRQ_USER_H__ @@ -8,24 +8,20 @@ #include <sysdep/ptrace.h> -struct irq_fd { - struct irq_fd *next; - void *id; - int fd; - int type; - int irq; - int events; - int current_events; +enum um_irq_type { + IRQ_READ, + IRQ_WRITE, + NUM_IRQ_TYPES, }; -enum { IRQ_READ, IRQ_WRITE }; - struct siginfo; -extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void sigio_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc); +extern void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc); +void sigio_run_timetravel_handlers(void); extern void free_irq_by_fd(int fd); -extern void reactivate_fd(int fd, int irqnum); extern void deactivate_fd(int fd, int irqnum); extern int deactivate_all_fds(void); -extern int activate_ipi(int fd, int pid); #endif diff --git a/arch/um/include/shared/kern.h b/arch/um/include/shared/kern.h index 6cd01240bbf0..3a9c75a8413c 100644 --- a/arch/um/include/shared/kern.h +++ b/arch/um/include/shared/kern.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __KERN_H__ diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 83a91f976330..38321188c04c 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __KERN_UTIL_H__ @@ -13,31 +13,30 @@ struct siginfo; extern int uml_exitcode; -extern int ncpus; extern int kmalloc_ok; -#define UML_ROUND_UP(addr) \ - ((((unsigned long) addr) + PAGE_SIZE - 1) & PAGE_MASK) - extern unsigned long alloc_stack(int order, int atomic); extern void free_stack(unsigned long stack, int order); -extern int do_signal(void); +struct pt_regs; +extern void do_signal(struct pt_regs *regs); extern void interrupt_end(void); -extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); +extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc); extern unsigned long segv(struct faultinfo fi, unsigned long ip, - int is_user, struct uml_pt_regs *regs); + int is_user, struct uml_pt_regs *regs, + void *mc); extern int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out); extern unsigned int do_IRQ(int irq, struct uml_pt_regs *regs); -extern int smp_sigio_handler(void); extern void initial_thread_cb(void (*proc)(void *), void *arg); -extern int is_syscall(unsigned long addr); extern void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void uml_pm_wake(void); + extern int start_uml(void); extern void paging_init(void); @@ -48,22 +47,25 @@ extern void do_uml_exitcalls(void); * Are we disallowed to sleep? Used to choose between GFP_KERNEL and * GFP_ATOMIC. */ -extern int __cant_sleep(void); +extern int __uml_cant_sleep(void); extern int get_current_pid(void); extern int copy_from_user_proc(void *to, void *from, int size); -extern int cpu(void); extern char *uml_strdup(const char *string); +int uml_need_resched(void); extern unsigned long to_irq_stack(unsigned long *mask_out); extern unsigned long from_irq_stack(int nested); -extern void syscall_trace(struct uml_pt_regs *regs, int entryexit); -extern int singlestepping(void *t); +extern int singlestepping(void); -extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); -extern void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs); -extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); +extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); extern void fatal_sigsegv(void) __attribute__ ((noreturn)); +void um_idle_sleep(void); + +void kasan_map_memory(void *start, size_t len); #endif diff --git a/arch/um/include/shared/longjmp.h b/arch/um/include/shared/longjmp.h index 9bdddf4c405b..c53e43d980c8 100644 --- a/arch/um/include/shared/longjmp.h +++ b/arch/um/include/shared/longjmp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UML_LONGJMP_H #define __UML_LONGJMP_H @@ -11,13 +12,12 @@ extern void longjmp(jmp_buf, int); longjmp(*buf, val); \ } while(0) -#define UML_SETJMP(buf) ({ \ - int n; \ - volatile int enable; \ - enable = get_signals(); \ - n = setjmp(*buf); \ - if(n != 0) \ - set_signals(enable); \ +#define UML_SETJMP(buf) ({ \ + int n, enable; \ + enable = um_get_signals(); \ + n = setjmp(*buf); \ + if(n != 0) \ + um_set_signals_trace(enable); \ n; }) #endif diff --git a/arch/um/include/shared/mem.h b/arch/um/include/shared/mem.h index 5cd40e99e8d5..98aacd544108 100644 --- a/arch/um/include/shared/mem.h +++ b/arch/um/include/shared/mem.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __MEM_H__ @@ -9,12 +9,12 @@ extern int phys_mapping(unsigned long phys, unsigned long long *offset_out); extern unsigned long uml_physmem; -static inline unsigned long to_phys(void *virt) +static inline unsigned long uml_to_phys(void *virt) { return(((unsigned long) virt) - uml_physmem); } -static inline void *to_virt(unsigned long phys) +static inline void *uml_to_virt(unsigned long phys) { return((void *) uml_physmem + phys); } diff --git a/arch/um/include/shared/mem_user.h b/arch/um/include/shared/mem_user.h index 46384acd547b..8a5b72872ff8 100644 --- a/arch/um/include/shared/mem_user.h +++ b/arch/um/include/shared/mem_user.h @@ -32,30 +32,10 @@ #ifndef _MEM_USER_H #define _MEM_USER_H -struct iomem_region { - struct iomem_region *next; - char *driver; - int fd; - int size; - unsigned long phys; - unsigned long virt; -}; - -extern struct iomem_region *iomem_regions; -extern int iomem_size; - #define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1)) -extern int init_mem_user(void); -extern void setup_memory(void *entry); -extern unsigned long find_iomem(char *driver, unsigned long *len_out); -extern int init_maps(unsigned long physmem, unsigned long iomem, - unsigned long highmem); -extern unsigned long get_vm(unsigned long len); extern void setup_physmem(unsigned long start, unsigned long usable, - unsigned long len, unsigned long long highmem); -extern void add_iomem(char *name, int fd, unsigned long size); -extern unsigned long phys_offset(unsigned long phys); + unsigned long len); extern void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x); diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h deleted file mode 100644 index 012ac87d4900..000000000000 --- a/arch/um/include/shared/net_kern.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __UM_NET_KERN_H -#define __UM_NET_KERN_H - -#include <linux/netdevice.h> -#include <linux/platform_device.h> -#include <linux/skbuff.h> -#include <linux/socket.h> -#include <linux/list.h> -#include <linux/workqueue.h> - -struct uml_net { - struct list_head list; - struct net_device *dev; - struct platform_device pdev; - int index; -}; - -struct uml_net_private { - struct list_head list; - spinlock_t lock; - struct net_device *dev; - struct timer_list tl; - - struct work_struct work; - int fd; - unsigned char mac[ETH_ALEN]; - int max_packet; - unsigned short (*protocol)(struct sk_buff *); - int (*open)(void *); - void (*close)(int, void *); - void (*remove)(void *); - int (*read)(int, struct sk_buff *skb, struct uml_net_private *); - int (*write)(int, struct sk_buff *skb, struct uml_net_private *); - - void (*add_address)(unsigned char *, unsigned char *, void *); - void (*delete_address)(unsigned char *, unsigned char *, void *); - char user[0]; -}; - -struct net_kern_info { - void (*init)(struct net_device *, void *); - unsigned short (*protocol)(struct sk_buff *); - int (*read)(int, struct sk_buff *skb, struct uml_net_private *); - int (*write)(int, struct sk_buff *skb, struct uml_net_private *); -}; - -struct transport { - struct list_head list; - const char *name; - int (* const setup)(char *, char **, void *); - const struct net_user_info *user; - const struct net_kern_info *kern; - const int private_size; - const int setup_size; -}; - -extern struct net_device *ether_init(int); -extern unsigned short ether_protocol(struct sk_buff *); -extern int tap_setup_common(char *str, char *type, char **dev_name, - char **mac_out, char **gate_addr); -extern void register_transport(struct transport *new); -extern unsigned short eth_protocol(struct sk_buff *skb); - -#endif diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h deleted file mode 100644 index 3dabbe128e40..000000000000 --- a/arch/um/include/shared/net_user.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __UM_NET_USER_H__ -#define __UM_NET_USER_H__ - -#define ETH_ADDR_LEN (6) -#define ETH_HEADER_ETHERTAP (16) -#define ETH_HEADER_OTHER (26) /* 14 for ethernet + VLAN + MPLS for crazy people */ -#define ETH_MAX_PACKET (1500) - -#define UML_NET_VERSION (4) - -struct net_user_info { - int (*init)(void *, void *); - int (*open)(void *); - void (*close)(int, void *); - void (*remove)(void *); - void (*add_address)(unsigned char *, unsigned char *, void *); - void (*delete_address)(unsigned char *, unsigned char *, void *); - int max_packet; - int mtu; -}; - -extern void ether_user_init(void *data, void *dev); -extern void iter_addresses(void *d, void (*cb)(unsigned char *, - unsigned char *, void *), - void *arg); - -extern void *get_output_buffer(int *len_out); -extern void free_output_buffer(void *buffer); - -extern int tap_open_common(void *dev, char *gate_addr); -extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr); - -extern void read_output(int fd, char *output_out, int len); - -extern int net_read(int fd, void *buf, int len); -extern int net_recvfrom(int fd, void *buf, int len); -extern int net_write(int fd, void *buf, int len); -extern int net_send(int fd, void *buf, int len); -extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len); - -extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg); -extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg); - -extern char *split_if_spec(char *str, ...); - -extern int dev_netmask(void *d, void *m); - -#endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 95feaa47a2fb..b26e94292fc1 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -1,15 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __OS_H__ #define __OS_H__ -#include <stdarg.h> #include <irq_user.h> #include <longjmp.h> #include <mm_id.h> +/* This is to get size_t */ +#ifndef __UM_HOST__ +#include <linux/types.h> +#else +#include <sys/types.h> +#endif #define CATCH_EINTR(expr) while ((errno = 0, ((expr) < 0)) && (errno == EINTR)) @@ -34,6 +41,8 @@ #define OS_LIB_PATH "/usr/lib/" #endif +#define OS_SENDMSG_MAX_FDS 8 + /* * types taken from stat_file() in hostfs_user.c * (if they are wrong here, they are wrong there...). @@ -134,15 +143,17 @@ extern int os_access(const char *file, int mode); extern int os_set_exec_close(int fd); extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg); extern int os_get_ifname(int fd, char *namebuf); -extern int os_set_slip(int fd); extern int os_mode_fd(int fd, int mode); extern int os_seek_file(int fd, unsigned long long offset); extern int os_open_file(const char *file, struct openflags flags, int mode); extern int os_read_file(int fd, void *buf, int len); extern int os_write_file(int fd, const void *buf, int count); +extern int os_sync_file(int fd); extern int os_file_size(const char *file, unsigned long long *size_out); -extern int os_file_modtime(const char *file, unsigned long *modtime); +extern int os_pread_file(int fd, void *buf, int len, unsigned long long offset); +extern int os_pwrite_file(int fd, const void *buf, int count, unsigned long long offset); +extern int os_file_modtime(const char *file, long long *modtime); extern int os_pipe(int *fd, int stream, int close_on_exec); extern int os_set_fd_async(int fd); extern int os_clear_fd_async(int fd); @@ -150,45 +161,48 @@ extern int os_set_fd_block(int fd, int blocking); extern int os_accept_connection(int fd); extern int os_create_unix_socket(const char *file, int len, int close_on_exec); extern int os_shutdown_socket(int fd, int r, int w); +extern int os_dup_file(int fd); extern void os_close_file(int fd); -extern int os_rcv_fd(int fd, int *helper_pid_out); -extern int create_unix_socket(char *file, int len, int close_on_exec); +ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, + void *data, size_t data_len); extern int os_connect_socket(const char *name); extern int os_file_type(char *file); extern int os_file_mode(const char *file, struct openflags *mode_out); extern int os_lock_file(int fd, int excl); extern void os_flush_stdout(void); -extern int os_stat_filesystem(char *path, long *bsize_out, - long long *blocks_out, long long *bfree_out, - long long *bavail_out, long long *files_out, - long long *ffree_out, void *fsid_out, - int fsid_size, long *namelen_out, - long *spare_out); -extern int os_change_dir(char *dir); -extern int os_fchange_dir(int fd); extern unsigned os_major(unsigned long long dev); extern unsigned os_minor(unsigned long long dev); extern unsigned long long os_makedev(unsigned major, unsigned minor); +extern int os_falloc_punch(int fd, unsigned long long offset, int count); +extern int os_falloc_zeroes(int fd, unsigned long long offset, int count); +extern int os_eventfd(unsigned int initval, int flags); +extern int os_sendmsg_fds(int fd, const void *buf, unsigned int len, + const int *fds, unsigned int fds_num); +int os_poll(unsigned int n, const int *fds); +void *os_mmap_rw_shared(int fd, size_t size); +void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size); /* start_up.c */ extern void os_early_checks(void); -extern void can_do_skas(void); extern void os_check_bugs(void); extern void check_host_supports_tls(int *supports_tls, int *tls_min); +extern void get_host_cpu_features( + void (*flags_helper_func)(char *line), + void (*cache_helper_func)(char *line)); /* mem.c */ extern int create_mem_file(unsigned long long len); +/* tlb.c */ +extern void report_enomem(void); + /* process.c */ -extern unsigned long os_process_pc(int pid); -extern int os_process_parent(int pid); -extern void os_stop_process(int pid); +pid_t os_reap_child(void); +extern void os_alarm_process(int pid); extern void os_kill_process(int pid, int reap_child); extern void os_kill_ptraced_process(int pid, int reap_child); -extern long os_ptrace_ldt(long pid, long addr, long data); extern int os_getpid(void); -extern int os_getpgrp(void); extern void init_new_thread_signals(void); @@ -199,7 +213,11 @@ extern int os_protect_memory(void *addr, unsigned long len, extern int os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); -extern void os_flush_stdout(void); + +void os_set_pdeathsig(void); + +int os_futex_wait(void *uaddr, unsigned int val); +int os_futex_wake(void *uaddr); /* execvp.c */ extern int execvp_noalloc(char *buf, const char *file, char *const argv[]); @@ -209,6 +227,11 @@ extern int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long *stack_out); extern int helper_wait(int pid); +struct os_helper_thread; +int os_run_helper_thread(struct os_helper_thread **td_out, + void *(*routine)(void *), void *arg); +void os_kill_helper_thread(struct os_helper_thread *td); +void os_fix_helper_thread_signals(void); /* umid.c */ extern int umid_file_name(char *name, char *buf, int len); @@ -216,54 +239,60 @@ extern int set_umid(char *name); extern char *get_umid(void); /* signal.c */ -extern void timer_init(void); +extern void timer_set_signal_handler(void); extern void set_sigstack(void *sig_stack, int size); -extern void remove_sigstack(void); extern void set_handler(int sig); +extern void send_sigio_to_self(void); extern int change_sig(int signal, int on); extern void block_signals(void); extern void unblock_signals(void); -extern int get_signals(void); -extern int set_signals(int enable); +extern int um_get_signals(void); +extern int um_set_signals(int enable); +extern int um_set_signals_trace(int enable); +extern void deliver_alarm(void); +extern void register_pm_wake_signal(void); +extern void block_signals_hard(void); +extern void unblock_signals_hard(void); +extern void mark_sigio_pending(void); /* util.c */ extern void stack_protections(unsigned long address); extern int raw(int fd); extern void setup_machinename(char *machine_out); extern void setup_hostinfo(char *buf, int len); +extern ssize_t os_getrandom(void *buf, size_t len, unsigned int flags); extern void os_dump_core(void) __attribute__ ((noreturn)); extern void um_early_printk(const char *s, unsigned int n); +extern void os_fix_helper_signals(void); +extern void os_info(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +extern void os_warn(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); /* time.c */ -extern void idle_sleep(unsigned long long nsecs); -extern int set_interval(void); -extern int timer_one_shot(int ticks); -extern long long disable_timer(void); -extern void uml_idle_timer(void); +void os_idle_prepare(void); +extern void os_idle_sleep(void); +extern int os_timer_create(void); +extern int os_timer_set_interval(int cpu, unsigned long long nsecs); +extern int os_timer_one_shot(int cpu, unsigned long long nsecs); +extern void os_timer_disable(int cpu); +extern long long os_persistent_clock_emulation(void); extern long long os_nsecs(void); /* skas/mem.c */ -extern long run_syscall_stub(struct mm_id * mm_idp, - int syscall, unsigned long *args, long expected, - void **addr, int done); -extern long syscall_stub_data(struct mm_id * mm_idp, - unsigned long *data, int data_count, - void **addr, void **stub_addr); -extern int map(struct mm_id * mm_idp, unsigned long virt, - unsigned long len, int prot, int phys_fd, - unsigned long long offset, int done, void **data); -extern int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - int done, void **data); -extern int protect(struct mm_id * mm_idp, unsigned long addr, - unsigned long len, unsigned int prot, int done, void **data); +int syscall_stub_flush(struct mm_id *mm_idp); +struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp); +void syscall_stub_dump_error(struct mm_id *mm_idp); + +int map(struct mm_id *mm_idp, unsigned long virt, + unsigned long len, int prot, int phys_fd, + unsigned long long offset); +int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); -extern int start_userspace(unsigned long stub_stack); -extern int copy_context_skas0(unsigned long stack, int pid); +extern int start_userspace(struct mm_id *mm_id); extern void userspace(struct uml_pt_regs *regs); -extern int map_stub_pages(int fd, unsigned long code, unsigned long data, - unsigned long stack); extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)); extern void switch_threads(jmp_buf *me, jmp_buf *you); extern int start_idle_thread(void *stack, jmp_buf *switch_buf); @@ -273,29 +302,58 @@ extern void halt_skas(void); extern void reboot_skas(void); /* irq.c */ -extern int os_waiting_for_events(struct irq_fd *active_fds); -extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds); -extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, - struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2); -extern void os_free_irq_later(struct irq_fd *active_fds, - int irq, void *dev_id); -extern int os_get_pollfd(int i); -extern void os_set_pollfd(int i, int fd); +extern int os_waiting_for_events_epoll(void); +extern void *os_epoll_get_data_pointer(int index); +extern int os_epoll_triggered(int index, int events); +extern int os_event_mask(enum um_irq_type irq_type); +extern int os_setup_epoll(void); +extern int os_add_epoll_fd(int events, int fd, void *data); +extern int os_mod_epoll_fd(int events, int fd, void *data); +extern int os_del_epoll_fd(int fd); extern void os_set_ioignore(void); +extern void os_close_epoll_fd(void); +extern void um_irqs_suspend(void); +extern void um_irqs_resume(void); /* sigio.c */ extern int add_sigio_fd(int fd); extern int ignore_sigio_fd(int fd); -extern void maybe_sigio_broken(int fd, int read); -extern void sigio_broken(int fd, int read); - -/* sys-x86_64/prctl.c */ -extern int os_arch_prctl(int pid, int code, unsigned long *addr); +extern void maybe_sigio_broken(int fd); +extern void sigio_broken(void); +/* + * unlocked versions for IRQ controller code. + * + * This is safe because it's used at suspend/resume and nothing + * else is running. + */ +extern int __add_sigio_fd(int fd); +extern int __ignore_sigio_fd(int fd); /* tty.c */ extern int get_pty(void); -/* sys-$ARCH/task_size.c */ -extern unsigned long os_get_top_address(void); +long syscall(long number, ...); + +/* irqflags tracing */ +extern void block_signals_trace(void); +extern void unblock_signals_trace(void); +extern void um_trace_signals_on(void); +extern void um_trace_signals_off(void); + +/* time-travel */ +extern void deliver_time_travel_irqs(void); + +/* smp.c */ +#if IS_ENABLED(CONFIG_SMP) +void os_init_smp(void); +int os_start_cpu_thread(int cpu); +void os_start_secondary(void *arg, jmp_buf *switch_buf); +int os_send_ipi(int cpu, int vector); +void os_local_ipi_enable(void); +void os_local_ipi_disable(void); +#else /* !CONFIG_SMP */ +static inline void os_local_ipi_enable(void) { } +static inline void os_local_ipi_disable(void) { } +#endif /* CONFIG_SMP */ #endif diff --git a/arch/um/include/shared/ptrace_user.h b/arch/um/include/shared/ptrace_user.h index 56b2f284b108..8a705d8f96ce 100644 --- a/arch/um/include/shared/ptrace_user.h +++ b/arch/um/include/shared/ptrace_user.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __PTRACE_USER_H__ @@ -12,45 +12,4 @@ extern int ptrace_getregs(long pid, unsigned long *regs_out); extern int ptrace_setregs(long pid, unsigned long *regs_in); -/* syscall emulation path in ptrace */ - -#ifndef PTRACE_SYSEMU -#define PTRACE_SYSEMU 31 -#endif -#ifndef PTRACE_SYSEMU_SINGLESTEP -#define PTRACE_SYSEMU_SINGLESTEP 32 -#endif - -/* On architectures, that started to support PTRACE_O_TRACESYSGOOD - * in linux 2.4, there are two different definitions of - * PTRACE_SETOPTIONS: linux 2.4 uses 21 while linux 2.6 uses 0x4200. - * For binary compatibility, 2.6 also supports the old "21", named - * PTRACE_OLDSETOPTION. On these architectures, UML always must use - * "21", to ensure the kernel runs on 2.4 and 2.6 host without - * recompilation. So, we use PTRACE_OLDSETOPTIONS in UML. - * We also want to be able to build the kernel on 2.4, which doesn't - * have PTRACE_OLDSETOPTIONS. So, if it is missing, we declare - * PTRACE_OLDSETOPTIONS to be the same as PTRACE_SETOPTIONS. - * - * On architectures, that start to support PTRACE_O_TRACESYSGOOD on - * linux 2.6, PTRACE_OLDSETOPTIONS never is defined, and also isn't - * supported by the host kernel. In that case, our trick lets us use - * the new 0x4200 with the name PTRACE_OLDSETOPTIONS. - */ -#ifndef PTRACE_OLDSETOPTIONS -#define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS -#endif - -void set_using_sysemu(int value); -int get_using_sysemu(void); -extern int sysemu_supported; - -#define SELECT_PTRACE_OPERATION(sysemu_mode, singlestep_mode) \ - (((int[3][3] ) { \ - { PTRACE_SYSCALL, PTRACE_SYSCALL, PTRACE_SINGLESTEP }, \ - { PTRACE_SYSEMU, PTRACE_SYSEMU, PTRACE_SINGLESTEP }, \ - { PTRACE_SYSEMU, PTRACE_SYSEMU_SINGLESTEP, \ - PTRACE_SYSEMU_SINGLESTEP } }) \ - [sysemu_mode][singlestep_mode]) - #endif diff --git a/arch/um/include/shared/registers.h b/arch/um/include/shared/registers.h index f5b76355ad71..7d81b2339a48 100644 --- a/arch/um/include/shared/registers.h +++ b/arch/um/include/shared/registers.h @@ -1,23 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2004 PathScale, Inc - * Licensed under the GPL */ #ifndef __REGISTERS_H #define __REGISTERS_H #include <sysdep/ptrace.h> -#include <sysdep/archsetjmp.h> -extern int save_fp_registers(int pid, unsigned long *fp_regs); -extern int restore_fp_registers(int pid, unsigned long *fp_regs); -extern int save_fpx_registers(int pid, unsigned long *fp_regs); -extern int restore_fpx_registers(int pid, unsigned long *fp_regs); -extern int save_registers(int pid, struct uml_pt_regs *regs); -extern int restore_registers(int pid, struct uml_pt_regs *regs); -extern int init_registers(int pid); +extern int init_pid_registers(int pid); extern void get_safe_registers(unsigned long *regs, unsigned long *fp_regs); -extern unsigned long get_thread_reg(int reg, jmp_buf *buf); extern int get_fp_registers(int pid, unsigned long *regs); extern int put_fp_registers(int pid, unsigned long *regs); diff --git a/arch/um/include/shared/sigio.h b/arch/um/include/shared/sigio.h index 434f1a9ae4b3..c6c2edce1f6d 100644 --- a/arch/um/include/shared/sigio.h +++ b/arch/um/include/shared/sigio.h @@ -1,13 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __SIGIO_H__ #define __SIGIO_H__ -extern int write_sigio_irq(int fd); -extern int register_sigio_fd(int fd); extern void sigio_lock(void); extern void sigio_unlock(void); diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index 48dd0989ddaa..fb96c0bd8222 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -1,17 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2005 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __MM_ID_H #define __MM_ID_H +#include <linux/compiler_types.h> + +#define STUB_MAX_FDS 4 + struct mm_id { - union { - int mm_fd; - int pid; - } u; + int pid; unsigned long stack; + int syscall_data_len; + + /* Only used with SECCOMP mode */ + int sock; + int syscall_fd_num; + int syscall_fd_map[STUB_MAX_FDS]; }; +void enter_turnstile(struct mm_id *mm_id) __acquires(turnstile); +void exit_turnstile(struct mm_id *mm_id) __releases(turnstile); + +void notify_mm_kill(int pid); + #endif diff --git a/arch/um/include/shared/skas/proc_mm.h b/arch/um/include/shared/skas/proc_mm.h deleted file mode 100644 index 902809209603..000000000000 --- a/arch/um/include/shared/skas/proc_mm.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SKAS_PROC_MM_H -#define __SKAS_PROC_MM_H - -#define MM_MMAP 54 -#define MM_MUNMAP 55 -#define MM_MPROTECT 56 -#define MM_COPY_SEGMENTS 57 - -struct mm_mmap { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -struct mm_munmap { - unsigned long addr; - unsigned long len; -}; - -struct mm_mprotect { - unsigned long addr; - unsigned long len; - unsigned int prot; -}; - -struct proc_mm_op { - int op; - union { - struct mm_mmap mmap; - struct mm_munmap munmap; - struct mm_mprotect mprotect; - int copy_segments; - } u; -}; - -#endif diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index c45df961c874..2237ffedec75 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __SKAS_H @@ -8,15 +8,14 @@ #include <sysdep/ptrace.h> -extern int userspace_pid[]; -extern int proc_mm, ptrace_faultinfo, ptrace_ldt; -extern int skas_needs_stub; +extern int using_seccomp; -extern int user_thread(unsigned long stack, int flags); extern void new_thread_handler(void); extern void handle_syscall(struct uml_pt_regs *regs); -extern int new_mm(unsigned long stack); -extern long execute_syscall_skas(void *r); extern unsigned long current_stub_stack(void); +extern struct mm_id *current_mm_id(void); +extern void current_mm_sync(void); +void initial_jmpbuf_lock(void); +void initial_jmpbuf_unlock(void); #endif diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index f6ed92c3727d..27db38e95df9 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -1,18 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* + + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2005 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #ifndef __STUB_DATA_H #define __STUB_DATA_H -#include <sys/time.h> +#include <linux/compiler_types.h> +#include <as-layout.h> +#include <sysdep/tls.h> +#include <sysdep/stub-data.h> +#include <mm_id.h> + +#define FUTEX_IN_CHILD 0 +#define FUTEX_IN_KERN 1 + +struct stub_init_data { + int seccomp; + + unsigned long stub_start; + + int stub_code_fd; + unsigned long stub_code_offset; + int stub_data_fd; + unsigned long stub_data_offset; + + unsigned long signal_handler; + unsigned long signal_restorer; +}; + +#define STUB_NEXT_SYSCALL(s) \ + ((struct stub_syscall *) (((unsigned long) s) + (s)->cmd_len)) + +enum stub_syscall_type { + STUB_SYSCALL_UNSET = 0, + STUB_SYSCALL_MMAP, + STUB_SYSCALL_MUNMAP, +}; + +struct stub_syscall { + struct { + unsigned long addr; + unsigned long length; + unsigned long offset; + int fd; + int prot; + } mem; + + enum stub_syscall_type syscall; +}; struct stub_data { - long offset; - int fd; - struct itimerval timer; long err; + + int syscall_data_len; + /* 128 leaves enough room for additional fields in the struct */ + struct stub_syscall syscall_data[(UM_KERN_PAGE_SIZE - 128) / sizeof(struct stub_syscall)] __aligned(16); + + /* data shared with signal handler (only used in seccomp mode) */ + short restart_wait; + unsigned int futex; + int signal; + unsigned short si_offset; + unsigned short mctx_offset; + + /* seccomp architecture specific state restore */ + struct stub_data_arch arch_data; + + /* Stack for our signal handlers and for calling into . */ + unsigned char sigstack[UM_KERN_PAGE_SIZE] __aligned(UM_KERN_PAGE_SIZE); }; #endif diff --git a/arch/um/include/shared/skas_ptrace.h b/arch/um/include/shared/skas_ptrace.h deleted file mode 100644 index 630a9c92b93c..000000000000 --- a/arch/um/include/shared/skas_ptrace.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __SKAS_PTRACE_H -#define __SKAS_PTRACE_H - -#define PTRACE_FAULTINFO 52 -#define PTRACE_SWITCH_MM 55 - -#include <sysdep/skas_ptrace.h> - -#endif diff --git a/arch/um/include/shared/smp.h b/arch/um/include/shared/smp.h new file mode 100644 index 000000000000..06e3faa95091 --- /dev/null +++ b/arch/um/include/shared/smp.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_SHARED_SMP_H +#define __UM_SHARED_SMP_H + +#if IS_ENABLED(CONFIG_SMP) + +extern int uml_ncpus; + +int uml_curr_cpu(void); +void uml_start_secondary(void *opaque); +void uml_ipi_handler(int vector); + +#else /* !CONFIG_SMP */ + +#define uml_ncpus 1 +#define uml_curr_cpu() 0 + +#endif /* CONFIG_SMP */ + +#endif /* __UM_SHARED_SMP_H */ diff --git a/arch/um/include/shared/timetravel.h b/arch/um/include/shared/timetravel.h new file mode 100644 index 000000000000..7c2b277b7eb0 --- /dev/null +++ b/arch/um/include/shared/timetravel.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019-2021 Intel Corporation + */ +#ifndef _UM_TIME_TRAVEL_H_ +#define _UM_TIME_TRAVEL_H_ + +enum time_travel_mode { + TT_MODE_OFF, + TT_MODE_BASIC, + TT_MODE_INFCPU, + TT_MODE_EXTERNAL, +}; + +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) +extern enum time_travel_mode time_travel_mode; +extern int time_travel_should_print_bc_msg; +#else +#define time_travel_mode TT_MODE_OFF +#define time_travel_should_print_bc_msg 0 +#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ + +void _time_travel_print_bc_msg(void); +static inline void time_travel_print_bc_msg(void) +{ + if (time_travel_should_print_bc_msg) + _time_travel_print_bc_msg(); +} + +#endif /* _UM_TIME_TRAVEL_H_ */ diff --git a/arch/um/include/shared/um_malloc.h b/arch/um/include/shared/um_malloc.h index 6395fef6b69b..815dd03e8707 100644 --- a/arch/um/include/shared/um_malloc.h +++ b/arch/um/include/shared/um_malloc.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2005 Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> - * Licensed under the GPL */ #ifndef __UM_MALLOC_H__ @@ -11,8 +11,9 @@ extern void *uml_kmalloc(int size, int flags); extern void kfree(const void *ptr); -extern void *vmalloc(unsigned long size); -extern void vfree(void *ptr); +extern void *vmalloc_noprof(unsigned long size); +#define vmalloc(...) vmalloc_noprof(__VA_ARGS__) +extern void vfree(const void *ptr); #endif /* __UM_MALLOC_H__ */ diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index cef068563336..139eb78a4767 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -1,6 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #ifndef __USER_H__ @@ -16,11 +16,12 @@ */ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -/* This is to get size_t */ -#ifdef __KERNEL__ +/* This is to get size_t and NULL */ +#ifndef __UM_HOST__ #include <linux/types.h> #else #include <stddef.h> +#include <sys/types.h> #endif extern void panic(const char *fmt, ...) @@ -37,19 +38,29 @@ extern void panic(const char *fmt, ...) #define UM_KERN_DEBUG KERN_DEBUG #define UM_KERN_CONT KERN_CONT -#ifdef UML_CONFIG_PRINTK -extern int printk(const char *fmt, ...) +#if IS_ENABLED(CONFIG_PRINTK) +#define printk(...) _printk(__VA_ARGS__) +extern int _printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +extern void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, _Bool ascii); #else static inline int printk(const char *fmt, ...) { return 0; } +static inline void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, _Bool ascii) +{ +} #endif extern int in_aton(char *str); -extern size_t strlcpy(char *, const char *, size_t); extern size_t strlcat(char *, const char *, size_t); +extern size_t sized_strscpy(char *, const char *, size_t); +#define strscpy(dst, src) sized_strscpy(dst, src, sizeof(dst)) /* Copied from linux/compiler-gcc.h since we can't include it directly */ #define barrier() __asm__ __volatile__("": : :"memory") diff --git a/arch/um/include/uapi/asm/Kbuild b/arch/um/include/uapi/asm/Kbuild new file mode 100644 index 000000000000..f66554cd5c45 --- /dev/null +++ b/arch/um/include/uapi/asm/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0 diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index babe21826e3e..be60bc451b3f 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -1,30 +1,37 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux,intel}.com) -# Licensed under the GPL # +# Don't instrument UML-specific code; without this, we may crash when +# accessing the instrumentation buffer for the first time from the +# kernel. +KCOV_INSTRUMENT := n + CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ -DELF_ARCH=$(LDS_ELF_ARCH) \ -DELF_FORMAT=$(LDS_ELF_FORMAT) \ $(LDS_EXTRA) -extra-y := vmlinux.lds -clean-files := +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ - signal.o smp.o syscall.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o skas/ + signal.o sysrq.o time.o tlb.o trap.o \ + um_arch.o umid.o kmsg_dump.o capflags.o skas/ +obj-y += load_file.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o -obj-$(CONFIG_GCOV) += gmon_syms.o +obj-$(CONFIG_OF) += dtb.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_SMP) += smp.o USER_OBJS := config.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules -targets := config.c config.tmp +targets := config.c config.tmp capflags.c # Be careful with the below Sed code - sed is pitfall-rich! # We use sed to lower build requirements, for "embedded" builders for instance. @@ -39,6 +46,15 @@ quiet_cmd_quote1 = QUOTE $@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE $(call if_changed,quote2) +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ + +cpufeature = $(src)/../../x86/include/asm/cpufeatures.h +vmxfeature = $(src)/../../x86/include/asm/vmxfeatures.h + +$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/../../x86/kernel/cpu/mkcapflags.sh FORCE + $(call if_changed,mkcapflags) + quiet_cmd_quote2 = QUOTE $@ cmd_quote2 = sed -e '/CONFIG/{' \ -e 's/"CONFIG"//' \ diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c index 1fb12235ab9c..d620b6f6de9b 100644 --- a/arch/um/kernel/asm-offsets.c +++ b/arch/um/kernel/asm-offsets.c @@ -1 +1,49 @@ -#include <sysdep/kernel-offsets.h> +/* SPDX-License-Identifier: GPL-2.0 */ +#define COMPILE_OFFSETS +#include <linux/stddef.h> +#include <linux/sched.h> +#include <linux/elf.h> +#include <linux/crypto.h> +#include <linux/kbuild.h> +#include <linux/audit.h> +#include <linux/fs.h> +#include <asm/mman.h> +#include <asm/seccomp.h> +#include <asm/extable.h> + +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + +void foo(void) +{ + DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE); + + DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE); + DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK); + DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT); + + DEFINE(UM_GFP_KERNEL, GFP_KERNEL); + DEFINE(UM_GFP_ATOMIC, GFP_ATOMIC); + + DEFINE(UM_THREAD_SIZE, THREAD_SIZE); + + DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); + DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); + + DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES); + + DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE); + + DEFINE(HOSTFS_ATTR_MODE, ATTR_MODE); + DEFINE(HOSTFS_ATTR_UID, ATTR_UID); + DEFINE(HOSTFS_ATTR_GID, ATTR_GID); + DEFINE(HOSTFS_ATTR_SIZE, ATTR_SIZE); + DEFINE(HOSTFS_ATTR_ATIME, ATTR_ATIME); + DEFINE(HOSTFS_ATTR_MTIME, ATTR_MTIME); + DEFINE(HOSTFS_ATTR_CTIME, ATTR_CTIME); + DEFINE(HOSTFS_ATTR_ATIME_SET, ATTR_ATIME_SET); + DEFINE(HOSTFS_ATTR_MTIME_SET, ATTR_MTIME_SET); + + DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr)); + DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry)); +} diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in index 972bf1659564..3ece3c3b31cc 100644 --- a/arch/um/kernel/config.c.in +++ b/arch/um/kernel/config.c.in @@ -1,6 +1,6 @@ -/* +// SPDX-License-Identifier: GPL-2.0 +/* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL */ #include <stdio.h> diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c new file mode 100644 index 000000000000..47cd3d869fb2 --- /dev/null +++ b/arch/um/kernel/dtb.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/init.h> +#include <linux/of_fdt.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <init.h> + +#include "um_arch.h" + +static char *dtb __initdata; + +void uml_dtb_init(void) +{ + long long size; + void *area; + + area = uml_load_file(dtb, &size); + if (area) { + if (!early_init_dt_scan(area, __pa(area))) { + pr_err("invalid DTB %s\n", dtb); + memblock_free(area, size); + return; + } + + early_init_fdt_scan_reserved_mem(); + } + + unflatten_device_tree(); +} + +static int __init uml_dtb_setup(char *line, int *add) +{ + *add = 0; + dtb = line; + return 0; +} + +__uml_setup("dtb=", uml_dtb_setup, +"dtb=<file>\n" +" Boot the kernel with the devicetree blob from the specified file.\n\n" +); diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index adde088aeeff..a36b7918a011 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -1,4 +1,4 @@ -#include <asm-generic/vmlinux.lds.h> +#include <asm/vmlinux.lds.h> #include <asm/page.h> OUTPUT_FORMAT(ELF_FORMAT) @@ -6,6 +6,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { PROVIDE (__executable_start = START); @@ -69,6 +75,8 @@ SECTIONS TEXT_TEXT SCHED_TEXT LOCK_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.stub .text.* .gnu.linkonce.t.*) /* .gnu.warning sections are handled specially by elf32.em. */ @@ -100,12 +108,14 @@ SECTIONS be empty, which isn't pretty. */ . = ALIGN(32 / 8); .preinit_array : { *(.preinit_array) } - .init_array : { *(.init_array) } + .init_array : { + *(.kasan_init) + *(.init_array.*) + *(.init_array) + } .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) @@ -161,8 +171,11 @@ SECTIONS PROVIDE (end = .); STABS_DEBUG - DWARF_DEBUG + ELF_DETAILS DISCARDS } + +ASSERT(__syscall_stub_end - __syscall_stub_start <= PAGE_SIZE, + "STUB code must not be larger than one page"); diff --git a/arch/um/kernel/early_printk.c b/arch/um/kernel/early_printk.c index 4a0800bc37b2..c350c2331bbe 100644 --- a/arch/um/kernel/early_printk.c +++ b/arch/um/kernel/early_printk.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/kernel.h> diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 0d7103c9eff3..13812fa97eee 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -1,50 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/stddef.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/ptrace.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/slab.h> #include <asm/current.h> #include <asm/processor.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <as-layout.h> #include <mem_user.h> +#include <registers.h> #include <skas.h> #include <os.h> void flush_thread(void) { - void *data = NULL; - int ret; - arch_flush_thread(¤t->thread.arch); - ret = unmap(¤t->mm->context.id, 0, STUB_START, 0, &data); - ret = ret || unmap(¤t->mm->context.id, STUB_END, - host_task_size - STUB_END, 1, &data); - if (ret) { - printk(KERN_ERR "flush_thread - clearing address space failed, " - "err = %d\n", ret); - force_sig(SIGKILL, current); - } get_safe_registers(current_pt_regs()->regs.gp, current_pt_regs()->regs.fp); - - __switch_mm(¤t->mm->context.id); } void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) { PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; - current->ptrace &= ~PT_DTRACE; -#ifdef SUBARCH_EXECVE1 - SUBARCH_EXECVE1(regs->regs); -#endif + clear_thread_flag(TIF_SINGLESTEP); } EXPORT_SYMBOL(start_thread); diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c index 829df49dee99..43edc2aa57e4 100644 --- a/arch/um/kernel/exitcode.c +++ b/arch/um/kernel/exitcode.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/ctype.h> @@ -10,7 +10,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * If read and write race, the read will still atomically read a valid @@ -40,9 +40,11 @@ static ssize_t exitcode_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { char *end, buf[sizeof("nnnnn\0")]; + size_t size; int tmp; - if (copy_from_user(buf, buffer, count)) + size = min(count, sizeof(buf)); + if (copy_from_user(buf, buffer, size)) return -EFAULT; tmp = simple_strtol(buf, &end, 0); @@ -53,20 +55,19 @@ static ssize_t exitcode_proc_write(struct file *file, return count; } -static const struct file_operations exitcode_proc_fops = { - .owner = THIS_MODULE, - .open = exitcode_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = exitcode_proc_write, +static const struct proc_ops exitcode_proc_ops = { + .proc_open = exitcode_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = exitcode_proc_write, }; static int make_proc_exitcode(void) { struct proc_dir_entry *ent; - ent = proc_create("exitcode", 0600, NULL, &exitcode_proc_fops); + ent = proc_create("exitcode", 0600, NULL, &exitcode_proc_ops); if (ent == NULL) { printk(KERN_WARNING "make_proc_exitcode : Failed to register " "/proc/exitcode\n"); diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c deleted file mode 100644 index 1bf61266da8e..000000000000 --- a/arch/um/kernel/gmon_syms.c +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/module.h> - -extern void __bb_init_func(void *) __attribute__((weak)); -EXPORT_SYMBOL(__bb_init_func); diff --git a/arch/um/kernel/gprof_syms.c b/arch/um/kernel/gprof_syms.c index 74ddb44288a3..84d536908775 100644 --- a/arch/um/kernel/gprof_syms.c +++ b/arch/um/kernel/gprof_syms.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 55cead809b18..99dba827461c 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -1,46 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/initrd.h> #include <asm/types.h> #include <init.h> #include <os.h> +#include "um_arch.h" + /* Changed by uml_initrd_setup, which is a setup */ static char *initrd __initdata = NULL; -static int load_initrd(char *filename, void *buf, int size); -static int __init read_initrd(void) +int __init read_initrd(void) { + unsigned long long size; void *area; - long long size; - int err; - - if (initrd == NULL) - return 0; - - err = os_file_size(initrd, &size); - if (err) - return 0; - - /* - * This is necessary because alloc_bootmem craps out if you - * ask for no memory. - */ - if (size == 0) { - printk(KERN_ERR "\"%s\" is a zero-size initrd\n", initrd); - return 0; - } - area = alloc_bootmem(size); - if (area == NULL) + if (!initrd) return 0; - if (load_initrd(initrd, area, size) == -1) + area = uml_load_file(initrd, &size); + if (!area) return 0; initrd_start = (unsigned long) area; @@ -48,10 +32,9 @@ static int __init read_initrd(void) return 0; } -__uml_postsetup(read_initrd); - static int __init uml_initrd_setup(char *line, int *add) { + *add = 0; initrd = line; return 0; } @@ -61,25 +44,3 @@ __uml_setup("initrd=", uml_initrd_setup, " This is used to boot UML from an initrd image. The argument is the\n" " name of the file containing the image.\n\n" ); - -static int load_initrd(char *filename, void *buf, int size) -{ - int fd, n; - - fd = os_open_file(filename, of_read(OPENFLAGS()), 0); - if (fd < 0) { - printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename, - -fd); - return -1; - } - n = os_read_file(fd, buf, size); - if (n != size) { - printk(KERN_ERR "Read of %d bytes from '%s' failed, " - "err = %d\n", size, - filename, -n); - return -1; - } - - os_close_file(fd); - return 0; -} diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 36e12f0cefd5..f4b13f15a9c1 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -1,6 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2017 - Cambridge Greys Ltd + * Copyright (C) 2011 - 2014 Cisco Systems Inc * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL * Derived (i.e. mostly copied) from arch/i386/kernel/irq.c: * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar */ @@ -16,245 +18,424 @@ #include <as-layout.h> #include <kern_util.h> #include <os.h> +#include <irq_user.h> +#include <irq_kern.h> +#include <linux/time-internal.h> +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); + +#define irq_stats(x) (&per_cpu(irq_stat, x)) + +/* When epoll triggers we do not know why it did so + * we can also have different IRQs for read and write. + * This is why we keep a small irq_reg array for each fd - + * one entry per IRQ type + */ +struct irq_reg { + void *id; + int irq; + /* it's cheaper to store this than to query it */ + int events; + bool active; + bool pending; + bool wakeup; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + bool pending_event; + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *); + struct time_travel_event event; +#endif +}; + +struct irq_entry { + struct list_head list; + int fd; + struct irq_reg reg[NUM_IRQ_TYPES]; + bool suspended; + bool sigio_workaround; +}; + +static DEFINE_RAW_SPINLOCK(irq_lock); +static LIST_HEAD(active_fds); +static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); +static bool irqs_suspended; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static bool irqs_pending; +#endif + +static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) +{ /* - * This list is accessed under irq_lock, except in sigio_handler, - * where it is safe from being modified. IRQ handlers won't change it - - * if an IRQ source has vanished, it will be freed by free_irqs just - * before returning from sigio_handler. That will process a separate - * list of irqs to free, with its own locking, coming back here to - * remove list elements, taking the irq_lock to do so. + * irq->active guards against reentry + * irq->pending accumulates pending requests + * if pending is raised the irq_handler is re-run + * until pending is cleared */ -static struct irq_fd *active_fds = NULL; -static struct irq_fd **last_irq_ptr = &active_fds; + if (irq->active) { + irq->active = false; -extern void free_irqs(void); + do { + irq->pending = false; + do_IRQ(irq->irq, regs); + } while (irq->pending); + + irq->active = true; + } else { + irq->pending = true; + } +} -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static void irq_event_handler(struct time_travel_event *ev) { - struct irq_fd *irq_fd; - int n; + struct irq_reg *reg = container_of(ev, struct irq_reg, event); - if (smp_sigio_handler()) + /* do nothing if suspended; just cause a wakeup and mark as pending */ + if (irqs_suspended) { + irqs_pending = true; + reg->pending_event = true; return; + } - while (1) { - n = os_waiting_for_events(active_fds); - if (n <= 0) { - if (n == -EINTR) - continue; - else break; - } + generic_handle_irq(reg->irq); +} + +static bool irq_do_timetravel_handler(struct irq_entry *entry, + enum um_irq_type t) +{ + struct irq_reg *reg = &entry->reg[t]; + + if (!reg->timetravel_handler) + return false; + + /* + * Handle all messages - we might get multiple even while + * interrupts are already suspended, due to suspend order + * etc. Note that time_travel_add_irq_event() will not add + * an event twice, if it's pending already "first wins". + */ + reg->timetravel_handler(reg->irq, entry->fd, reg->id, ®->event); - for (irq_fd = active_fds; irq_fd != NULL; - irq_fd = irq_fd->next) { - if (irq_fd->current_events != 0) { - irq_fd->current_events = 0; - do_IRQ(irq_fd->irq, regs); + if (!reg->event.pending) + return false; + + return true; +} + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ + struct irq_entry *entry; + + if (!irqs_pending || timetravel_handlers_only) + return; + + irqs_pending = false; + + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + struct irq_reg *reg = &entry->reg[t]; + + /* + * Any timetravel_handler was invoked already, just + * directly run the IRQ. + */ + if (reg->pending_event) { + irq_enter(); + generic_handle_irq(reg->irq); + irq_exit(); + reg->pending_event = false; } } } - - free_irqs(); +} +#else +static bool irq_do_timetravel_handler(struct irq_entry *entry, + enum um_irq_type t) +{ + return false; } -static DEFINE_SPINLOCK(irq_lock); +static void irq_do_pending_events(bool timetravel_handlers_only) +{ +} +#endif -static int activate_fd(int irq, int fd, int type, void *dev_id) +static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t, + struct uml_pt_regs *regs, + bool timetravel_handlers_only) { - struct pollfd *tmp_pfd; - struct irq_fd *new_fd, *irq_fd; - unsigned long flags; - int events, err, n; + struct irq_reg *reg = &entry->reg[t]; - err = os_set_fd_async(fd); - if (err < 0) - goto out; + if (!reg->events) + return; - err = -ENOMEM; - new_fd = kmalloc(sizeof(struct irq_fd), GFP_KERNEL); - if (new_fd == NULL) - goto out; + if (os_epoll_triggered(idx, reg->events) <= 0) + return; - if (type == IRQ_READ) - events = UM_POLLIN | UM_POLLPRI; - else events = UM_POLLOUT; - *new_fd = ((struct irq_fd) { .next = NULL, - .id = dev_id, - .fd = fd, - .type = type, - .irq = irq, - .events = events, - .current_events = 0 } ); - - err = -EBUSY; - spin_lock_irqsave(&irq_lock, flags); - for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) { - if ((irq_fd->fd == fd) && (irq_fd->type == type)) { - printk(KERN_ERR "Registering fd %d twice\n", fd); - printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq); - printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id, - dev_id); - goto out_unlock; - } + if (irq_do_timetravel_handler(entry, t)) + return; + + /* + * If we're called to only run time-travel handlers then don't + * actually proceed but mark sigio as pending (if applicable). + * For suspend/resume, timetravel_handlers_only may be true + * despite time-travel not being configured and used. + */ + if (timetravel_handlers_only) { +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + reg->pending_event = true; + irqs_pending = true; + mark_sigio_pending(); +#endif + return; } - if (type == IRQ_WRITE) - fd = -1; + irq_io_loop(reg, regs); +} - tmp_pfd = NULL; - n = 0; +static void _sigio_handler(struct uml_pt_regs *regs, + bool timetravel_handlers_only) +{ + struct irq_entry *irq_entry; + int n, i; - while (1) { - n = os_create_pollfd(fd, events, tmp_pfd, n); - if (n == 0) - break; - - /* - * n > 0 - * It means we couldn't put new pollfd to current pollfds - * and tmp_fds is NULL or too small for new pollfds array. - * Needed size is equal to n as minimum. - * - * Here we have to drop the lock in order to call - * kmalloc, which might sleep. - * If something else came in and changed the pollfds array - * so we will not be able to put new pollfd struct to pollfds - * then we free the buffer tmp_fds and try again. - */ - spin_unlock_irqrestore(&irq_lock, flags); - kfree(tmp_pfd); + if (timetravel_handlers_only && !um_irq_timetravel_handler_used()) + return; - tmp_pfd = kmalloc(n, GFP_KERNEL); - if (tmp_pfd == NULL) - goto out_kfree; + /* Flush out pending events that were ignored due to time-travel. */ + if (!irqs_suspended) + irq_do_pending_events(timetravel_handlers_only); - spin_lock_irqsave(&irq_lock, flags); - } + while (1) { + /* This is now lockless - epoll keeps back-referencesto the irqs + * which have trigger it so there is no need to walk the irq + * list and lock it every time. We avoid locking by turning off + * IO for a specific fd by executing os_del_epoll_fd(fd) before + * we do any changes to the actual data structures + */ + n = os_waiting_for_events_epoll(); - *last_irq_ptr = new_fd; - last_irq_ptr = &new_fd->next; + if (n <= 0) { + if (n == -EINTR) + continue; + else + break; + } - spin_unlock_irqrestore(&irq_lock, flags); + for (i = 0; i < n ; i++) { + enum um_irq_type t; - /* - * This calls activate_fd, so it has to be outside the critical - * section. - */ - maybe_sigio_broken(fd, (type == IRQ_READ)); + irq_entry = os_epoll_get_data_pointer(i); - return 0; + for (t = 0; t < NUM_IRQ_TYPES; t++) + sigio_reg_handler(i, irq_entry, t, regs, + timetravel_handlers_only); + } + } - out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); - out_kfree: - kfree(new_fd); - out: - return err; + if (!timetravel_handlers_only) + free_irqs(); } -static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg) +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { - unsigned long flags; - - spin_lock_irqsave(&irq_lock, flags); - os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr); - spin_unlock_irqrestore(&irq_lock, flags); + preempt_disable(); + _sigio_handler(regs, irqs_suspended); + preempt_enable(); } -struct irq_and_dev { - int irq; - void *dev; -}; - -static int same_irq_and_dev(struct irq_fd *irq, void *d) +static struct irq_entry *get_irq_entry_by_fd(int fd) { - struct irq_and_dev *data = d; + struct irq_entry *walk; - return ((irq->irq == data->irq) && (irq->id == data->dev)); + lockdep_assert_held(&irq_lock); + + list_for_each_entry(walk, &active_fds, list) { + if (walk->fd == fd) + return walk; + } + + return NULL; } -static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { - struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq, - .dev = dev }); + if (!to_free) + return; - free_irq_by_cb(same_irq_and_dev, &data); + if (remove) + os_del_epoll_fd(to_free->fd); + list_del(&to_free->list); } -static int same_fd(struct irq_fd *irq, void *fd) +static bool update_irq_entry(struct irq_entry *entry) { - return (irq->fd == *((int *)fd)); + enum um_irq_type i; + int events = 0; + + for (i = 0; i < NUM_IRQ_TYPES; i++) + events |= entry->reg[i].events; + + if (events) { + /* will modify (instead of add) if needed */ + os_add_epoll_fd(events, entry->fd, entry); + return true; + } + + os_del_epoll_fd(entry->fd); + return false; } -void free_irq_by_fd(int fd) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - free_irq_by_cb(same_fd, &fd); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } -/* Must be called with irq_lock held */ -static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) +static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) { - struct irq_fd *irq; - int i = 0; - int fdi; + struct irq_entry *irq_entry, *to_free = NULL; + int err, events = os_event_mask(type); + unsigned long flags; - for (irq = active_fds; irq != NULL; irq = irq->next) { - if ((irq->fd == fd) && (irq->irq == irqnum)) - break; - i++; - } - if (irq == NULL) { - printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n", - fd); + err = os_set_fd_async(fd); + if (err < 0) goto out; + + raw_spin_lock_irqsave(&irq_lock, flags); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { +already: + /* cannot register the same FD twice with the same type */ + if (WARN_ON(irq_entry->reg[type].events)) { + err = -EALREADY; + goto out_unlock; + } + + /* temporarily disable to avoid IRQ-side locking */ + os_del_epoll_fd(fd); + } else { + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM; + } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; + } + irq_entry = new; + irq_entry->fd = fd; + list_add_tail(&irq_entry->list, &active_fds); + maybe_sigio_broken(fd); } - fdi = os_get_pollfd(i); - if ((fdi != -1) && (fdi != fd)) { - printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds " - "and pollfds, fd %d vs %d, need %d\n", irq->fd, - fdi, fd); - irq = NULL; - goto out; + + irq_entry->reg[type].id = dev_id; + irq_entry->reg[type].irq = irq; + irq_entry->reg[type].active = true; + irq_entry->reg[type].events = events; + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + if (um_irq_timetravel_handler_used()) { + irq_entry->reg[type].timetravel_handler = timetravel_handler; + irq_entry->reg[type].event.fn = irq_event_handler; } - *index_out = i; - out: - return irq; +#endif + + WARN_ON(!update_irq_entry(irq_entry)); + err = 0; +out_unlock: + raw_spin_unlock_irqrestore(&irq_lock, flags); +out: + kfree(to_free); + return err; +} + +/* + * Remove the entry or entries for a specific FD, if you + * don't want to remove all the possible entries then use + * um_free_irq() or deactivate_fd() instead. + */ +void free_irq_by_fd(int fd) +{ + struct irq_entry *to_free; + unsigned long flags; + + raw_spin_lock_irqsave(&irq_lock, flags); + to_free = get_irq_entry_by_fd(fd); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } +EXPORT_SYMBOL(free_irq_by_fd); -void reactivate_fd(int fd, int irqnum) +static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_fd *irq; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - int i; - spin_lock_irqsave(&irq_lock, flags); - irq = find_irq_by_fd(fd, irqnum, &i); - if (irq == NULL) { - spin_unlock_irqrestore(&irq_lock, flags); - return; - } - os_set_pollfd(i, irq->fd); - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type i; - add_sigio_fd(fd); + for (i = 0; i < NUM_IRQ_TYPES; i++) { + struct irq_reg *reg = &entry->reg[i]; + + if (!reg->events) + continue; + if (reg->irq != irq) + continue; + if (reg->id != dev) + continue; + + os_del_epoll_fd(entry->fd); + reg->events = 0; + to_free = update_or_remove_irq_entry(entry); + goto out; + } + } +out: + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) { - struct irq_fd *irq; + struct irq_entry *entry; unsigned long flags; - int i; + enum um_irq_type i; - spin_lock_irqsave(&irq_lock, flags); - irq = find_irq_by_fd(fd, irqnum, &i); - if (irq == NULL) { - spin_unlock_irqrestore(&irq_lock, flags); - return; + os_del_epoll_fd(fd); + + raw_spin_lock_irqsave(&irq_lock, flags); + entry = get_irq_entry_by_fd(fd); + if (!entry) + goto out; + + for (i = 0; i < NUM_IRQ_TYPES; i++) { + if (!entry->reg[i].events) + continue; + if (entry->reg[i].irq == irqnum) + entry->reg[i].events = 0; } - os_set_pollfd(i, -1); - spin_unlock_irqrestore(&irq_lock, flags); + entry = update_or_remove_irq_entry(entry); +out: + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -268,17 +449,18 @@ EXPORT_SYMBOL(deactivate_fd); */ int deactivate_all_fds(void) { - struct irq_fd *irq; - int err; + struct irq_entry *entry; - for (irq = active_fds; irq != NULL; irq = irq->next) { - err = os_clear_fd_async(irq->fd); - if (err) - return err; - } - /* If there is a signal already queued, after unblocking ignore it */ + /* Stop IO. The IRQ loop has no lock so this is our + * only way of making sure we are safe to dispose + * of all IRQ handlers + */ os_set_ioignore(); + /* we can no longer call kfree() here so just deactivate */ + list_for_each_entry(entry, &active_fds, list) + os_del_epoll_fd(entry->fd); + os_close_epoll_fd(); return 0; } @@ -297,31 +479,180 @@ unsigned int do_IRQ(int irq, struct uml_pt_regs *regs) return 1; } -void um_free_irq(unsigned int irq, void *dev) +void um_free_irq(int irq, void *dev) { + if (WARN(irq < 0 || irq > UM_LAST_SIGNAL_IRQ, + "freeing invalid irq %d", irq)) + return; + free_irq_by_irq_and_dev(irq, dev); free_irq(irq, dev); + clear_bit(irq, irqs_allocated); } EXPORT_SYMBOL(um_free_irq); -int um_request_irq(unsigned int irq, int fd, int type, - irq_handler_t handler, - unsigned long irqflags, const char * devname, - void *dev_id) +static int +_um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) { int err; + if (irq == UM_IRQ_ALLOC) { + int i; + + for (i = UM_FIRST_DYN_IRQ; i < NR_IRQS; i++) { + if (!test_and_set_bit(i, irqs_allocated)) { + irq = i; + break; + } + } + } + + if (irq < 0) + return -ENOSPC; + if (fd != -1) { - err = activate_fd(irq, fd, type, dev_id); + err = activate_fd(irq, fd, type, dev_id, timetravel_handler); if (err) - return err; + goto error; } - return request_irq(irq, handler, irqflags, devname, dev_id); + err = request_irq(irq, handler, irqflags, devname, dev_id); + if (err < 0) + goto error; + + return irq; +error: + clear_bit(irq, irqs_allocated); + return err; } +int um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + return _um_request_irq(irq, fd, type, handler, irqflags, + devname, dev_id, NULL); +} EXPORT_SYMBOL(um_request_irq); -EXPORT_SYMBOL(reactivate_fd); + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +int um_request_irq_tt(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) +{ + return _um_request_irq(irq, fd, type, handler, irqflags, + devname, dev_id, timetravel_handler); +} +EXPORT_SYMBOL(um_request_irq_tt); + +void sigio_run_timetravel_handlers(void) +{ + _sigio_handler(NULL, true); +} +#endif + +#ifdef CONFIG_PM_SLEEP +void um_irqs_suspend(void) +{ + struct irq_entry *entry; + unsigned long flags; + + irqs_suspended = true; + + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + bool clear = true; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + /* + * For the SIGIO_WRITE_IRQ, which is used to handle the + * SIGIO workaround thread, we need special handling: + * enable wake for it itself, but below we tell it about + * any FDs that should be suspended. + */ + if (entry->reg[t].wakeup || + entry->reg[t].irq == SIGIO_WRITE_IRQ +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + || entry->reg[t].timetravel_handler +#endif + ) { + clear = false; + break; + } + } + + if (clear) { + entry->suspended = true; + os_clear_fd_async(entry->fd); + entry->sigio_workaround = + !__ignore_sigio_fd(entry->fd); + } + } + raw_spin_unlock_irqrestore(&irq_lock, flags); +} + +void um_irqs_resume(void) +{ + struct irq_entry *entry; + unsigned long flags; + + + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + if (entry->suspended) { + int err = os_set_fd_async(entry->fd); + + WARN(err < 0, "os_set_fd_async returned %d\n", err); + entry->suspended = false; + + if (entry->sigio_workaround) { + err = __add_sigio_fd(entry->fd); + WARN(err < 0, "add_sigio_returned %d\n", err); + } + } + } + raw_spin_unlock_irqrestore(&irq_lock, flags); + + irqs_suspended = false; + send_sigio_to_self(); +} + +static int normal_irq_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_entry *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + if (entry->reg[t].irq != d->irq) + continue; + entry->reg[t].wakeup = on; + goto unlock; + } + } +unlock: + raw_spin_unlock_irqrestore(&irq_lock, flags); + return 0; +} +#else +#define normal_irq_set_wake NULL +#endif /* * irq_chip must define at least enable/disable and ack when @@ -331,139 +662,67 @@ static void dummy(struct irq_data *d) { } -/* This is used for everything else than the timer. */ +/* This is used for everything other than the timer. */ static struct irq_chip normal_irq_type = { .name = "SIGIO", .irq_disable = dummy, .irq_enable = dummy, .irq_ack = dummy, + .irq_mask = dummy, + .irq_unmask = dummy, + .irq_set_wake = normal_irq_set_wake, }; -static struct irq_chip SIGVTALRM_irq_type = { - .name = "SIGVTALRM", +static struct irq_chip alarm_irq_type = { + .name = "SIGALRM", .irq_disable = dummy, .irq_enable = dummy, .irq_ack = dummy, + .irq_mask = dummy, + .irq_unmask = dummy, }; void __init init_IRQ(void) { int i; - irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); + irq_set_chip_and_handler(TIMER_IRQ, &alarm_irq_type, handle_percpu_irq); - for (i = 1; i < NR_IRQS; i++) + for (i = 1; i < UM_LAST_SIGNAL_IRQ; i++) irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); + /* Initialize EPOLL Loop */ + os_setup_epoll(); } -/* - * IRQ stack entry and exit: - * - * Unlike i386, UML doesn't receive IRQs on the normal kernel stack - * and switch over to the IRQ stack after some preparation. We use - * sigaltstack to receive signals on a separate stack from the start. - * These two functions make sure the rest of the kernel won't be too - * upset by being on a different stack. The IRQ stack has a - * thread_info structure at the bottom so that current et al continue - * to work. - * - * to_irq_stack copies the current task's thread_info to the IRQ stack - * thread_info and sets the tasks's stack to point to the IRQ stack. - * - * from_irq_stack copies the thread_info struct back (flags may have - * been modified) and resets the task's stack pointer. - * - * Tricky bits - - * - * What happens when two signals race each other? UML doesn't block - * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal - * could arrive while a previous one is still setting up the - * thread_info. - * - * There are three cases - - * The first interrupt on the stack - sets up the thread_info and - * handles the interrupt - * A nested interrupt interrupting the copying of the thread_info - - * can't handle the interrupt, as the stack is in an unknown state - * A nested interrupt not interrupting the copying of the - * thread_info - doesn't do any setup, just handles the interrupt - * - * The first job is to figure out whether we interrupted stack setup. - * This is done by xchging the signal mask with thread_info->pending. - * If the value that comes back is zero, then there is no setup in - * progress, and the interrupt can be handled. If the value is - * non-zero, then there is stack setup in progress. In order to have - * the interrupt handled, we leave our signal in the mask, and it will - * be handled by the upper handler after it has set up the stack. - * - * Next is to figure out whether we are the outer handler or a nested - * one. As part of setting up the stack, thread_info->real_thread is - * set to non-NULL (and is reset to NULL on exit). This is the - * nesting indicator. If it is non-NULL, then the stack is already - * set up and the handler can run. - */ - -static unsigned long pending_mask; - -unsigned long to_irq_stack(unsigned long *mask_out) +int __init arch_probe_nr_irqs(void) { - struct thread_info *ti; - unsigned long mask, old; - int nested; - - mask = xchg(&pending_mask, *mask_out); - if (mask != 0) { - /* - * If any interrupts come in at this point, we want to - * make sure that their bits aren't lost by our - * putting our bit in. So, this loop accumulates bits - * until xchg returns the same value that we put in. - * When that happens, there were no new interrupts, - * and pending_mask contains a bit for each interrupt - * that came in. - */ - old = *mask_out; - do { - old |= mask; - mask = xchg(&pending_mask, old); - } while (mask != old); - return 1; - } - - ti = current_thread_info(); - nested = (ti->real_thread != NULL); - if (!nested) { - struct task_struct *task; - struct thread_info *tti; - - task = cpu_tasks[ti->cpu].task; - tti = task_thread_info(task); - - *ti = *tti; - ti->real_thread = tti; - task->stack = ti; - } - - mask = xchg(&pending_mask, 0); - *mask_out |= mask | nested; - return 0; + return NR_IRQS; } -unsigned long from_irq_stack(int nested) +void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc) { - struct thread_info *ti, *to; - unsigned long mask; + do_IRQ(SIGCHLD_IRQ, regs); +} - ti = current_thread_info(); +/* + * /proc/interrupts printing for arch specific interrupts + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ +#if IS_ENABLED(CONFIG_SMP) + int cpu; - pending_mask = 1; + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", irq_stats(cpu)->irq_resched_count); + seq_puts(p, " Rescheduling interrupts\n"); - to = ti->real_thread; - current->stack = to; - ti->real_thread = NULL; - *to = *ti; + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", irq_stats(cpu)->irq_call_count); + seq_puts(p, " Function call interrupts\n"); +#endif - mask = xchg(&pending_mask, 0); - return mask & ~1; + return 0; } - diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c new file mode 100644 index 000000000000..fc0f543d1d8e --- /dev/null +++ b/arch/um/kernel/kmsg_dump.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kmsg_dump.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/string.h> +#include <shared/init.h> +#include <shared/kern.h> +#include <os.h> + +static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + struct kmsg_dump_detail *detail) +{ + static struct kmsg_dump_iter iter; + static DEFINE_SPINLOCK(lock); + static char line[1024]; + struct console *con; + unsigned long flags; + size_t len = 0; + int cookie; + + /* + * If no consoles are available to output crash information, dump + * the kmsg buffer to stdout. + */ + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + /* + * The ttynull console and disabled consoles are ignored + * since they cannot output. All other consoles are + * expected to output the crash information. + */ + if (strcmp(con->name, "ttynull") != 0 && + console_is_usable(con, console_srcu_read_flags(con), true)) { + break; + } + } + console_srcu_read_unlock(cookie); + if (con) + return; + + if (!spin_trylock_irqsave(&lock, flags)) + return; + + kmsg_dump_rewind(&iter); + + printf("kmsg_dump:\n"); + while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len)) { + line[len] = '\0'; + printf("%s", line); + } + + spin_unlock_irqrestore(&lock, flags); +} + +static struct kmsg_dumper kmsg_dumper = { + .dump = kmsg_dumper_stdout +}; + +static int __init kmsg_dumper_stdout_init(void) +{ + return kmsg_dump_register(&kmsg_dumper); +} + +__uml_postsetup(kmsg_dumper_stdout_init); diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 543c04756939..96314c31e61c 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -1,13 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> #include <os.h> -EXPORT_SYMBOL(set_signals); -EXPORT_SYMBOL(get_signals); +EXPORT_SYMBOL(um_get_signals); +EXPORT_SYMBOL(um_set_signals); EXPORT_SYMBOL(os_stat_fd); EXPORT_SYMBOL(os_stat_file); @@ -33,12 +33,16 @@ EXPORT_SYMBOL(os_shutdown_socket); EXPORT_SYMBOL(os_create_unix_socket); EXPORT_SYMBOL(os_connect_socket); EXPORT_SYMBOL(os_accept_connection); -EXPORT_SYMBOL(os_rcv_fd); +EXPORT_SYMBOL(os_rcv_fd_msg); EXPORT_SYMBOL(run_helper); EXPORT_SYMBOL(os_major); EXPORT_SYMBOL(os_minor); EXPORT_SYMBOL(os_makedev); +EXPORT_SYMBOL(os_eventfd); +EXPORT_SYMBOL(os_sendmsg_fds); EXPORT_SYMBOL(add_sigio_fd); EXPORT_SYMBOL(ignore_sigio_fd); EXPORT_SYMBOL(sigio_broken); + +EXPORT_SYMBOL(syscall); diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c new file mode 100644 index 000000000000..cb9d178ab7d8 --- /dev/null +++ b/arch/um/kernel/load_file.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ +#include <linux/memblock.h> +#include <os.h> + +#include "um_arch.h" + +static int __init __uml_load_file(const char *filename, void *buf, int size) +{ + int fd, n; + + fd = os_open_file(filename, of_read(OPENFLAGS()), 0); + if (fd < 0) { + printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename, + -fd); + return -1; + } + n = os_read_file(fd, buf, size); + if (n != size) { + printk(KERN_ERR "Read of %d bytes from '%s' failed, " + "err = %d\n", size, + filename, -n); + return -1; + } + + os_close_file(fd); + return 0; +} + +void *uml_load_file(const char *filename, unsigned long long *size) +{ + void *area; + int err; + + *size = 0; + + if (!filename) + return NULL; + + err = os_file_size(filename, size); + if (err) + return NULL; + + if (*size == 0) { + printk(KERN_ERR "\"%s\" is empty\n", filename); + return NULL; + } + + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); + + if (__uml_load_file(filename, area, *size)) { + memblock_free(area, *size); + return NULL; + } + + return area; +} diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 7ddb64baf327..39c4a7e21c6f 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -1,29 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/stddef.h> #include <linux/module.h> -#include <linux/bootmem.h> -#include <linux/highmem.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> -#include <asm/fixmap.h> +#include <linux/init.h> +#include <asm/sections.h> #include <asm/page.h> +#include <asm/pgalloc.h> #include <as-layout.h> #include <init.h> #include <kern.h> #include <kern_util.h> #include <mem_user.h> #include <os.h> +#include <um_malloc.h> +#include <linux/sched/task.h> +#include <linux/kasan.h> + +#ifdef CONFIG_KASAN +void __init kasan_init(void) +{ + /* + * kasan_map_memory will map all of the required address space and + * the host machine will allocate physical memory as necessary. + */ + kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE); + init_task.kasan_depth = 0; + /* + * Since kasan_init() is called before main(), + * KASAN is initialized but the enablement is deferred after + * jump_label_init(). See arch_mm_preinit(). + */ +} + +static void (*kasan_init_ptr)(void) +__section(".kasan_init") __used += kasan_init; +#endif /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ unsigned long *empty_zero_page = NULL; EXPORT_SYMBOL(empty_zero_page); -/* allocated in paging_init and unchanged thereafter */ -static unsigned long *empty_bad_page = NULL; /* * Initialized during boot, and readonly for initializing page tables @@ -32,202 +55,47 @@ static unsigned long *empty_bad_page = NULL; pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* Initialized at boot time, and readonly after that */ -unsigned long long highmem; int kmalloc_ok = 0; /* Used during early boot */ static unsigned long brk_end; -#ifdef CONFIG_HIGHMEM -static void setup_highmem(unsigned long highmem_start, - unsigned long highmem_len) +void __init arch_mm_preinit(void) { - unsigned long highmem_pfn; - int i; + /* Safe to call after jump_label_init(). Enables KASAN. */ + kasan_init_generic(); - highmem_pfn = __pa(highmem_start) >> PAGE_SHIFT; - for (i = 0; i < highmem_len >> PAGE_SHIFT; i++) - free_highmem_page(&mem_map[highmem_pfn + i]); -} -#endif - -void __init mem_init(void) -{ /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); /* Map in the area just after the brk now that kmalloc is about * to be turned on. */ - brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); + brk_end = PAGE_ALIGN((unsigned long) sbrk(0)); map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); - free_bootmem(__pa(brk_end), uml_reserved - brk_end); + memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; - - /* this will put all low memory onto the freelists */ - free_all_bootmem(); - max_low_pfn = totalram_pages; -#ifdef CONFIG_HIGHMEM - setup_highmem(end_iomem, highmem); -#endif - max_pfn = totalram_pages; - mem_init_print_info(NULL); - kmalloc_ok = 1; -} - -/* - * Create a page table and place a pointer to it in a middle page - * directory entry. - */ -static void __init one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(_KERNPG_TABLE + - (unsigned long) __pa(pte))); - if (pte != pte_offset_kernel(pmd, 0)) - BUG(); - } -} - -static void __init one_md_table_init(pud_t *pud) -{ -#ifdef CONFIG_3_LEVEL_PGTABLES - pmd_t *pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pud(pud, __pud(_KERNPG_TABLE + (unsigned long) __pa(pmd_table))); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#endif -} - -static void __init fixrange_init(unsigned long start, unsigned long end, - pgd_t *pgd_base) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - int i, j; - unsigned long vaddr; - - vaddr = start; - i = pgd_index(vaddr); - j = pmd_index(vaddr); - pgd = pgd_base + i; - - for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) - one_md_table_init(pud); - pmd = pmd_offset(pud, vaddr); - for (; (j < PTRS_PER_PMD) && (vaddr < end); pmd++, j++) { - one_page_table_init(pmd); - vaddr += PMD_SIZE; - } - j = 0; - } -} - -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -pgprot_t kmap_prot; - -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\ - (vaddr)), (vaddr)) - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -static void __init init_highmem(void) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - /* - * Permanent kmaps: - */ - vaddr = PKMAP_BASE; - fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; - - kmap_init(); + min_low_pfn = PFN_UP(__pa(uml_reserved)); + max_pfn = max_low_pfn; } -#endif /* CONFIG_HIGHMEM */ -static void __init fixaddr_user_init( void) +void __init mem_init(void) { -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA - long size = FIXADDR_USER_END - FIXADDR_USER_START; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - phys_t p; - unsigned long v, vaddr = FIXADDR_USER_START; - - if (!size) - return; - - fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir); - v = (unsigned long) alloc_bootmem_low_pages(size); - memcpy((void *) v , (void *) FIXADDR_USER_START, size); - p = __pa(v); - for ( ; size > 0; size -= PAGE_SIZE, vaddr += PAGE_SIZE, - p += PAGE_SIZE) { - pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pte_set_val(*pte, p, PAGE_READONLY); - } -#endif + kmalloc_ok = 1; } void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES], vaddr; - int i; + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(zones_size); i++) - zones_size[i] = 0; + empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, + PAGE_SIZE); + if (!empty_zero_page) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); - zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) - - (uml_physmem >> PAGE_SHIFT); -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = highmem >> PAGE_SHIFT; -#endif - free_area_init(zones_size); - - /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); - - fixaddr_user_init(); - -#ifdef CONFIG_HIGHMEM - init_highmem(); -#endif + max_zone_pfn[ZONE_NORMAL] = high_physmem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); } /* @@ -239,64 +107,49 @@ void free_initmem(void) { } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* Allocate and free page tables. */ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *pgd = __pgd_alloc(mm, 0); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (pgd) memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - free_page((unsigned long) pgd); -} - -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte; - - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - return pte; -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - - pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (pte) - pgtable_page_ctor(pte); - return pte; -} - -#ifdef CONFIG_3_LEVEL_PGTABLES -pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) -{ - pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL); - if (pmd) - memset(pmd, 0, PAGE_SIZE); - - return pmd; + return pgd; } -#endif void *uml_kmalloc(int size, int flags) { return kmalloc(size, flags); } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED +}; +DECLARE_VM_GET_PAGE_PROT + +void mark_rodata_ro(void) +{ + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + + os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0); +} diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index f116db15d402..ae6ca373c261 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -1,16 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <asm/page.h> +#include <asm/sections.h> #include <as-layout.h> #include <init.h> #include <kern.h> +#include <kern_util.h> #include <mem_user.h> #include <os.h> @@ -20,43 +22,6 @@ static int physmem_fd = -1; unsigned long high_physmem; EXPORT_SYMBOL(high_physmem); -extern unsigned long long physmem_size; - -int __init init_maps(unsigned long physmem, unsigned long iomem, - unsigned long highmem) -{ - struct page *p, *map; - unsigned long phys_len, phys_pages, highmem_len, highmem_pages; - unsigned long iomem_len, iomem_pages, total_len, total_pages; - int i; - - phys_pages = physmem >> PAGE_SHIFT; - phys_len = phys_pages * sizeof(struct page); - - iomem_pages = iomem >> PAGE_SHIFT; - iomem_len = iomem_pages * sizeof(struct page); - - highmem_pages = highmem >> PAGE_SHIFT; - highmem_len = highmem_pages * sizeof(struct page); - - total_pages = phys_pages + iomem_pages + highmem_pages; - total_len = phys_len + iomem_len + highmem_len; - - map = alloc_bootmem_low_pages(total_len); - if (map == NULL) - return -ENOMEM; - - for (i = 0; i < total_pages; i++) { - p = &map[i]; - memset(p, 0, sizeof(struct page)); - SetPageReserved(p); - INIT_LIST_HEAD(&p->lru); - } - - max_mapnr = total_pages; - return 0; -} - void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x) { @@ -75,25 +40,46 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, } } -extern int __syscall_stub_start; - +/** + * setup_physmem() - Setup physical memory for UML + * @start: Start address of the physical kernel memory, + * i.e start address of the executable image. + * @reserve_end: end address of the physical kernel memory. + * @len: Length of total physical memory that should be mapped/made + * available, in bytes. + * + * Creates an unlinked temporary file of size (len) and memory maps + * it on the last executable image address (uml_reserved). + * + * The offset is needed as the length of the total physical memory + * (len) includes the size of the memory used be the executable image, + * but the mapped-to address is the last address of the executable image + * (uml_reserved == end address of executable image). + * + * The memory mapped memory of the temporary file is used as backing memory + * of all user space processes/kernel tasks. + */ void __init setup_physmem(unsigned long start, unsigned long reserve_end, - unsigned long len, unsigned long long highmem) + unsigned long len) { unsigned long reserve = reserve_end - start; - int pfn = PFN_UP(__pa(reserve_end)); - int delta = (len - reserve) >> PAGE_SHIFT; - int err, offset, bootmap_size; + unsigned long map_size = len - reserve; + int err; + + if (len <= reserve) { + os_warn("Too few physical memory! Needed=%lu, given=%lu\n", + reserve, len); + exit(1); + } - physmem_fd = create_mem_file(len + highmem); + physmem_fd = create_mem_file(len); - offset = uml_reserved - uml_physmem; - err = os_map_memory((void *) uml_reserved, physmem_fd, offset, - len - offset, 1, 1, 1); + err = os_map_memory((void *) reserve_end, physmem_fd, reserve, + map_size, 1, 1, 1); if (err < 0) { - printf("setup_physmem - mapping %ld bytes of memory at 0x%p " - "failed - errno = %d\n", len - offset, - (void *) uml_reserved, err); + os_warn("setup_physmem - mapping %lu bytes of memory at 0x%p " + "failed - errno = %d\n", map_size, + (void *) reserve_end, err); exit(1); } @@ -101,12 +87,14 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, * Special kludge - This page will be mapped in to userspace processes * from physmem_fd, so it needs to be written out there. */ - os_seek_file(physmem_fd, __pa(&__syscall_stub_start)); - os_write_file(physmem_fd, &__syscall_stub_start, PAGE_SIZE); + os_seek_file(physmem_fd, __pa(__syscall_stub_start)); + os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); + + memblock_add(__pa(start), len); + memblock_reserve(__pa(start), reserve); - bootmap_size = init_bootmem(pfn, pfn + delta); - free_bootmem(__pa(reserve_end) + bootmap_size, - len - bootmap_size - reserve); + min_low_pfn = PFN_UP(__pa(reserve_end)); + max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT); } int phys_mapping(unsigned long phys, unsigned long long *offset_out) @@ -117,30 +105,16 @@ int phys_mapping(unsigned long phys, unsigned long long *offset_out) fd = physmem_fd; *offset_out = phys; } - else if (phys < __pa(end_iomem)) { - struct iomem_region *region = iomem_regions; - - while (region != NULL) { - if ((phys >= region->phys) && - (phys < region->phys + region->size)) { - fd = region->fd; - *offset_out = phys - region->phys; - break; - } - region = region->next; - } - } - else if (phys < __pa(end_iomem) + highmem) { - fd = physmem_fd; - *offset_out = phys - iomem_size; - } return fd; } +EXPORT_SYMBOL(phys_mapping); static int __init uml_mem_setup(char *line, int *add) { char *retptr; + + *add = 0; physmem_size = memparse(line,&retptr); return 0; } @@ -153,63 +127,3 @@ __uml_setup("mem=", uml_mem_setup, " be more, and the excess, if it's ever used, will just be swapped out.\n" " Example: mem=64M\n\n" ); - -extern int __init parse_iomem(char *str, int *add); - -__uml_setup("iomem=", parse_iomem, -"iomem=<name>,<file>\n" -" Configure <file> as an IO memory region named <name>.\n\n" -); - -/* - * This list is constructed in parse_iomem and addresses filled in in - * setup_iomem, both of which run during early boot. Afterwards, it's - * unchanged. - */ -struct iomem_region *iomem_regions; - -/* Initialized in parse_iomem and unchanged thereafter */ -int iomem_size; - -unsigned long find_iomem(char *driver, unsigned long *len_out) -{ - struct iomem_region *region = iomem_regions; - - while (region != NULL) { - if (!strcmp(region->driver, driver)) { - *len_out = region->size; - return region->virt; - } - - region = region->next; - } - - return 0; -} -EXPORT_SYMBOL(find_iomem); - -static int setup_iomem(void) -{ - struct iomem_region *region = iomem_regions; - unsigned long iomem_start = high_physmem + PAGE_SIZE; - int err; - - while (region != NULL) { - err = os_map_memory((void *) iomem_start, region->fd, 0, - region->size, 1, 1, 0); - if (err) - printk(KERN_ERR "Mapping iomem region for driver '%s' " - "failed, errno = %d\n", region->driver, -err); - else { - region->virt = iomem_start; - region->phys = __pa(region->virt); - } - - iomem_start += region->size + PAGE_SIZE; - region = region->next; - } - - return 0; -} - -__initcall(setup_iomem); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index bbcef522bcb1..63b38a3f73f7 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -1,7 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Copyright 2003 PathScale, Inc. - * Licensed under the GPL */ #include <linux/stddef.h> @@ -13,44 +15,38 @@ #include <linux/proc_fs.h> #include <linux/ptrace.h> #include <linux/random.h> +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/seq_file.h> #include <linux/tick.h> #include <linux/threads.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <asm/current.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> -#include <asm/uaccess.h> +#include <asm/switch_to.h> +#include <asm/exec.h> +#include <linux/uaccess.h> #include <as-layout.h> #include <kern_util.h> #include <os.h> #include <skas.h> +#include <registers.h> +#include <linux/time-internal.h> +#include <linux/elfcore.h> /* * This is a per-cpu array. A processor only modifies its entry and it only * cares about its entry, so it's OK if another processor is modifying its * entry. */ -struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } }; - -static inline int external_pid(void) -{ - /* FIXME: Need to look up userspace_pid by cpu */ - return userspace_pid[0]; -} - -int pid_to_processor_id(int pid) -{ - int i; - - for (i = 0; i < ncpus; i++) { - if (cpu_tasks[i].pid == pid) - return i; - } - return -1; -} +struct task_struct *cpu_tasks[NR_CPUS] = { + [0 ... NR_CPUS - 1] = &init_task, +}; +EXPORT_SYMBOL(cpu_tasks); void free_stack(unsigned long stack, int order) { @@ -71,46 +67,35 @@ unsigned long alloc_stack(int order, int atomic) static inline void set_current(struct task_struct *task) { - cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task) - { external_pid(), task }); + cpu_tasks[task_thread_info(task)->cpu] = task; } -extern void arch_switch_to(struct task_struct *to); - -void *__switch_to(struct task_struct *from, struct task_struct *to) +struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to) { to->thread.prev_sched = from; set_current(to); - do { - current->thread.saved_task = NULL; - - switch_threads(&from->thread.switch_buf, - &to->thread.switch_buf); - - arch_switch_to(current); - - if (current->thread.saved_task) - show_regs(&(current->thread.regs)); - to = current->thread.saved_task; - from = current; - } while (current->thread.saved_task); + switch_threads(&from->thread.switch_buf, &to->thread.switch_buf); + arch_switch_to(current); return current->thread.prev_sched; } void interrupt_end(void) { - if (need_resched()) - schedule(); - if (test_thread_flag(TIF_SIGPENDING)) - do_signal(); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) - tracehook_notify_resume(¤t->thread.regs); -} - -void exit_thread(void) -{ + struct pt_regs *regs = ¤t->thread.regs; + unsigned long thread_flags; + + thread_flags = read_thread_flags(); + while (thread_flags & _TIF_WORK_MASK) { + if (thread_flags & _TIF_NEED_RESCHED) + schedule(); + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + do_signal(regs); + if (thread_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); + thread_flags = read_thread_flags(); + } } int get_current_pid(void) @@ -124,28 +109,26 @@ int get_current_pid(void) */ void new_thread_handler(void) { - int (*fn)(void *), n; + int (*fn)(void *); void *arg; if (current->thread.prev_sched != NULL) schedule_tail(current->thread.prev_sched); current->thread.prev_sched = NULL; - fn = current->thread.request.u.thread.proc; - arg = current->thread.request.u.thread.arg; + fn = current->thread.request.thread.proc; + arg = current->thread.request.thread.arg; /* * callback returns only if the kernel thread execs a process */ - n = fn(arg); + fn(arg); userspace(¤t->thread.regs.regs); } /* Called magically, see new_thread_handler above */ -void fork_handler(void) +static void fork_handler(void) { - force_flush_all(); - schedule_tail(current->thread.prev_sched); /* @@ -160,16 +143,17 @@ void fork_handler(void) userspace(¤t->thread.regs.regs); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct * p) +int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) { + u64 clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; void (*handler)(void); - int kthread = current->flags & PF_KTHREAD; int ret = 0; p->thread = (struct thread_struct) INIT_THREAD; - if (!kthread) { + if (!args->fn) { memcpy(&p->thread.regs.regs, current_pt_regs(), sizeof(p->thread.regs.regs)); PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0); @@ -181,21 +165,21 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, arch_copy_thread(¤t->thread.arch, &p->thread.arch); } else { get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); - p->thread.request.u.thread.proc = (int (*)(void *))sp; - p->thread.request.u.thread.arg = (void *)arg; + p->thread.request.thread.proc = args->fn; + p->thread.request.thread.arg = args->fn_arg; handler = new_thread_handler; } new_thread(task_stack_page(p), &p->thread.switch_buf, handler); - if (!kthread) { + if (!args->fn) { clear_flushed_tls(p); /* * Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) - ret = arch_copy_tls(p); + ret = arch_set_tls(p, tls); } return ret; @@ -203,34 +187,50 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, void initial_thread_cb(void (*proc)(void *), void *arg) { - int save_kmalloc_ok = kmalloc_ok; - - kmalloc_ok = 0; initial_thread_cb_skas(proc, arg); - kmalloc_ok = save_kmalloc_ok; +} + +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + /* init_task is not dynamically sized (missing FPU state) */ + if (unlikely(src == &init_task)) { + memcpy(dst, src, sizeof(init_task)); + memset((void *)dst + sizeof(init_task), 0, + arch_task_struct_size - sizeof(init_task)); + } else { + memcpy(dst, src, arch_task_struct_size); + } + + return 0; +} + +void um_idle_sleep(void) +{ + if (time_travel_mode != TT_MODE_OFF) + time_travel_sleep(); + else + os_idle_sleep(); } void arch_cpu_idle(void) { - unsigned long long nsecs; + um_idle_sleep(); +} - cpu_tasks[current_thread_info()->cpu].pid = os_getpid(); - nsecs = disable_timer(); - idle_sleep(nsecs); - local_irq_enable(); +void arch_cpu_idle_prepare(void) +{ + os_idle_prepare(); } -int __cant_sleep(void) { +int __uml_cant_sleep(void) { return in_atomic() || irqs_disabled() || in_interrupt(); /* Is in_interrupt() really needed? */ } -int user_context(unsigned long sp) +int uml_need_resched(void) { - unsigned long stack; - - stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); - return stack != (unsigned long) current_thread_info(); + return need_resched(); } extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; @@ -250,127 +250,20 @@ char *uml_strdup(const char *string) } EXPORT_SYMBOL(uml_strdup); -int copy_to_user_proc(void __user *to, void *from, int size) -{ - return copy_to_user(to, from, size); -} - int copy_from_user_proc(void *to, void __user *from, int size) { return copy_from_user(to, from, size); } -int clear_user_proc(void __user *buf, int size) -{ - return clear_user(buf, size); -} - -int strlen_user_proc(char __user *str) -{ - return strlen_user(str); -} - -int smp_sigio_handler(void) -{ -#ifdef CONFIG_SMP - int cpu = current_thread_info()->cpu; - IPI_handler(cpu); - if (cpu != 0) - return 1; -#endif - return 0; -} - -int cpu(void) +int singlestepping(void) { - return current_thread_info()->cpu; -} - -static atomic_t using_sysemu = ATOMIC_INIT(0); -int sysemu_supported; - -void set_using_sysemu(int value) -{ - if (value > sysemu_supported) - return; - atomic_set(&using_sysemu, value); -} - -int get_using_sysemu(void) -{ - return atomic_read(&using_sysemu); -} - -static int sysemu_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%d\n", get_using_sysemu()); - return 0; -} - -static int sysemu_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, sysemu_proc_show, NULL); -} - -static ssize_t sysemu_proc_write(struct file *file, const char __user *buf, - size_t count, loff_t *pos) -{ - char tmp[2]; - - if (copy_from_user(tmp, buf, 1)) - return -EFAULT; - - if (tmp[0] >= '0' && tmp[0] <= '2') - set_using_sysemu(tmp[0] - '0'); - /* We use the first char, but pretend to write everything */ - return count; -} - -static const struct file_operations sysemu_proc_fops = { - .owner = THIS_MODULE, - .open = sysemu_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = sysemu_proc_write, -}; - -int __init make_proc_sysemu(void) -{ - struct proc_dir_entry *ent; - if (!sysemu_supported) - return 0; - - ent = proc_create("sysemu", 0600, NULL, &sysemu_proc_fops); - - if (ent == NULL) - { - printk(KERN_WARNING "Failed to register /proc/sysemu\n"); - return 0; - } - - return 0; -} - -late_initcall(make_proc_sysemu); - -int singlestepping(void * t) -{ - struct task_struct *task = t ? t : current; - - if (!(task->ptrace & PT_DTRACE)) - return 0; - - if (task->thread.singlestep_syscall) - return 1; - - return 2; + return test_thread_flag(TIF_SINGLESTEP); } /* * Only x86 and x86_64 have an arch_align_stack(). * All other arches have "#define arch_align_stack(x) (x)" - * in their asm/system.h + * in their asm/exec.h * As this is included in UML from asm-um/system-generic.h, * we can use it to behave as the subarch does. */ @@ -378,19 +271,16 @@ int singlestepping(void * t) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; + sp -= get_random_u32_below(8192); return sp & ~0xf; } #endif -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long stack_page, sp, ip; bool seen_sched = 0; - if ((p == NULL) || (p == current) || (p->state == TASK_RUNNING)) - return 0; - stack_page = (unsigned long) task_stack_page(p); /* Bail if the process has no kernel stack for some reason */ if (stack_page == 0) @@ -417,11 +307,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } - -int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu) -{ - int cpu = current_thread_info()->cpu; - - return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu); -} - diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 694d551c8899..fdbb37b5c399 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -1,21 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/audit.h> #include <linux/ptrace.h> #include <linux/sched.h> -#include <linux/tracehook.h> -#include <asm/uaccess.h> -#include <skas_ptrace.h> - +#include <linux/uaccess.h> +#include <asm/ptrace-abi.h> +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> void user_enable_single_step(struct task_struct *child) { - child->ptrace |= PT_DTRACE; - child->thread.singlestep_syscall = 0; + set_tsk_thread_flag(child, TIF_SINGLESTEP); #ifdef SUBARCH_SET_SINGLESTEPPING SUBARCH_SET_SINGLESTEPPING(child, 1); @@ -24,8 +23,7 @@ void user_enable_single_step(struct task_struct *child) void user_disable_single_step(struct task_struct *child) { - child->ptrace &= ~PT_DTRACE; - child->thread.singlestep_syscall = 0; + clear_tsk_thread_flag(child, TIF_SINGLESTEP); #ifdef SUBARCH_SET_SINGLESTEPPING SUBARCH_SET_SINGLESTEPPING(child, 0); @@ -40,9 +38,6 @@ void ptrace_disable(struct task_struct *child) user_disable_single_step(child); } -extern int peek_user(struct task_struct * child, long addr, long data); -extern int poke_user(struct task_struct * child, long addr, long data); - long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { @@ -68,7 +63,7 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef PTRACE_GETREGS case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, p, MAX_REG_OFFSET)) { + if (!access_ok(p, MAX_REG_OFFSET)) { ret = -EIO; break; } @@ -83,7 +78,7 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef PTRACE_SETREGS case PTRACE_SETREGS: { /* Set all gp regs in the child. */ unsigned long tmp = 0; - if (!access_ok(VERIFY_READ, p, MAX_REG_OFFSET)) { + if (!access_ok(p, MAX_REG_OFFSET)) { ret = -EIO; break; } @@ -104,35 +99,6 @@ long arch_ptrace(struct task_struct *child, long request, ret = ptrace_set_thread_area(child, addr, vp); break; - case PTRACE_FAULTINFO: { - /* - * Take the info from thread->arch->faultinfo, - * but transfer max. sizeof(struct ptrace_faultinfo). - * On i386, ptrace_faultinfo is smaller! - */ - ret = copy_to_user(p, &child->thread.arch.faultinfo, - sizeof(struct ptrace_faultinfo)) ? - -EIO : 0; - break; - } - -#ifdef PTRACE_LDT - case PTRACE_LDT: { - struct ptrace_ldt ldt; - - if (copy_from_user(&ldt, p, sizeof(ldt))) { - ret = -EIO; - break; - } - - /* - * This one is confusing, so just punt and return -EIO for - * now - */ - ret = -EIO; - break; - } -#endif default: ret = ptrace_request(child, request, addr, data); if (ret == -EIO) @@ -143,39 +109,33 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } -static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs, - int error_code) +static void send_sigtrap(struct uml_pt_regs *regs, int error_code) { - struct siginfo info; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; - - /* User-mode eip? */ - info.si_addr = UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL; - /* Send us the fake SIGTRAP */ - force_sig_info(SIGTRAP, &info, tsk); + force_sig_fault(SIGTRAP, TRAP_BRKPT, + /* User-mode eip? */ + UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL); } /* - * XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and + * XXX Check TIF_SINGLESTEP for singlestepping check and * PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check */ -void syscall_trace_enter(struct pt_regs *regs) +int syscall_trace_enter(struct pt_regs *regs) { - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(®s->regs), + audit_syscall_entry(UPT_SYSCALL_NR(®s->regs), UPT_SYSCALL_ARG1(®s->regs), UPT_SYSCALL_ARG2(®s->regs), UPT_SYSCALL_ARG3(®s->regs), UPT_SYSCALL_ARG4(®s->regs)); + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, UPT_SYSCALL_NR(®s->regs)); + if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; + return 0; - tracehook_report_syscall_entry(regs); + return ptrace_report_syscall_entry(regs); } void syscall_trace_leave(struct pt_regs *regs) @@ -185,13 +145,16 @@ void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(regs); /* Fake a debug trap */ - if (ptraced & PT_DTRACE) - send_sigtrap(current, ®s->regs, 0); + if (test_thread_flag(TIF_SINGLESTEP)) + send_sigtrap(®s->regs, 0); + + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_exit(regs, PT_REGS_SYSCALL_RET(regs)); if (!test_thread_flag(TIF_SYSCALL_TRACE)) return; - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); /* force do_signal() --> is_syscall() */ if (ptraced & PT_PTRACED) set_thread_flag(TIF_SIGPENDING); diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index ced8903921ae..680bce4bd8fa 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -1,42 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/oom.h> +#include <linux/reboot.h> #include <kern_util.h> #include <os.h> #include <skas.h> void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static void kill_off_processes(void) { - if (proc_mm) - /* - * FIXME: need to loop over userspace_pids - */ - os_kill_ptraced_process(userspace_pid[0], 1); - else { - struct task_struct *p; - int pid; + struct task_struct *p; + int pid; - read_lock(&tasklist_lock); - for_each_process(p) { - struct task_struct *t; + read_lock(&tasklist_lock); + for_each_process(p) { + struct task_struct *t; - t = find_lock_task_mm(p); - if (!t) - continue; - pid = t->mm->context.id.u.pid; - task_unlock(t); - os_kill_ptraced_process(pid, 1); - } - read_unlock(&tasklist_lock); + t = find_lock_task_mm(p); + if (!t) + continue; + pid = t->mm->context.id.pid; + task_unlock(t); + os_kill_ptraced_process(pid, 1); } + read_unlock(&tasklist_lock); } void uml_cleanup(void) @@ -62,3 +59,18 @@ void machine_halt(void) { machine_power_off(); } + +static int sys_power_off_handler(struct sys_off_data *data) +{ + machine_power_off(); + return 0; +} + +static int register_power_off(void) +{ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + sys_power_off_handler, NULL); + return 0; +} +__initcall(register_power_off); diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index b5e0cbb34382..4fc04742048a 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) - * Licensed under the GPL */ #include <linux/interrupt.h> @@ -8,42 +8,15 @@ #include <os.h> #include <sigio.h> -/* Protected by sigio_lock() called from write_sigio_workaround */ -static int sigio_irq_fd = -1; - -static irqreturn_t sigio_interrupt(int irq, void *data) -{ - char c; - - os_read_file(sigio_irq_fd, &c, sizeof(c)); - reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ); - return IRQ_HANDLED; -} - -int write_sigio_irq(int fd) -{ - int err; - - err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, - 0, "write sigio", NULL); - if (err) { - printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " - "err = %d\n", err); - return -1; - } - sigio_irq_fd = fd; - return 0; -} - /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ -static DEFINE_SPINLOCK(sigio_spinlock); +static DEFINE_MUTEX(sigio_mutex); void sigio_lock(void) { - spin_lock(&sigio_spinlock); + mutex_lock(&sigio_mutex); } void sigio_unlock(void) { - spin_unlock(&sigio_spinlock); + mutex_unlock(&sigio_mutex); } diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 3e831b3fd07b..a56b44522766 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -1,32 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched.h> +#include <linux/ftrace.h> #include <asm/siginfo.h> #include <asm/signal.h> #include <asm/unistd.h> #include <frame_kern.h> #include <kern_util.h> +#include <os.h> EXPORT_SYMBOL(block_signals); EXPORT_SYMBOL(unblock_signals); +void block_signals_trace(void) +{ + block_signals(); + if (current_thread_info()) + trace_hardirqs_off(); +} + +void unblock_signals_trace(void) +{ + if (current_thread_info()) + trace_hardirqs_on(); + unblock_signals(); +} + +void um_trace_signals_on(void) +{ + if (current_thread_info()) + trace_hardirqs_on(); +} + +void um_trace_signals_off(void) +{ + if (current_thread_info()) + trace_hardirqs_off(); +} + /* * OK, we're invoking a handler */ -static void handle_signal(struct pt_regs *regs, unsigned long signr, - struct k_sigaction *ka, siginfo_t *info) +static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); int singlestep = 0; unsigned long sp; int err; - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + if (test_thread_flag(TIF_SINGLESTEP) && (current->ptrace & PT_PTRACED)) singlestep = 1; /* Did we come from a system call? */ @@ -39,11 +66,11 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, break; case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { PT_REGS_SYSCALL_RET(regs) = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: PT_REGS_RESTART_SYSCALL(regs); PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs); @@ -52,32 +79,28 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, } sp = PT_REGS_SP(regs); - if ((ka->sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0)) + if ((ksig->ka.sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0)) sp = current->sas_ss_sp + current->sas_ss_size; #ifdef CONFIG_ARCH_HAS_SC_SIGNALS - if (!(ka->sa.sa_flags & SA_SIGINFO)) - err = setup_signal_stack_sc(sp, signr, ka, regs, oldset); + if (!(ksig->ka.sa.sa_flags & SA_SIGINFO)) + err = setup_signal_stack_sc(sp, ksig, regs, oldset); else #endif - err = setup_signal_stack_si(sp, signr, ka, regs, info, oldset); + err = setup_signal_stack_si(sp, ksig, regs, oldset); - if (err) - force_sigsegv(signr, current); - else - signal_delivered(signr, info, ka, regs, singlestep); + signal_setup_done(err, ksig, singlestep); } -static int kern_do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { - struct k_sigaction ka_copy; - siginfo_t info; - int sig, handled_sig = 0; + struct ksignal ksig; + int handled_sig = 0; - while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) { + while (get_signal(&ksig)) { handled_sig = 1; /* Whee! Actually deliver the signal. */ - handle_signal(regs, sig, &ka_copy, &info); + handle_signal(&ksig, regs); } /* Did we come from a system call? */ @@ -98,27 +121,9 @@ static int kern_do_signal(struct pt_regs *regs) } /* - * This closes a way to execute a system call on the host. If - * you set a breakpoint on a system call instruction and singlestep - * from it, the tracing thread used to PTRACE_SINGLESTEP the process - * rather than PTRACE_SYSCALL it, allowing the system call to execute - * on the host. The tracing thread will check this flag and - * PTRACE_SYSCALL if necessary. - */ - if (current->ptrace & PT_DTRACE) - current->thread.singlestep_syscall = - is_syscall(PT_REGS_IP(¤t->thread.regs)); - - /* * if there's no signal to deliver, we just put the saved sigmask * back */ if (!handled_sig) restore_saved_sigmask(); - return handled_sig; -} - -int do_signal(void) -{ - return kern_do_signal(¤t->thread.regs); } diff --git a/arch/um/kernel/skas/.gitignore b/arch/um/kernel/skas/.gitignore new file mode 100644 index 000000000000..c3409ced0f38 --- /dev/null +++ b/arch/um/kernel/skas/.gitignore @@ -0,0 +1,2 @@ +stub_exe +stub_exe.dbg diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index 0b76d8869c94..3384be42691f 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -1,15 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) -# Licensed under the GPL # -obj-y := clone.o mmu.o process.o syscall.o uaccess.o +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ + stub_exe_embed.o -# clone.o is in the stub, so it can't be built with profiling +# Stub executable + +stub_exe_objs-y := stub_exe.o + +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) + +# Object file containing the ELF executable +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe + +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE + $(call if_changed,stub_exe) + +$(obj)/stub_exe: OBJCOPYFLAGS := -S +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE + $(call if_changed,objcopy) + +quiet_cmd_stub_exe = STUB_EXE $@ + cmd_stub_exe = $(CC) -nostdlib -o $@ \ + $(filter-out $(UM_GPROF_OPT) $(UM_GCOV_OPT),$(KBUILD_CFLAGS)) $(STUB_EXE_LDFLAGS) \ + $(filter %.o,$^) + +STUB_EXE_LDFLAGS = -Wl,-n -static + +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) + +# end + +# stub.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it -CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := clone.o +CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) + +# Clang will call memset() from __builtin_alloca() when stack variable +# initialization is enabled, which is used in stub_exe.c. +CFLAGS_stub_exe.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) + +UNPROFILE_OBJS := stub.o stub_exe.o +KCOV_INSTRUMENT := n -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c deleted file mode 100644 index 289771dadf81..000000000000 --- a/arch/um/kernel/skas/clone.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <signal.h> -#include <sched.h> -#include <asm/unistd.h> -#include <sys/time.h> -#include <as-layout.h> -#include <ptrace_user.h> -#include <stub-data.h> -#include <sysdep/stub.h> - -/* - * This is in a separate file because it needs to be compiled with any - * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled - * - * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize - * on some systems. - */ - -void __attribute__ ((__section__ (".__syscall_stub"))) -stub_clone_handler(void) -{ - struct stub_data *data = (struct stub_data *) STUB_DATA; - long err; - - err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, - STUB_DATA + UM_KERN_PAGE_SIZE / 2 - sizeof(void *)); - if (err != 0) - goto out; - - err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - if (err) - goto out; - - err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL, - (long) &data->timer, 0); - if (err) - goto out; - - remap_stack(data->fd, data->offset); - goto done; - - out: - /* - * save current result. - * Parent: pid; - * child: retcode of mmap already saved and it jumps around this - * assignment - */ - data->err = err; - done: - trap_myself(); -} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index ff03067a3b14..00957788591b 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -1,178 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/mm.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> + +#include <shared/irq_kern.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <os.h> #include <skas.h> +#include <stub-data.h> -extern int __syscall_stub_start; +/* Ensure the stub_data struct covers the allocated area */ +static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); -static int init_stub_pte(struct mm_struct *mm, unsigned long proc, - unsigned long kernel) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = pgd_offset(mm, proc); - pud = pud_alloc(mm, pgd, proc); - if (!pud) - goto out; +static spinlock_t mm_list_lock; +static struct list_head mm_list; - pmd = pmd_alloc(mm, pud, proc); - if (!pmd) - goto out_pmd; +void enter_turnstile(struct mm_id *mm_id) __acquires(turnstile) +{ + struct mm_context *ctx = container_of(mm_id, struct mm_context, id); - pte = pte_alloc_map(mm, NULL, pmd, proc); - if (!pte) - goto out_pte; + mutex_lock(&ctx->turnstile); +} - *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); - *pte = pte_mkread(*pte); - return 0; +void exit_turnstile(struct mm_id *mm_id) __releases(turnstile) +{ + struct mm_context *ctx = container_of(mm_id, struct mm_context, id); - out_pte: - pmd_free(mm, pmd); - out_pmd: - pud_free(mm, pud); - out: - return -ENOMEM; + mutex_unlock(&ctx->turnstile); } int init_new_context(struct task_struct *task, struct mm_struct *mm) { - struct mm_context *from_mm = NULL; - struct mm_context *to_mm = &mm->context; + struct mm_id *new_id = &mm->context.id; unsigned long stack = 0; int ret = -ENOMEM; - if (skas_needs_stub) { - stack = get_zeroed_page(GFP_KERNEL); - if (stack == 0) - goto out; - } + mutex_init(&mm->context.turnstile); + spin_lock_init(&mm->context.sync_tlb_lock); - to_mm->id.stack = stack; - if (current->mm != NULL && current->mm != &init_mm) - from_mm = ¤t->mm->context; + stack = __get_free_pages(GFP_KERNEL | __GFP_ZERO, ilog2(STUB_DATA_PAGES)); + if (stack == 0) + goto out; - if (proc_mm) { - ret = new_mm(stack); - if (ret < 0) { - printk(KERN_ERR "init_new_context_skas - " - "new_mm failed, errno = %d\n", ret); - goto out_free; - } - to_mm->id.u.mm_fd = ret; - } - else { - if (from_mm) - to_mm->id.u.pid = copy_context_skas0(stack, - from_mm->id.u.pid); - else to_mm->id.u.pid = start_userspace(stack); - - if (to_mm->id.u.pid < 0) { - ret = to_mm->id.u.pid; - goto out_free; - } + new_id->stack = stack; + new_id->syscall_data_len = 0; + new_id->syscall_fd_num = 0; + + scoped_guard(spinlock_irqsave, &mm_list_lock) { + /* Insert into list, used for lookups when the child dies */ + list_add(&mm->context.list, &mm_list); } - ret = init_new_ldt(to_mm, from_mm); - if (ret < 0) { - printk(KERN_ERR "init_new_context_skas - init_ldt" - " failed, errno = %d\n", ret); + ret = start_userspace(new_id); + if (ret < 0) goto out_free; - } + + /* Ensure the new MM is clean and nothing unwanted is mapped */ + unmap(new_id, 0, STUB_START); return 0; out_free: - if (to_mm->id.stack != 0) - free_page(to_mm->id.stack); + free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); out: return ret; } -void uml_setup_stubs(struct mm_struct *mm) +void destroy_context(struct mm_struct *mm) { - int err, ret; + struct mm_context *mmu = &mm->context; - if (!skas_needs_stub) + /* + * If init_new_context wasn't called, this will be + * zero, resulting in a kill(0), which will result in the + * whole UML suddenly dying. Also, cover negative and + * 1 cases, since they shouldn't happen either. + * + * Negative cases happen if the child died unexpectedly. + */ + if (mmu->id.pid >= 0 && mmu->id.pid < 2) { + printk(KERN_ERR "corrupt mm_context - pid = %d\n", + mmu->id.pid); return; + } - ret = init_stub_pte(mm, STUB_CODE, - (unsigned long) &__syscall_stub_start); - if (ret) - goto out; - - ret = init_stub_pte(mm, STUB_DATA, mm->context.id.stack); - if (ret) - goto out; - - mm->context.stub_pages[0] = virt_to_page(&__syscall_stub_start); - mm->context.stub_pages[1] = virt_to_page(mm->context.id.stack); + scoped_guard(spinlock_irqsave, &mm_list_lock) + list_del(&mm->context.list); - /* dup_mmap already holds mmap_sem */ - err = install_special_mapping(mm, STUB_START, STUB_END - STUB_START, - VM_READ | VM_MAYREAD | VM_EXEC | - VM_MAYEXEC | VM_DONTCOPY, - mm->context.stub_pages); - if (err) { - printk(KERN_ERR "install_special_mapping returned %d\n", err); - goto out; + if (mmu->id.pid > 0) { + os_kill_ptraced_process(mmu->id.pid, 1); + mmu->id.pid = -1; } - return; -out: - force_sigsegv(SIGSEGV, current); + if (using_seccomp && mmu->id.sock) + os_close_file(mmu->id.sock); + + free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); } -void arch_exit_mmap(struct mm_struct *mm) +static irqreturn_t mm_sigchld_irq(int irq, void* dev) { - pte_t *pte; + struct mm_context *mm_context; + pid_t pid; - pte = virt_to_pte(mm, STUB_CODE); - if (pte != NULL) - pte_clear(mm, STUB_CODE, pte); + guard(spinlock)(&mm_list_lock); - pte = virt_to_pte(mm, STUB_DATA); - if (pte == NULL) - return; + while ((pid = os_reap_child()) > 0) { + /* + * A child died, check if we have an MM with the PID. This is + * only relevant in SECCOMP mode (as ptrace will fail anyway). + * + * See wait_stub_done_seccomp for more details. + */ + list_for_each_entry(mm_context, &mm_list, list) { + if (mm_context->id.pid == pid) { + struct stub_data *stub_data; + printk("Unexpectedly lost MM child! Affected tasks will segfault."); + + /* Marks the MM as dead */ + mm_context->id.pid = -1; + + stub_data = (void *)mm_context->id.stack; + stub_data->futex = FUTEX_IN_KERN; +#if IS_ENABLED(CONFIG_SMP) + os_futex_wake(&stub_data->futex); +#endif + + /* + * NOTE: Currently executing syscalls by + * affected tasks may finish normally. + */ + break; + } + } + } - pte_clear(mm, STUB_DATA, pte); + return IRQ_HANDLED; } -void destroy_context(struct mm_struct *mm) +static int __init init_child_tracking(void) { - struct mm_context *mmu = &mm->context; + int err; - if (proc_mm) - os_close_file(mmu->id.u.mm_fd); - else { - /* - * If init_new_context wasn't called, this will be - * zero, resulting in a kill(0), which will result in the - * whole UML suddenly dying. Also, cover negative and - * 1 cases, since they shouldn't happen either. - */ - if (mmu->id.u.pid < 2) { - printk(KERN_ERR "corrupt mm_context - pid = %d\n", - mmu->id.u.pid); - return; - } - os_kill_ptraced_process(mmu->id.u.pid, 1); - } + spin_lock_init(&mm_list_lock); + INIT_LIST_HEAD(&mm_list); - if (skas_needs_stub) - free_page(mmu->id.stack); + err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL); + if (err < 0) + panic("Failed to register SIGCHLD IRQ: %d", err); - free_ldt(mmu); + return 0; } +early_initcall(init_child_tracking) diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 4da11b3c8ddb..4a7673b0261a 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -1,73 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/init.h> -#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/task.h> +#include <linux/smp-internal.h> + +#include <asm/tlbflush.h> + #include <as-layout.h> #include <kern.h> #include <os.h> #include <skas.h> - -int new_mm(unsigned long stack) -{ - int fd, err; - - fd = os_open_file("/proc/mm", of_cloexec(of_write(OPENFLAGS())), 0); - if (fd < 0) - return fd; - - if (skas_needs_stub) { - err = map_stub_pages(fd, STUB_CODE, STUB_DATA, stack); - if (err) { - os_close_file(fd); - return err; - } - } - - return fd; -} +#include <kern_util.h> extern void start_kernel(void); static int __init start_kernel_proc(void *unused) { - int pid; - - block_signals(); - pid = os_getpid(); + block_signals_trace(); - cpu_tasks[0].pid = pid; - cpu_tasks[0].task = current; -#ifdef CONFIG_SMP - init_cpu_online(get_cpu_mask(0)); -#endif start_kernel(); return 0; } -extern int userspace_pid[]; - -extern char cpu0_irqstack[]; +char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE); int __init start_uml(void) { - stack_protections((unsigned long) &cpu0_irqstack); - set_sigstack(cpu0_irqstack, THREAD_SIZE); - if (proc_mm) { - userspace_pid[0] = start_userspace(0); - if (userspace_pid[0] < 0) { - printf("start_uml - start_userspace returned %d\n", - userspace_pid[0]); - exit(1); - } - } + stack_protections((unsigned long) &cpu_irqstacks[0]); + set_sigstack(cpu_irqstacks[0], THREAD_SIZE); init_new_thread_signals(); - init_task.thread.request.u.thread.proc = start_kernel_proc; - init_task.thread.request.u.thread.arg = NULL; + init_task.thread.request.thread.proc = start_kernel_proc; + init_task.thread.request.thread.arg = NULL; return start_idle_thread(task_stack_page(&init_task), &init_task.thread.switch_buf); } @@ -79,3 +49,31 @@ unsigned long current_stub_stack(void) return current->mm->context.id.stack; } + +struct mm_id *current_mm_id(void) +{ + if (current->mm == NULL) + return NULL; + + return ¤t->mm->context.id; +} + +void current_mm_sync(void) +{ + if (current->mm == NULL) + return; + + um_tlb_sync(current->mm); +} + +static DEFINE_SPINLOCK(initial_jmpbuf_spinlock); + +void initial_jmpbuf_lock(void) +{ + spin_lock_irq(&initial_jmpbuf_spinlock); +} + +void initial_jmpbuf_unlock(void) +{ + spin_unlock_irq(&initial_jmpbuf_spinlock); +} diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c new file mode 100644 index 000000000000..67cab46a602c --- /dev/null +++ b/arch/um/kernel/skas/stub.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> + */ + +#include <sysdep/stub.h> + +#include <linux/futex.h> +#include <sys/socket.h> +#include <errno.h> + +/* + * Known security issues + * + * Userspace can jump to this address to execute *any* syscall that is + * permitted by the stub. As we will return afterwards, it can do + * whatever it likes, including: + * - Tricking the kernel into handing out the memory FD + * - Using this memory FD to read/write all physical memory + * - Running in parallel to the kernel processing a syscall + * (possibly creating data races?) + * - Blocking e.g. SIGALRM to avoid time based scheduling + * + * To avoid this, the permitted location for each syscall needs to be + * checked for in the SECCOMP filter (which is reasonably simple). Also, + * more care will need to go into considerations how the code might be + * tricked by using a prepared stack (or even modifying the stack from + * another thread in case SMP support is added). + * + * As for the SIGALRM, the best counter measure will be to check in the + * kernel that the process is reporting back the SIGALRM in a timely + * fashion. + */ +static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS]) +{ + struct stub_data *d = get_stub_data(); + int i; + unsigned long res; + int fd; + + for (i = 0; i < d->syscall_data_len; i++) { + struct stub_syscall *sc = &d->syscall_data[i]; + + switch (sc->syscall) { + case STUB_SYSCALL_MMAP: + if (fd_map) + fd = fd_map[sc->mem.fd]; + else + fd = sc->mem.fd; + + res = stub_syscall6(STUB_MMAP_NR, + sc->mem.addr, sc->mem.length, + sc->mem.prot, + MAP_SHARED | MAP_FIXED, + fd, sc->mem.offset); + if (res != sc->mem.addr) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + case STUB_SYSCALL_MUNMAP: + res = stub_syscall2(__NR_munmap, + sc->mem.addr, sc->mem.length); + if (res) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + default: + d->err = -95; /* EOPNOTSUPP */ + d->syscall_data_len = i; + return -1; + } + } + + d->err = 0; + d->syscall_data_len = 0; + + return 0; +} + +void __section(".__syscall_stub") +stub_syscall_handler(void) +{ + syscall_handler(NULL); + + trap_myself(); +} + +void __section(".__syscall_stub") +stub_signal_interrupt(int sig, siginfo_t *info, void *p) +{ + struct stub_data *d = get_stub_data(); + char rcv_data; + union { + char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)]; + struct cmsghdr align; + } ctrl = {}; + struct iovec iov = { + .iov_base = &rcv_data, + .iov_len = 1, + }; + struct msghdr msghdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &ctrl, + .msg_controllen = sizeof(ctrl), + }; + ucontext_t *uc = p; + struct cmsghdr *fd_msg; + int *fd_map; + int num_fds; + long res; + + d->signal = sig; + d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0]; + d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0]; + +restart_wait: + d->futex = FUTEX_IN_KERN; + do { + res = stub_syscall3(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAKE, 1); + } while (res == -EINTR); + + do { + res = stub_syscall4(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAIT, FUTEX_IN_KERN, 0); + } while (res == -EINTR || d->futex == FUTEX_IN_KERN); + + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + if (d->syscall_data_len) { + /* Read passed FDs (if any) */ + do { + res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0); + } while (res == -EINTR); + + /* We should never have a receive error (other than -EAGAIN) */ + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + /* Receive the FDs */ + num_fds = 0; + fd_msg = msghdr.msg_control; + fd_map = (void *)&CMSG_DATA(fd_msg); + if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr)) + num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + /* Try running queued syscalls. */ + res = syscall_handler(fd_map); + + while (num_fds) + stub_syscall2(__NR_close, fd_map[--num_fds], 0); + } else { + res = 0; + } + + if (res < 0 || d->restart_wait) { + /* Report SIGSYS if we restart. */ + d->signal = SIGSYS; + d->restart_wait = 0; + + goto restart_wait; + } + + /* Restore arch dependent state that is not part of the mcontext */ + stub_seccomp_restore_state(&d->arch_data); + + /* Return so that the host modified mcontext is restored. */ +} + +void __section(".__syscall_stub") +stub_signal_restorer(void) +{ + /* We must not have anything on the stack when doing rt_sigreturn */ + stub_syscall0(__NR_rt_sigreturn); +} diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c new file mode 100644 index 000000000000..cbafaa684e66 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe.c @@ -0,0 +1,230 @@ +#include <sys/ptrace.h> +#include <sys/prctl.h> +#include <sys/fcntl.h> +#include <asm/unistd.h> +#include <sysdep/stub.h> +#include <stub-data.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <generated/asm-offsets.h> + +void _start(void); + +noinline static void real_init(void) +{ + struct stub_init_data init_data; + unsigned long res; + struct { + void *ss_sp; + int ss_flags; + size_t ss_size; + } stack = { + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + }; + struct { + void *sa_handler_; + unsigned long sa_flags; + void *sa_restorer; + unsigned long long sa_mask; + } sa = { + /* Need to set SA_RESTORER (but the handler never returns) */ + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, + }; + + /* set a nice name */ + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); + + /* Make sure this process dies if the kernel dies */ + stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + + /* Needed in SECCOMP mode (and safe to do anyway) */ + stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* read information from STDIN and close it */ + res = stub_syscall3(__NR_read, 0, + (unsigned long)&init_data, sizeof(init_data)); + if (res != sizeof(init_data)) + stub_syscall1(__NR_exit, 10); + + /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */ + if (!init_data.seccomp) + stub_syscall1(__NR_close, 0); + else + stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK); + + /* map stub code + data */ + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED, + init_data.stub_code_fd, init_data.stub_code_offset); + if (res != init_data.stub_start) + stub_syscall1(__NR_exit, 11); + + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start + UM_KERN_PAGE_SIZE, + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, + init_data.stub_data_fd, init_data.stub_data_offset); + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) + stub_syscall1(__NR_exit, 12); + + /* In SECCOMP mode, we only need the signalling FD from now on */ + if (init_data.seccomp) { + res = stub_syscall3(__NR_close_range, 1, ~0U, 0); + if (res != 0) + stub_syscall1(__NR_exit, 13); + } + + /* setup signal stack inside stub data */ + stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); + + /* register signal handlers */ + sa.sa_handler_ = (void *) init_data.signal_handler; + sa.sa_restorer = (void *) init_data.signal_restorer; + if (!init_data.seccomp) { + /* In ptrace mode, the SIGSEGV handler never returns */ + sa.sa_mask = 0; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 14); + } else { + /* SECCOMP mode uses rt_sigreturn, need to mask all signals */ + sa.sa_mask = ~0ULL; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 15); + + res = stub_syscall4(__NR_rt_sigaction, SIGSYS, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 16); + + res = stub_syscall4(__NR_rt_sigaction, SIGALRM, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 17); + + res = stub_syscall4(__NR_rt_sigaction, SIGTRAP, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 18); + + res = stub_syscall4(__NR_rt_sigaction, SIGILL, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 19); + + res = stub_syscall4(__NR_rt_sigaction, SIGFPE, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 20); + } + + /* + * If in seccomp mode, install the SECCOMP filter and trigger a syscall. + * Otherwise set PTRACE_TRACEME and do a SIGSTOP. + */ + if (init_data.seccomp) { + struct sock_filter filter[] = { +#if __BITS_PER_LONG > 32 + /* [0] Load upper 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer) + 4)), + + /* [1] Jump forward 3 instructions if the upper address is not identical */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3), +#endif + /* [2] Load lower 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer))), + + /* [3] Mask out lower bits */ + BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000), + + /* [4] Jump to [6] if the lower bits are not on the expected page */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0), + + /* [5] Trap call, allow */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + + /* [6,7] Check architecture */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, arch)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, + UM_SECCOMP_ARCH_NATIVE, 1, 0), + + /* [8] Kill (for architecture check) */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [9] Load syscall number */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + + /* [10-16] Check against permitted syscalls */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, + 7, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg, + 6, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close, + 5, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR, + 4, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap, + 3, 0), +#ifdef __i386__ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area, + 2, 0), +#else + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl, + 2, 0), +#endif + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, + 1, 0), + + /* [17] Not one of the permitted syscalls */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [18] Permitted call for the stub */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = sizeof(filter) / sizeof(filter[0]), + .filter = filter, + }; + + if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, + (unsigned long)&prog) != 0) + stub_syscall1(__NR_exit, 21); + + /* Fall through, the exit syscall will cause SIGSYS */ + } else { + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + } + + stub_syscall1(__NR_exit, 30); + + __builtin_unreachable(); +} + +__attribute__((naked)) void _start(void) +{ + /* + * Since the stack after exec() starts at the top-most address, + * but that's exactly where we also want to map the stub data + * and code, this must: + * - push the stack by 1 code and STUB_DATA_PAGES data pages + * - call real_init() + * This way, real_init() can use the stack normally, while the + * original stack further down (higher address) will become + * inaccessible after the mmap() calls above. + */ + stub_start(real_init); +} diff --git a/arch/um/kernel/skas/stub_exe_embed.S b/arch/um/kernel/skas/stub_exe_embed.S new file mode 100644 index 000000000000..6d8914fbe8f1 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe_embed.S @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> + +__INITDATA + +SYM_DATA_START(stub_exe_start) + .incbin "arch/um/kernel/skas/stub_exe" +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) + +__FINIT diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index c0681e097432..ba7494f9bfe4 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -1,40 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/kernel.h> #include <linux/ptrace.h> +#include <linux/seccomp.h> #include <kern_util.h> #include <sysdep/ptrace.h> -#include <sysdep/syscalls.h> - -extern int syscall_table_size; -#define NR_SYSCALLS (syscall_table_size / sizeof(void *)) +#include <sysdep/ptrace_user.h> +#include <linux/time-internal.h> +#include <asm/syscall.h> +#include <asm/unistd.h> +#include <asm/delay.h> void handle_syscall(struct uml_pt_regs *r) { struct pt_regs *regs = container_of(r, struct pt_regs, regs); - long result; int syscall; - syscall_trace_enter(regs); + /* Initialize the syscall number and default return value. */ + UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); + PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); + + if (syscall_trace_enter(regs)) + goto out; + + /* Do the seccomp check after ptrace; failures should be fast. */ + if (secure_computing() == -1) + goto out; + + syscall = UPT_SYSCALL_NR(r); /* - * This should go in the declaration of syscall, but when I do that, - * strace -f -c bash -c 'ls ; ls' breaks, sometimes not tracing - * children at all, sometimes hanging when bash doesn't see the first - * ls exit. - * The assembly looks functionally the same to me. This is - * gcc version 4.0.1 20050727 (Red Hat 4.0.1-5) - * in case it's a compiler bug. + * If no time passes, then sched_yield may not actually yield, causing + * broken spinlock implementations in userspace (ASAN) to hang for long + * periods of time. */ - syscall = UPT_SYSCALL_NR(r); - if ((syscall >= NR_SYSCALLS) || (syscall < 0)) - result = -ENOSYS; - else result = EXECUTE_SYSCALL(syscall, regs); + if ((time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) && + syscall == __NR_sched_yield) + tt_extra_sched_jiffies += 1; + + if (syscall >= 0 && syscall < __NR_syscalls) { + unsigned long ret; + + ret = (*sys_call_table[syscall])(UPT_SYSCALL_ARG1(®s->regs), + UPT_SYSCALL_ARG2(®s->regs), + UPT_SYSCALL_ARG3(®s->regs), + UPT_SYSCALL_ARG4(®s->regs), + UPT_SYSCALL_ARG5(®s->regs), + UPT_SYSCALL_ARG6(®s->regs)); + + PT_REGS_SET_SYSCALL_RETURN(regs, ret); - PT_REGS_SET_SYSCALL_RETURN(regs, result); + /* + * An error value here can be some form of -ERESTARTSYS + * and then we'd just loop. Make any error syscalls take + * some time, so that it won't just loop if something is + * not ready, and hopefully other things will make some + * progress. + */ + if (IS_ERR_VALUE(ret) && + (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL)) { + um_udelay(1); + schedule(); + } + } +out: syscall_trace_leave(regs); } diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index 1d3e0c17340b..198269e384c4 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/err.h> @@ -10,13 +10,14 @@ #include <linux/sched.h> #include <asm/current.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <kern_util.h> +#include <asm/futex.h> #include <os.h> pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -27,7 +28,11 @@ pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr) if (!pgd_present(*pgd)) return NULL; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); if (!pud_present(*pud)) return NULL; @@ -59,38 +64,38 @@ static pte_t *maybe_map(unsigned long virt, int is_write) static int do_op_one_page(unsigned long addr, int len, int is_write, int (*op)(unsigned long addr, int len, void *arg), void *arg) { - jmp_buf buf; struct page *page; pte_t *pte; - int n, faulted; + int n; pte = maybe_map(addr, is_write); if (pte == NULL) return -1; page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + addr = (unsigned long) page_address(page) + + (addr & ~PAGE_MASK); +#else addr = (unsigned long) kmap_atomic(page) + (addr & ~PAGE_MASK); +#endif + n = (*op)(addr, len, arg); - current->thread.fault_catcher = &buf; - - faulted = UML_SETJMP(&buf); - if (faulted == 0) - n = (*op)(addr, len, arg); - else - n = -1; - - current->thread.fault_catcher = NULL; - +#ifdef CONFIG_64BIT + pagefault_enable(); +#else kunmap_atomic((void *)addr); +#endif return n; } -static int buffer_op(unsigned long addr, int len, int is_write, - int (*op)(unsigned long, int, void *), void *arg) +static long buffer_op(unsigned long addr, int len, int is_write, + int (*op)(unsigned long, int, void *), void *arg) { - int size, remain, n; + long size, remain, n; size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len); remain = len; @@ -139,18 +144,11 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg) return 0; } -int copy_from_user(void *to, const void __user *from, int n) +unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (segment_eq(get_fs(), KERNEL_DS)) { - memcpy(to, (__force void*)from, n); - return 0; - } - - return access_ok(VERIFY_READ, from, n) ? - buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to): - n; + return buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to); } -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(raw_copy_from_user); static int copy_chunk_to_user(unsigned long to, int len, void *arg) { @@ -161,18 +159,11 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg) return 0; } -int copy_to_user(void __user *to, const void *from, int n) +unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (segment_eq(get_fs(), KERNEL_DS)) { - memcpy((__force void *) to, from, n); - return 0; - } - - return access_ok(VERIFY_WRITE, to, n) ? - buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from) : - n; + return buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from); } -EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(raw_copy_to_user); static int strncpy_chunk_from_user(unsigned long from, int len, void *arg) { @@ -188,19 +179,13 @@ static int strncpy_chunk_from_user(unsigned long from, int len, void *arg) return 0; } -int strncpy_from_user(char *dst, const char __user *src, int count) +long strncpy_from_user(char *dst, const char __user *src, long count) { - int n; + long n; char *ptr = dst; - if (segment_eq(get_fs(), KERNEL_DS)) { - strncpy(dst, (__force void *) src, count); - return strnlen(dst, count); - } - - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return -EFAULT; - n = buffer_op((unsigned long) src, count, 0, strncpy_chunk_from_user, &ptr); if (n != 0) @@ -215,22 +200,11 @@ static int clear_chunk(unsigned long addr, int len, void *unused) return 0; } -int __clear_user(void __user *mem, int len) +unsigned long __clear_user(void __user *mem, unsigned long len) { return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL); } - -int clear_user(void __user *mem, int len) -{ - if (segment_eq(get_fs(), KERNEL_DS)) { - memset((__force void*)mem, 0, len); - return 0; - } - - return access_ok(VERIFY_WRITE, mem, len) ? - buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL) : len; -} -EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__clear_user); static int strnlen_chunk(unsigned long str, int len, void *arg) { @@ -244,16 +218,151 @@ static int strnlen_chunk(unsigned long str, int len, void *arg) return 0; } -int strnlen_user(const void __user *str, int len) +long strnlen_user(const char __user *str, long len) { int count = 0, n; - if (segment_eq(get_fs(), KERNEL_DS)) - return strnlen((__force char*)str, len) + 1; - + if (!access_ok(str, 1)) + return -EFAULT; n = buffer_op((unsigned long) str, len, 0, strnlen_chunk, &count); if (n == 0) return count + 1; - return -EFAULT; + return 0; } EXPORT_SYMBOL(strnlen_user); + +/** + * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant + * argument and comparison of the previous + * futex value with another constant. + * + * @op: operation to execute + * @oparg: argument to operation + * @oval: old value at uaddr + * @uaddr: pointer to user space address + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + * -ENOSYS - Operation not supported + */ + +int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) +{ + int oldval, ret; + struct page *page; + unsigned long addr = (unsigned long) uaddr; + pte_t *pte; + + ret = -EFAULT; + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + preempt_disable(); + pte = maybe_map(addr, 1); + if (pte == NULL) + goto out_inuser; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + addr = (unsigned long) page_address(page) + + (((unsigned long) addr) & ~PAGE_MASK); +#else + addr = (unsigned long) kmap_atomic(page) + + ((unsigned long) addr & ~PAGE_MASK); +#endif + uaddr = (u32 *) addr; + oldval = *uaddr; + + ret = 0; + + switch (op) { + case FUTEX_OP_SET: + *uaddr = oparg; + break; + case FUTEX_OP_ADD: + *uaddr += oparg; + break; + case FUTEX_OP_OR: + *uaddr |= oparg; + break; + case FUTEX_OP_ANDN: + *uaddr &= ~oparg; + break; + case FUTEX_OP_XOR: + *uaddr ^= oparg; + break; + default: + ret = -ENOSYS; + } +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic((void *)addr); +#endif + +out_inuser: + preempt_enable(); + + if (ret == 0) + *oval = oldval; + + return ret; +} +EXPORT_SYMBOL(arch_futex_atomic_op_inuser); + +/** + * futex_atomic_cmpxchg_inatomic() - Compare and exchange the content of the + * uaddr with newval if the current value is + * oldval. + * @uval: pointer to store content of @uaddr + * @uaddr: pointer to user space address + * @oldval: old value + * @newval: new value to store to @uaddr + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + */ + +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + struct page *page; + pte_t *pte; + int ret = -EFAULT; + + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + + preempt_disable(); + pte = maybe_map((unsigned long) uaddr, 1); + if (pte == NULL) + goto out_inatomic; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + uaddr = page_address(page) + (((unsigned long) uaddr) & ~PAGE_MASK); +#else + uaddr = kmap_atomic(page) + ((unsigned long) uaddr & ~PAGE_MASK); +#endif + + *uval = *uaddr; + + ret = cmpxchg(uaddr, oldval, newval); + +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic(uaddr); +#endif + ret = 0; + +out_inatomic: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(futex_atomic_cmpxchg_inatomic); diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 5c8c3ea7db7b..f1e52b7348fb 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -1,238 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + * + * Based on the previous implementation in TT mode * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ -#include <linux/percpu.h> -#include <asm/pgalloc.h> -#include <asm/tlb.h> - -#ifdef CONFIG_SMP - #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/module.h> +#include <linux/processor.h> #include <linux/threads.h> -#include <linux/interrupt.h> -#include <linux/err.h> +#include <linux/cpu.h> #include <linux/hardirq.h> -#include <asm/smp.h> -#include <asm/processor.h> -#include <asm/spinlock.h> +#include <linux/smp.h> +#include <linux/smp-internal.h> +#include <init.h> #include <kern.h> -#include <irq_user.h> #include <os.h> +#include <smp.h> -/* Per CPU bogomips and other parameters - * The only piece used here is the ipi pipe, which is set before SMP is - * started and never changed. - */ -struct cpuinfo_um cpu_data[NR_CPUS]; +enum { + UML_IPI_RES = 0, + UML_IPI_CALL_SINGLE, + UML_IPI_CALL, + UML_IPI_STOP, +}; -/* A statistic, can be a little off */ -int num_reschedules_sent = 0; +void arch_smp_send_reschedule(int cpu) +{ + os_send_ipi(cpu, UML_IPI_RES); +} -/* Not changed after boot */ -struct task_struct *idle_threads[NR_CPUS]; +void arch_send_call_function_single_ipi(int cpu) +{ + os_send_ipi(cpu, UML_IPI_CALL_SINGLE); +} -void smp_send_reschedule(int cpu) +void arch_send_call_function_ipi_mask(const struct cpumask *mask) { - os_write_file(cpu_data[cpu].ipi_pipe[1], "R", 1); - num_reschedules_sent++; + int cpu; + + for_each_cpu(cpu, mask) + os_send_ipi(cpu, UML_IPI_CALL); } void smp_send_stop(void) { - int i; + int cpu, me = smp_processor_id(); - printk(KERN_INFO "Stopping all CPUs..."); - for (i = 0; i < num_online_cpus(); i++) { - if (i == current_thread->cpu) + for_each_online_cpu(cpu) { + if (cpu == me) continue; - os_write_file(cpu_data[i].ipi_pipe[1], "S", 1); + os_send_ipi(cpu, UML_IPI_STOP); } - printk(KERN_CONT "done\n"); } -static cpumask_t smp_commenced_mask = CPU_MASK_NONE; -static cpumask_t cpu_callin_map = CPU_MASK_NONE; +static void ipi_handler(int vector, struct uml_pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs((struct pt_regs *)regs); + int cpu = raw_smp_processor_id(); + + irq_enter(); + + if (current->mm) + os_alarm_process(current->mm->context.id.pid); + + switch (vector) { + case UML_IPI_RES: + inc_irq_stat(irq_resched_count); + scheduler_ipi(); + break; + + case UML_IPI_CALL_SINGLE: + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); + break; + + case UML_IPI_CALL: + inc_irq_stat(irq_call_count); + generic_smp_call_function_interrupt(); + break; + + case UML_IPI_STOP: + set_cpu_online(cpu, false); + while (1) + pause(); + break; + + default: + pr_err("CPU#%d received unknown IPI (vector=%d)!\n", cpu, vector); + break; + } + + irq_exit(); + set_irq_regs(old_regs); +} -static int idle_proc(void *cpup) +void uml_ipi_handler(int vector) { - int cpu = (int) cpup, err; + struct uml_pt_regs r = { .is_user = 0 }; - err = os_pipe(cpu_data[cpu].ipi_pipe, 1, 1); - if (err < 0) - panic("CPU#%d failed to create IPI pipe, err = %d", cpu, -err); + preempt_disable(); + ipi_handler(vector, &r); + preempt_enable(); +} - os_set_fd_async(cpu_data[cpu].ipi_pipe[0]); +/* AP states used only during CPU startup */ +enum { + UML_CPU_PAUSED = 0, + UML_CPU_RUNNING, +}; - wmb(); - if (cpu_test_and_set(cpu, cpu_callin_map)) { - printk(KERN_ERR "huh, CPU#%d already present??\n", cpu); - BUG(); - } +static int cpu_states[NR_CPUS]; - while (!cpu_isset(cpu, smp_commenced_mask)) - cpu_relax(); +static int start_secondary(void *unused) +{ + int err, cpu = raw_smp_processor_id(); notify_cpu_starting(cpu); set_cpu_online(cpu, true); - default_idle(); - return 0; -} -static struct task_struct *idle_thread(int cpu) -{ - struct task_struct *new_task; - - current->thread.request.u.thread.proc = idle_proc; - current->thread.request.u.thread.arg = (void *) cpu; - new_task = fork_idle(cpu); - if (IS_ERR(new_task)) - panic("copy_process failed in idle_thread, error = %ld", - PTR_ERR(new_task)); - - cpu_tasks[cpu] = ((struct cpu_task) - { .pid = new_task->thread.mode.tt.extern_pid, - .task = new_task } ); - idle_threads[cpu] = new_task; - panic("skas mode doesn't support SMP"); - return new_task; -} + err = um_setup_timer(); + if (err) + panic("CPU#%d failed to setup timer, err = %d", cpu, err); -void smp_prepare_cpus(unsigned int maxcpus) -{ - struct task_struct *idle; - unsigned long waittime; - int err, cpu, me = smp_processor_id(); - int i; + local_irq_enable(); - for (i = 0; i < ncpus; ++i) - set_cpu_possible(i, true); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); - set_cpu_online(me, true); - cpu_set(me, cpu_callin_map); + return 0; +} - err = os_pipe(cpu_data[me].ipi_pipe, 1, 1); - if (err < 0) - panic("CPU#0 failed to create IPI pipe, errno = %d", -err); +void uml_start_secondary(void *opaque) +{ + int cpu = raw_smp_processor_id(); + struct mm_struct *mm = &init_mm; + struct task_struct *idle; - os_set_fd_async(cpu_data[me].ipi_pipe[0]); + stack_protections((unsigned long) &cpu_irqstacks[cpu]); + set_sigstack(&cpu_irqstacks[cpu], THREAD_SIZE); - for (cpu = 1; cpu < ncpus; cpu++) { - printk(KERN_INFO "Booting processor %d...\n", cpu); + set_cpu_present(cpu, true); + os_futex_wait(&cpu_states[cpu], UML_CPU_PAUSED); - idle = idle_thread(cpu); + smp_rmb(); /* paired with smp_wmb() in __cpu_up() */ - init_idle(idle, cpu); + idle = cpu_tasks[cpu]; + idle->thread_info.cpu = cpu; - waittime = 200000000; - while (waittime-- && !cpu_isset(cpu, cpu_callin_map)) - cpu_relax(); + mmgrab(mm); + idle->active_mm = mm; - printk(KERN_INFO "%s\n", - cpu_isset(cpu, cpu_calling_map) ? "done" : "failed"); - } -} + idle->thread.request.thread.proc = start_secondary; + idle->thread.request.thread.arg = NULL; -void smp_prepare_boot_cpu(void) -{ - set_cpu_online(smp_processor_id(), true); + new_thread(task_stack_page(idle), &idle->thread.switch_buf, + new_thread_handler); + os_start_secondary(opaque, &idle->thread.switch_buf); } -int __cpu_up(unsigned int cpu, struct task_struct *tidle) +void __init smp_prepare_cpus(unsigned int max_cpus) { - cpu_set(cpu, smp_commenced_mask); - while (!cpu_online(cpu)) - mb(); - return 0; -} + int err, cpu, me = smp_processor_id(); + unsigned long deadline; -int setup_profiling_timer(unsigned int multiplier) -{ - printk(KERN_INFO "setup_profiling_timer\n"); - return 0; -} + os_init_smp(); -void smp_call_function_slave(int cpu); + for_each_possible_cpu(cpu) { + if (cpu == me) + continue; -void IPI_handler(int cpu) -{ - unsigned char c; - int fd; - - fd = cpu_data[cpu].ipi_pipe[0]; - while (os_read_file(fd, &c, 1) == 1) { - switch (c) { - case 'C': - smp_call_function_slave(cpu); - break; - - case 'R': - scheduler_ipi(); - break; - - case 'S': - printk(KERN_INFO "CPU#%d stopping\n", cpu); - while (1) - pause(); - break; - - default: - printk(KERN_ERR "CPU#%d received unknown IPI [%c]!\n", - cpu, c); - break; + pr_debug("Booting processor %d...\n", cpu); + err = os_start_cpu_thread(cpu); + if (err) { + pr_crit("CPU#%d failed to start cpu thread, err = %d", + cpu, err); + continue; } + + deadline = jiffies + msecs_to_jiffies(1000); + spin_until_cond(cpu_present(cpu) || + time_is_before_jiffies(deadline)); + + if (!cpu_present(cpu)) + pr_crit("CPU#%d failed to boot\n", cpu); } } -int hard_smp_processor_id(void) +int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - return pid_to_processor_id(os_getpid()); -} + cpu_tasks[cpu] = tidle; + smp_wmb(); /* paired with smp_rmb() in uml_start_secondary() */ + cpu_states[cpu] = UML_CPU_RUNNING; + os_futex_wake(&cpu_states[cpu]); + spin_until_cond(cpu_online(cpu)); -static DEFINE_SPINLOCK(call_lock); -static atomic_t scf_started; -static atomic_t scf_finished; -static void (*func)(void *info); -static void *info; - -void smp_call_function_slave(int cpu) -{ - atomic_inc(&scf_started); - (*func)(info); - atomic_inc(&scf_finished); + return 0; } -int smp_call_function(void (*_func)(void *info), void *_info, int wait) +void __init smp_cpus_done(unsigned int max_cpus) { - int cpus = num_online_cpus() - 1; - int i; +} - if (!cpus) - return 0; +/* Set in uml_ncpus_setup */ +int uml_ncpus = 1; - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); +void __init prefill_possible_map(void) +{ + int cpu; - spin_lock_bh(&call_lock); - atomic_set(&scf_started, 0); - atomic_set(&scf_finished, 0); - func = _func; - info = _info; + for (cpu = 0; cpu < uml_ncpus; cpu++) + set_cpu_possible(cpu, true); + for (; cpu < NR_CPUS; cpu++) + set_cpu_possible(cpu, false); +} - for_each_online_cpu(i) - os_write_file(cpu_data[i].ipi_pipe[1], "C", 1); +static int __init uml_ncpus_setup(char *line, int *add) +{ + *add = 0; - while (atomic_read(&scf_started) != cpus) - barrier(); + if (kstrtoint(line, 10, ¨_ncpus)) { + os_warn("%s: Couldn't parse '%s'\n", __func__, line); + return -1; + } - if (wait) - while (atomic_read(&scf_finished) != cpus) - barrier(); + uml_ncpus = clamp(uml_ncpus, 1, NR_CPUS); - spin_unlock_bh(&call_lock); return 0; } -#endif +__uml_setup("ncpus=", uml_ncpus_setup, +"ncpus=<# of desired CPUs>\n" +" This tells UML how many virtual processors to start. The maximum\n" +" number of supported virtual processors can be obtained by querying\n" +" the CONFIG_NR_CPUS option using --showconfig.\n\n" +); + +EXPORT_SYMBOL(uml_curr_cpu); diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c new file mode 100644 index 000000000000..fd3b61b3d4d2 --- /dev/null +++ b/arch/um/kernel/stacktrace.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2013 Richard Weinberger <richard@nod.at> + * Copyright (C) 2014 Google Inc., Author: Daniel Walter <dwalter@google.com> + */ + +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/stacktrace.h> + +void dump_trace(struct task_struct *tsk, + const struct stacktrace_ops *ops, + void *data) +{ + int reliable = 0; + unsigned long *sp, bp, addr; + struct pt_regs *segv_regs = tsk->thread.segv_regs; + struct stack_frame *frame; + + bp = get_frame_pointer(tsk, segv_regs); + sp = get_stack_pointer(tsk, segv_regs); + + frame = (struct stack_frame *)bp; + while (((long) sp & (THREAD_SIZE-1)) != 0) { + addr = READ_ONCE_NOCHECK(*sp); + if (__kernel_text_address(addr)) { + reliable = 0; + if ((unsigned long) sp == bp + sizeof(long)) { + frame = frame ? frame->next_frame : NULL; + bp = (unsigned long)frame; + reliable = 1; + } + ops->address(data, addr, reliable); + } + sp++; + } +} + +static void save_addr(void *data, unsigned long address, int reliable) +{ + struct stack_trace *trace = data; + + if (!reliable) + return; + if (trace->nr_entries >= trace->max_entries) + return; + + trace->entries[trace->nr_entries++] = address; +} + +static const struct stacktrace_ops dump_ops = { + .address = save_addr +}; + +static void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace) +{ + dump_trace(tsk, &dump_ops, trace); +} + +void save_stack_trace(struct stack_trace *trace) +{ + __save_stack_trace(current, trace); +} +EXPORT_SYMBOL_GPL(save_stack_trace); + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + __save_stack_trace(tsk, trace); +} +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c deleted file mode 100644 index c1d0ae069b53..000000000000 --- a/arch/um/kernel/syscall.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/utsname.h> -#include <linux/syscalls.h> -#include <asm/current.h> -#include <asm/mman.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> - -long old_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset) -{ - long err = -EINVAL; - if (offset & ~PAGE_MASK) - goto out; - - err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); - out: - return err; -} diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 0dc4d1c6f98a..13ee5666668d 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -1,66 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL + * Copyright (C) 2013 Richard Weinberger <richrd@nod.at> */ #include <linux/kallsyms.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> -#include <asm/sysrq.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> -/* Catch non-i386 SUBARCH's. */ -#if !defined(CONFIG_UML_X86) || defined(CONFIG_64BIT) -void show_trace(struct task_struct *task, unsigned long * stack) -{ - unsigned long addr; +#include <asm/stacktrace.h> +#include <os.h> - if (!stack) { - stack = (unsigned long*) &stack; - WARN_ON(1); - } +static void _print_addr(void *data, unsigned long address, int reliable) +{ + const char *loglvl = data; - printk(KERN_INFO "Call Trace: \n"); - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack; - if (__kernel_text_address(addr)) { - printk(KERN_INFO "%08lx: [<%08lx>]", - (unsigned long) stack, addr); - print_symbol(KERN_CONT " %s", addr); - printk(KERN_CONT "\n"); - } - stack++; - } - printk(KERN_INFO "\n"); + printk("%s [<%08lx>] %s%pS\n", loglvl, address, reliable ? "" : "? ", + (void *)address); } -#endif -/*Stolen from arch/i386/kernel/traps.c */ -static const int kstack_depth_to_print = 24; +static const struct stacktrace_ops stackops = { + .address = _print_addr +}; -/* This recently started being used in arch-independent code too, as in - * kernel/sched/core.c.*/ -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { - unsigned long *stack; + struct pt_regs *segv_regs = current->thread.segv_regs; int i; - if (esp == NULL) { - if (task != current && task != NULL) { - esp = (unsigned long *) KSTK_ESP(task); - } else { - esp = (unsigned long *) &esp; - } - } + if (!stack) + stack = get_stack_pointer(task, segv_regs); - stack = esp; - for (i = 0; i < kstack_depth_to_print; i++) { + printk("%sStack:\n", loglvl); + for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) { if (kstack_end(stack)) break; - if (i && ((i % 8) == 0)) - printk(KERN_INFO " "); - printk(KERN_CONT "%08lx ", *stack++); + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + pr_cont("\n"); + pr_cont(" %08lx", READ_ONCE_NOCHECK(*stack)); + stack++; } - show_trace(task, esp); + printk("%sCall Trace:\n", loglvl); + dump_trace(task ?: current, &stackops, (void *)loglvl); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 117568d4f64a..b344a36b44eb 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -1,115 +1,1092 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2012-2014 Cisco Systems * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL + * Copyright (C) 2019 Intel Corporation */ #include <linux/clockchips.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/jiffies.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/spinlock.h> #include <linux/threads.h> #include <asm/irq.h> #include <asm/param.h> #include <kern_util.h> #include <os.h> +#include <linux/delay.h> +#include <linux/time-internal.h> +#include <linux/um_timetravel.h> +#include <shared/init.h> + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#include <linux/sched/clock.h> + +enum time_travel_mode time_travel_mode; +EXPORT_SYMBOL_GPL(time_travel_mode); + +static bool time_travel_start_set; +static unsigned long long time_travel_start; +static unsigned long long time_travel_time; +static unsigned long long time_travel_shm_offset; +static LIST_HEAD(time_travel_events); +static LIST_HEAD(time_travel_irqs); +static unsigned long long time_travel_timer_interval; +static unsigned long long time_travel_next_event; +static struct time_travel_event time_travel_timer_event; +static int time_travel_ext_fd = -1; +static unsigned int time_travel_ext_waiting; +static bool time_travel_ext_prev_request_valid; +static unsigned long long time_travel_ext_prev_request; +static unsigned long long *time_travel_ext_free_until; +static unsigned long long _time_travel_ext_free_until; +static u16 time_travel_shm_id; +static struct um_timetravel_schedshm *time_travel_shm; +static union um_timetravel_schedshm_client *time_travel_shm_client; + +unsigned long tt_extra_sched_jiffies; + +notrace unsigned long long sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES + + tt_extra_sched_jiffies) + * (NSEC_PER_SEC / HZ); +} + +static void time_travel_set_time(unsigned long long ns) +{ + if (unlikely(ns < time_travel_time)) + panic("time-travel: time goes backwards %lld -> %lld\n", + time_travel_time, ns); + else if (unlikely(ns >= S64_MAX)) + panic("The system was going to sleep forever, aborting"); + + time_travel_time = ns; +} + +enum time_travel_message_handling { + TTMH_IDLE, + TTMH_POLL, + TTMH_READ, + TTMH_READ_START_ACK, +}; + +static u64 bc_message; +int time_travel_should_print_bc_msg; + +void _time_travel_print_bc_msg(void) +{ + time_travel_should_print_bc_msg = 0; + printk(KERN_INFO "time-travel: received broadcast 0x%llx\n", bc_message); +} + +static void time_travel_setup_shm(int fd, u16 id) +{ + u32 len; + + time_travel_shm = os_mmap_rw_shared(fd, sizeof(*time_travel_shm)); + + if (!time_travel_shm) + goto out; + + len = time_travel_shm->len; + + if (time_travel_shm->version != UM_TIMETRAVEL_SCHEDSHM_VERSION || + len < struct_size(time_travel_shm, clients, id + 1)) { + os_unmap_memory(time_travel_shm, sizeof(*time_travel_shm)); + time_travel_shm = NULL; + goto out; + } + + time_travel_shm = os_mremap_rw_shared(time_travel_shm, + sizeof(*time_travel_shm), + len); + if (!time_travel_shm) + goto out; + + time_travel_shm_offset = time_travel_shm->current_time; + time_travel_shm_client = &time_travel_shm->clients[id]; + time_travel_shm_client->capa |= UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE; + time_travel_shm_id = id; + /* always look at that free_until from now on */ + time_travel_ext_free_until = &time_travel_shm->free_until; +out: + os_close_file(fd); +} + +static void time_travel_handle_message(struct um_timetravel_msg *msg, + enum time_travel_message_handling mode) +{ + struct um_timetravel_msg resp = { + .op = UM_TIMETRAVEL_ACK, + }; + int ret; + + /* + * We can't unlock here, but interrupt signals with a timetravel_handler + * (see um_request_irq_tt) get to the timetravel_handler anyway. + */ + if (mode != TTMH_READ) { + BUG_ON(mode == TTMH_IDLE && !irqs_disabled()); + + while (os_poll(1, &time_travel_ext_fd) != 0) { + /* nothing */ + } + } + + if (unlikely(mode == TTMH_READ_START_ACK)) { + int fd[UM_TIMETRAVEL_SHARED_MAX_FDS]; + + ret = os_rcv_fd_msg(time_travel_ext_fd, fd, + ARRAY_SIZE(fd), msg, sizeof(*msg)); + if (ret == sizeof(*msg)) { + time_travel_setup_shm(fd[UM_TIMETRAVEL_SHARED_MEMFD], + msg->time & UM_TIMETRAVEL_START_ACK_ID); + /* we don't use the logging for now */ + os_close_file(fd[UM_TIMETRAVEL_SHARED_LOGFD]); + } + } else { + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + } + + if (ret == 0) + panic("time-travel external link is broken\n"); + if (ret != sizeof(*msg)) + panic("invalid time-travel message - %d bytes\n", ret); + + switch (msg->op) { + default: + WARN_ONCE(1, "time-travel: unexpected message %lld\n", + (unsigned long long)msg->op); + break; + case UM_TIMETRAVEL_ACK: + return; + case UM_TIMETRAVEL_RUN: + time_travel_set_time(msg->time); + if (time_travel_shm) { + /* no request right now since we're running */ + time_travel_shm_client->flags &= + ~UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + /* no ack for shared memory RUN */ + return; + } + break; + case UM_TIMETRAVEL_FREE_UNTIL: + /* not supposed to get this with shm, but ignore it */ + if (time_travel_shm) + break; + time_travel_ext_free_until = &_time_travel_ext_free_until; + _time_travel_ext_free_until = msg->time; + break; + case UM_TIMETRAVEL_BROADCAST: + bc_message = msg->time; + time_travel_should_print_bc_msg = 1; + break; + } + + resp.seq = msg->seq; + os_write_file(time_travel_ext_fd, &resp, sizeof(resp)); +} + +static u64 time_travel_ext_req(u32 op, u64 time) +{ + static int seq; + int mseq = ++seq; + struct um_timetravel_msg msg = { + .op = op, + .time = time, + .seq = mseq, + }; + + /* + * We need to block even the timetravel handlers of SIGIO here and + * only restore their use when we got the ACK - otherwise we may + * (will) get interrupted by that, try to queue the IRQ for future + * processing and thus send another request while we're still waiting + * for an ACK, but the peer doesn't know we got interrupted and will + * send the ACKs in the same order as the message, but we'd need to + * see them in the opposite order ... + * + * This wouldn't matter *too* much, but some ACKs carry the + * current time (for UM_TIMETRAVEL_GET) and getting another + * ACK without a time would confuse us a lot! + * + * The sequence number assignment that happens here lets us + * debug such message handling issues more easily. + */ + block_signals_hard(); + os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + + /* no ACK expected for WAIT in shared memory mode */ + if (msg.op == UM_TIMETRAVEL_WAIT && time_travel_shm) + goto done; + + while (msg.op != UM_TIMETRAVEL_ACK) + time_travel_handle_message(&msg, + op == UM_TIMETRAVEL_START ? + TTMH_READ_START_ACK : + TTMH_READ); + + if (msg.seq != mseq) + panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n", + msg.op, msg.seq, mseq, msg.time); + + if (op == UM_TIMETRAVEL_GET) + time_travel_set_time(msg.time); +done: + unblock_signals_hard(); + + return msg.time; +} + +void __time_travel_wait_readable(int fd) +{ + int fds[2] = { fd, time_travel_ext_fd }; + int ret; + + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + while ((ret = os_poll(2, fds))) { + struct um_timetravel_msg msg; + + if (ret == 1) + time_travel_handle_message(&msg, TTMH_READ); + } +} +EXPORT_SYMBOL_GPL(__time_travel_wait_readable); + +static void time_travel_ext_update_request(unsigned long long time) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + /* asked for exactly this time previously */ + if (time_travel_ext_prev_request_valid && + time == time_travel_ext_prev_request) + return; + + /* + * if we're running and are allowed to run past the request + * then we don't need to update it either + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. + */ + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) + return; + + time_travel_ext_prev_request = time; + time_travel_ext_prev_request_valid = true; + + if (time_travel_shm) { + union um_timetravel_schedshm_client *running; + + running = &time_travel_shm->clients[time_travel_shm->running_id]; + + if (running->capa & UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE) { + time_travel_shm_client->flags |= + UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + time += time_travel_shm_offset; + time_travel_shm_client->req_time = time; + if (time < time_travel_shm->free_until) + time_travel_shm->free_until = time; + return; + } + } + + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); +} + +void __time_travel_propagate_time(void) +{ + static unsigned long long last_propagated; + + if (time_travel_shm) { + if (time_travel_shm->running_id != time_travel_shm_id) + panic("time-travel: setting time while not running\n"); + time_travel_shm->current_time = time_travel_time + + time_travel_shm_offset; + return; + } + + if (last_propagated == time_travel_time) + return; + + time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time); + last_propagated = time_travel_time; +} +EXPORT_SYMBOL_GPL(__time_travel_propagate_time); + +/* returns true if we must do a wait to the simtime device */ +static bool time_travel_ext_request(unsigned long long time) +{ + /* + * If we received an external sync point ("free until") then we + * don't have to request/wait for anything until then, unless + * we're already waiting. + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. + */ + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) + return false; + + time_travel_ext_update_request(time); + return true; +} + +static void time_travel_ext_wait(bool idle) +{ + struct um_timetravel_msg msg = { + .op = UM_TIMETRAVEL_ACK, + }; + + time_travel_ext_prev_request_valid = false; + if (!time_travel_shm) + time_travel_ext_free_until = NULL; + time_travel_ext_waiting++; + + time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); + + /* + * Here we are deep in the idle loop, so we have to break out of the + * kernel abstraction in a sense and implement this in terms of the + * UML system waiting on the VQ interrupt while sleeping, when we get + * the signal it'll call time_travel_ext_vq_notify_done() completing the + * call. + */ + while (msg.op != UM_TIMETRAVEL_RUN) + time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL); + + time_travel_ext_waiting--; + + /* we might request more stuff while polling - reset when we run */ + time_travel_ext_prev_request_valid = false; +} + +static void time_travel_ext_get_time(void) +{ + if (time_travel_shm) + time_travel_set_time(time_travel_shm->current_time - + time_travel_shm_offset); + else + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); +} + +static void __time_travel_update_time(unsigned long long ns, bool idle) +{ + if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns)) + time_travel_ext_wait(idle); + else + time_travel_set_time(ns); +} + +static struct time_travel_event *time_travel_first_event(void) +{ + return list_first_entry_or_null(&time_travel_events, + struct time_travel_event, + list); +} + +static void __time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + struct time_travel_event *tmp; + bool inserted = false; + unsigned long flags; + + if (e->pending) + return; + + e->pending = true; + e->time = time; + + local_irq_save(flags); + list_for_each_entry(tmp, &time_travel_events, list) { + /* + * Add the new entry before one with higher time, + * or if they're equal and both on stack, because + * in that case we need to unwind the stack in the + * right order, and the later event (timer sleep + * or such) must be dequeued first. + */ + if ((tmp->time > e->time) || + (tmp->time == e->time && tmp->onstack && e->onstack)) { + list_add_tail(&e->list, &tmp->list); + inserted = true; + break; + } + } + + if (!inserted) + list_add_tail(&e->list, &time_travel_events); + + tmp = time_travel_first_event(); + time_travel_ext_update_request(tmp->time); + time_travel_next_event = tmp->time; + local_irq_restore(flags); +} + +static void time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + if (WARN_ON(!e->fn)) + return; + + __time_travel_add_event(e, time); +} + +void time_travel_add_event_rel(struct time_travel_event *e, + unsigned long long delay_ns) +{ + time_travel_add_event(e, time_travel_time + delay_ns); +} + +static void time_travel_periodic_timer(struct time_travel_event *e) +{ + time_travel_add_event(&time_travel_timer_event, + time_travel_time + time_travel_timer_interval); + + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + + deliver_alarm(); +} + +void deliver_time_travel_irqs(void) +{ + struct time_travel_event *e; + unsigned long flags; + + /* + * Don't do anything for most cases. Note that because here we have + * to disable IRQs (and re-enable later) we'll actually recurse at + * the end of the function, so this is strictly necessary. + */ + if (likely(list_empty(&time_travel_irqs))) + return; + + local_irq_save(flags); + irq_enter(); + while ((e = list_first_entry_or_null(&time_travel_irqs, + struct time_travel_event, + list))) { + list_del(&e->list); + e->pending = false; + e->fn(e); + } + irq_exit(); + local_irq_restore(flags); +} + +static void time_travel_deliver_event(struct time_travel_event *e) +{ + if (e == &time_travel_timer_event) { + /* + * deliver_alarm() does the irq_enter/irq_exit + * by itself, so must handle it specially here + */ + e->fn(e); + } else if (irqs_disabled()) { + list_add_tail(&e->list, &time_travel_irqs); + /* + * set pending again, it was set to false when the + * event was deleted from the original list, but + * now it's still pending until we deliver the IRQ. + */ + e->pending = true; + } else { + unsigned long flags; + + local_irq_save(flags); + irq_enter(); + e->fn(e); + irq_exit(); + local_irq_restore(flags); + } +} + +bool time_travel_del_event(struct time_travel_event *e) +{ + unsigned long flags; + + if (!e->pending) + return false; + local_irq_save(flags); + list_del(&e->list); + e->pending = false; + local_irq_restore(flags); + return true; +} + +static void time_travel_update_time(unsigned long long next, bool idle) +{ + struct time_travel_event ne = { + .onstack = true, + }; + struct time_travel_event *e; + bool finished = idle; + + /* add it without a handler - we deal with that specifically below */ + __time_travel_add_event(&ne, next); + + do { + e = time_travel_first_event(); + + BUG_ON(!e); + __time_travel_update_time(e->time, idle); + + /* new events may have been inserted while we were waiting */ + if (e == time_travel_first_event()) { + BUG_ON(!time_travel_del_event(e)); + BUG_ON(time_travel_time != e->time); + + if (e == &ne) { + finished = true; + } else { + if (e->onstack) + panic("On-stack event dequeued outside of the stack! time=%lld, event time=%lld, event=%pS\n", + time_travel_time, e->time, e); + time_travel_deliver_event(e); + } + } + + e = time_travel_first_event(); + if (e) + time_travel_ext_update_request(e->time); + } while (ne.pending && !finished); + + time_travel_del_event(&ne); +} + +static void time_travel_update_time_rel(unsigned long long offs) +{ + unsigned long flags; + + /* + * Disable interrupts before calculating the new time so + * that a real timer interrupt (signal) can't happen at + * a bad time e.g. after we read time_travel_time but + * before we've completed updating the time. + */ + local_irq_save(flags); + time_travel_update_time(time_travel_time + offs, false); + local_irq_restore(flags); +} + +void time_travel_ndelay(unsigned long nsec) +{ + /* + * Not strictly needed to use _rel() version since this is + * only used in INFCPU/EXT modes, but it doesn't hurt and + * is more readable too. + */ + time_travel_update_time_rel(nsec); +} +EXPORT_SYMBOL(time_travel_ndelay); + +void time_travel_add_irq_event(struct time_travel_event *e) +{ + BUG_ON(time_travel_mode != TT_MODE_EXTERNAL); + + time_travel_ext_get_time(); + /* + * We could model interrupt latency here, for now just + * don't have any latency at all and request the exact + * same time (again) to run the interrupt... + */ + time_travel_add_event(e, time_travel_time); +} +EXPORT_SYMBOL_GPL(time_travel_add_irq_event); + +static void time_travel_oneshot_timer(struct time_travel_event *e) +{ + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + + deliver_alarm(); +} + +void time_travel_sleep(void) +{ + /* + * Wait "forever" (using S64_MAX because there are some potential + * wrapping issues, especially with the current TT_MODE_EXTERNAL + * controller application. + */ + unsigned long long next = S64_MAX; + int cpu = raw_smp_processor_id(); + + if (time_travel_mode == TT_MODE_BASIC) + os_timer_disable(cpu); + + time_travel_update_time(next, true); + + if (time_travel_mode == TT_MODE_BASIC && + time_travel_timer_event.pending) { + if (time_travel_timer_event.fn == time_travel_periodic_timer) { + /* + * This is somewhat wrong - we should get the first + * one sooner like the os_timer_one_shot() below... + */ + os_timer_set_interval(cpu, time_travel_timer_interval); + } else { + os_timer_one_shot(cpu, time_travel_timer_event.time - next); + } + } +} + +static void time_travel_handle_real_alarm(void) +{ + time_travel_set_time(time_travel_next_event); + + time_travel_del_event(&time_travel_timer_event); + + if (time_travel_timer_event.fn == time_travel_periodic_timer) + time_travel_add_event(&time_travel_timer_event, + time_travel_time + + time_travel_timer_interval); +} + +static void time_travel_set_interval(unsigned long long interval) +{ + time_travel_timer_interval = interval; +} + +static int time_travel_connect_external(const char *socket) +{ + const char *sep; + unsigned long long id = (unsigned long long)-1; + int rc; + + if ((sep = strchr(socket, ':'))) { + char buf[25] = {}; + if (sep - socket > sizeof(buf) - 1) + goto invalid_number; + + memcpy(buf, socket, sep - socket); + if (kstrtoull(buf, 0, &id)) { +invalid_number: + panic("time-travel: invalid external ID in string '%s'\n", + socket); + return -EINVAL; + } + + socket = sep + 1; + } + + rc = os_connect_socket(socket); + if (rc < 0) { + panic("time-travel: failed to connect to external socket %s\n", + socket); + return rc; + } + + time_travel_ext_fd = rc; + + time_travel_ext_req(UM_TIMETRAVEL_START, id); + + return 1; +} + +static void time_travel_set_start(void) +{ + if (time_travel_start_set) + return; + + switch (time_travel_mode) { + case TT_MODE_EXTERNAL: + time_travel_start = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1); + /* controller gave us the *current* time, so adjust by that */ + time_travel_ext_get_time(); + time_travel_start -= time_travel_time; + break; + case TT_MODE_INFCPU: + case TT_MODE_BASIC: + if (!time_travel_start_set) + time_travel_start = os_persistent_clock_emulation(); + break; + case TT_MODE_OFF: + /* we just read the host clock with os_persistent_clock_emulation() */ + break; + } + + time_travel_start_set = true; +} +#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ +#define time_travel_start_set 0 +#define time_travel_start 0 +#define time_travel_time 0 +#define time_travel_ext_waiting 0 + +static inline void time_travel_update_time(unsigned long long ns, bool idle) +{ +} + +static inline void time_travel_update_time_rel(unsigned long long offs) +{ +} + +static inline void time_travel_handle_real_alarm(void) +{ +} + +static void time_travel_set_interval(unsigned long long interval) +{ +} + +static inline void time_travel_set_start(void) +{ +} + +/* fail link if this actually gets used */ +extern u64 time_travel_ext_req(u32 op, u64 time); + +/* these are empty macros so the struct/fn need not exist */ +#define time_travel_add_event(e, time) do { } while (0) +/* externally not usable - redefine here so we can */ +#undef time_travel_del_event +#define time_travel_del_event(e) do { } while (0) +#endif + +static struct clock_event_device timer_clockevent[NR_CPUS]; void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { unsigned long flags; + /* + * In basic time-travel mode we still get real interrupts + * (signals) but since we don't read time from the OS, we + * must update the simulated time here to the expiry when + * we get a signal. + * This is not the case in inf-cpu mode, since there we + * never get any real signals from the OS. + */ + if (time_travel_mode == TT_MODE_BASIC) + time_travel_handle_real_alarm(); + local_irq_save(flags); do_IRQ(TIMER_IRQ, regs); local_irq_restore(flags); } -static void itimer_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int itimer_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - set_interval(); - break; + int cpu = evt - &timer_clockevent[0]; - case CLOCK_EVT_MODE_SHUTDOWN: - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_ONESHOT: - disable_timer(); - break; + if (time_travel_mode != TT_MODE_OFF) + time_travel_del_event(&time_travel_timer_event); - case CLOCK_EVT_MODE_RESUME: - break; + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + os_timer_disable(cpu); + + return 0; +} + +static int itimer_set_periodic(struct clock_event_device *evt) +{ + unsigned long long interval = NSEC_PER_SEC / HZ; + int cpu = evt - &timer_clockevent[0]; + + if (time_travel_mode != TT_MODE_OFF) { + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_periodic_timer); + time_travel_set_interval(interval); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + interval); } + + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + os_timer_set_interval(cpu, interval); + + return 0; } static int itimer_next_event(unsigned long delta, struct clock_event_device *evt) { - return timer_one_shot(delta + 1); + delta += 1; + + if (time_travel_mode != TT_MODE_OFF) { + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_oneshot_timer); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + delta); + } + + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + return os_timer_one_shot(raw_smp_processor_id(), delta); + + return 0; } -static struct clock_event_device itimer_clockevent = { - .name = "itimer", - .rating = 250, - .cpumask = cpu_all_mask, - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = itimer_set_mode, - .set_next_event = itimer_next_event, - .shift = 32, - .irq = 0, +static int itimer_one_shot(struct clock_event_device *evt) +{ + return itimer_next_event(0, evt); +} + +static struct clock_event_device _timer_clockevent = { + .name = "posix-timer", + .rating = 250, + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_shutdown = itimer_shutdown, + .set_state_periodic = itimer_set_periodic, + .set_state_oneshot = itimer_one_shot, + .set_next_event = itimer_next_event, + .shift = 0, + .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, + .min_delta_ns = TIMER_MIN_DELTA, + .min_delta_ticks = TIMER_MIN_DELTA, // microsecond resolution should be enough for anyone, same as 640K RAM + .irq = 0, + .mult = 1, }; static irqreturn_t um_timer(int irq, void *dev) { - (*itimer_clockevent.event_handler)(&itimer_clockevent); + int cpu = raw_smp_processor_id(); + struct clock_event_device *evt = &timer_clockevent[cpu]; + + /* + * Interrupt the (possibly) running userspace process, technically this + * should only happen if userspace is currently executing. + * With infinite CPU time-travel, we can only get here when userspace + * is not executing. Do not notify there and avoid spurious scheduling. + */ + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL && + get_current()->mm) + os_alarm_process(get_current()->mm->context.id.pid); + + evt->event_handler(evt); return IRQ_HANDLED; } -static cycle_t itimer_read(struct clocksource *cs) +static u64 timer_read(struct clocksource *cs) { - return os_nsecs() / 1000; + if (time_travel_mode != TT_MODE_OFF) { + /* + * We make reading the timer cost a bit so that we don't get + * stuck in loops that expect time to move more than the + * exact requested sleep amount, e.g. python's socket server, + * see https://bugs.python.org/issue37026. + * + * However, don't do that when we're in interrupt or such as + * then we might recurse into our own processing, and get to + * even more waiting, and that's not good - it messes up the + * "what do I do next" and onstack event we use to know when + * to return from time_travel_update_time(). + */ + if (!irqs_disabled() && !in_interrupt() && !in_softirq() && + !time_travel_ext_waiting) + time_travel_update_time_rel(TIMER_MULTIPLIER); + return time_travel_time / TIMER_MULTIPLIER; + } + + return os_nsecs() / TIMER_MULTIPLIER; } -static struct clocksource itimer_clocksource = { - .name = "itimer", +static struct clocksource timer_clocksource = { + .name = "timer", .rating = 300, - .read = itimer_read, + .read = timer_read, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void __init setup_itimer(void) +int um_setup_timer(void) { + int cpu = raw_smp_processor_id(); + struct clock_event_device *evt = &timer_clockevent[cpu]; int err; - err = request_irq(TIMER_IRQ, um_timer, 0, "timer", NULL); + err = os_timer_create(); + if (err) + return err; + + memcpy(evt, &_timer_clockevent, sizeof(*evt)); + evt->cpumask = cpumask_of(cpu); + clockevents_register_device(evt); + + return 0; +} + +static void __init um_timer_init(void) +{ + int err; + + err = request_irq(TIMER_IRQ, um_timer, IRQF_TIMER, "hr timer", NULL); if (err != 0) printk(KERN_ERR "register_timer : request_irq failed - " "errno = %d\n", -err); - itimer_clockevent.mult = div_sc(HZ, NSEC_PER_SEC, 32); - itimer_clockevent.max_delta_ns = - clockevent_delta2ns(60 * HZ, &itimer_clockevent); - itimer_clockevent.min_delta_ns = - clockevent_delta2ns(1, &itimer_clockevent); - err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC); + err = um_setup_timer(); + if (err) { + printk(KERN_ERR "creation of timer failed - errno = %d\n", -err); + return; + } + + err = clocksource_register_hz(&timer_clocksource, NSEC_PER_SEC/TIMER_MULTIPLIER); if (err) { printk(KERN_ERR "clocksource_register_hz returned %d\n", err); return; } - clockevents_register_device(&itimer_clockevent); } -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { - long long nsecs = os_nsecs(); + long long nsecs; + + time_travel_set_start(); - set_normalized_timespec(ts, nsecs / NSEC_PER_SEC, - nsecs % NSEC_PER_SEC); + if (time_travel_mode != TT_MODE_OFF) + nsecs = time_travel_start + time_travel_time; + else + nsecs = os_persistent_clock_emulation(); + + set_normalized_timespec64(ts, nsecs / NSEC_PER_SEC, + nsecs % NSEC_PER_SEC); } void __init time_init(void) { - timer_init(); - late_time_init = setup_itimer; + timer_set_signal_handler(); + late_time_init = um_timer_init; +} + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +unsigned long calibrate_delay_is_known(void) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) + return 1; + return 0; +} + +static int setup_time_travel(char *str) +{ + if (strcmp(str, "=inf-cpu") == 0) { + time_travel_mode = TT_MODE_INFCPU; + _timer_clockevent.name = "time-travel-timer-infcpu"; + timer_clocksource.name = "time-travel-clock"; + return 1; + } + + if (strncmp(str, "=ext:", 5) == 0) { + time_travel_mode = TT_MODE_EXTERNAL; + _timer_clockevent.name = "time-travel-timer-external"; + timer_clocksource.name = "time-travel-clock-external"; + return time_travel_connect_external(str + 5); + } + + if (!*str) { + time_travel_mode = TT_MODE_BASIC; + _timer_clockevent.name = "time-travel-timer"; + timer_clocksource.name = "time-travel-clock"; + return 1; + } + + return -EINVAL; +} + +__setup("time-travel", setup_time_travel); +__uml_help(setup_time_travel, +"time-travel\n" +" This option just enables basic time travel mode, in which the clock/timers\n" +" inside the UML instance skip forward when there's nothing to do, rather than\n" +" waiting for real time to elapse. However, instance CPU speed is limited by\n" +" the real CPU speed, so e.g. a 10ms timer will always fire after ~10ms wall\n" +" clock (but quicker when there's nothing to do).\n" +"\n" +"time-travel=inf-cpu\n" +" This enables time travel mode with infinite processing power, in which there\n" +" are no wall clock timers, and any CPU processing happens - as seen from the\n" +" guest - instantly. This can be useful for accurate simulation regardless of\n" +" debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n" +" easily lead to getting stuck (e.g. if anything in the system busy loops).\n" +"\n" +"time-travel=ext:[ID:]/path/to/socket\n" +" This enables time travel mode similar to =inf-cpu, except the system will\n" +" use the given socket to coordinate with a central scheduler, in order to\n" +" have more than one system simultaneously be on simulated time. The virtio\n" +" driver code in UML knows about this so you can also simulate networks and\n" +" devices using it, assuming the device has the right capabilities.\n" +" The optional ID is a 64-bit integer that's sent to the central scheduler.\n\n"); + +static int setup_time_travel_start(char *str) +{ + int err; + + err = kstrtoull(str, 0, &time_travel_start); + if (err) + return err; + + time_travel_start_set = 1; + return 1; +} + +__setup("time-travel-start=", setup_time_travel_start); +__uml_help(setup_time_travel_start, +"time-travel-start=<nanoseconds>\n" +" Configure the UML instance's wall clock to start at this value rather than\n" +" the host's wall clock at the time of UML boot.\n\n"); + +static struct kobject *bc_time_kobject; + +static ssize_t bc_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%llx", bc_message); +} + +static ssize_t bc_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) +{ + int ret; + u64 user_bc_message; + + ret = kstrtou64(buf, 0, &user_bc_message); + if (ret) + return ret; + + bc_message = user_bc_message; + + time_travel_ext_req(UM_TIMETRAVEL_BROADCAST, bc_message); + pr_info("um: time: sent broadcast message: 0x%llx\n", bc_message); + return count; +} + +static struct kobj_attribute bc_attribute = __ATTR(bc-message, 0660, bc_show, bc_store); + +static int __init um_bc_start(void) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return 0; + + bc_time_kobject = kobject_create_and_add("um-ext-time", kernel_kobj); + if (!bc_time_kobject) + return 0; + + if (sysfs_create_file(bc_time_kobject, &bc_attribute.attr)) + pr_debug("failed to create the bc file in /sys/kernel/um_time"); + + return 0; } +late_initcall(um_bc_start); +#endif diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 9472079471bb..39608cccf2c6 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -1,212 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/mm.h> #include <linux/module.h> -#include <linux/sched.h> -#include <asm/pgtable.h> +#include <linux/sched/signal.h> + #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <mem_user.h> #include <os.h> #include <skas.h> +#include <kern_util.h> -struct host_vm_change { - struct host_vm_op { - enum { NONE, MMAP, MUNMAP, MPROTECT } type; - union { - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - int fd; - __u64 offset; - } mmap; - struct { - unsigned long addr; - unsigned long len; - } munmap; - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - } mprotect; - } u; - } ops[1]; - int index; - struct mm_id *id; - void *data; - int force; -}; - -#define INIT_HVC(mm, force) \ - ((struct host_vm_change) \ - { .ops = { { .type = NONE } }, \ - .id = &mm->context.id, \ - .data = NULL, \ - .index = 0, \ - .force = force }) - -static int do_ops(struct host_vm_change *hvc, int end, - int finished) -{ - struct host_vm_op *op; - int i, ret = 0; - - for (i = 0; i < end && !ret; i++) { - op = &hvc->ops[i]; - switch (op->type) { - case MMAP: - ret = map(hvc->id, op->u.mmap.addr, op->u.mmap.len, - op->u.mmap.prot, op->u.mmap.fd, - op->u.mmap.offset, finished, &hvc->data); - break; - case MUNMAP: - ret = unmap(hvc->id, op->u.munmap.addr, - op->u.munmap.len, finished, &hvc->data); - break; - case MPROTECT: - ret = protect(hvc->id, op->u.mprotect.addr, - op->u.mprotect.len, op->u.mprotect.prot, - finished, &hvc->data); - break; - default: - printk(KERN_ERR "Unknown op type %d in do_ops\n", - op->type); - BUG(); - break; - } - } +struct vm_ops { + struct mm_id *mm_idp; - return ret; -} + int (*mmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset); + int (*unmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len); +}; -static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) +static int kern_map(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - __u64 offset; - struct host_vm_op *last; - int fd, ret = 0; - - fd = phys_mapping(phys, &offset); - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MMAP) && - (last->u.mmap.addr + last->u.mmap.len == virt) && - (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) && - (last->u.mmap.offset + last->u.mmap.len == offset)) { - last->u.mmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MMAP, - .u = { .mmap = { .addr = virt, - .len = len, - .prot = prot, - .fd = fd, - .offset = offset } - } }); - return ret; + /* TODO: Why is executable needed to be always set in the kernel? */ + return os_map_memory((void *)virt, phys_fd, offset, len, + prot & UM_PROT_READ, prot & UM_PROT_WRITE, + 1); } -static int add_munmap(unsigned long addr, unsigned long len, - struct host_vm_change *hvc) +static int kern_unmap(struct mm_id *mm_idp, + unsigned long virt, unsigned long len) { - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MUNMAP) && - (last->u.munmap.addr + last->u.mmap.len == addr)) { - last->u.munmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MUNMAP, - .u = { .munmap = { .addr = addr, - .len = len } } }); - return ret; + return os_unmap_memory((void *)virt, len); } -static int add_mprotect(unsigned long addr, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) +void report_enomem(void) { - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MPROTECT) && - (last->u.mprotect.addr + last->u.mprotect.len == addr) && - (last->u.mprotect.prot == prot)) { - last->u.mprotect.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MPROTECT, - .u = { .mprotect = { .addr = addr, - .len = len, - .prot = prot } } }); - return ret; + printk(KERN_ERR "UML ran out of memory on the host side! " + "This can happen due to a memory limitation or " + "vm.max_map_count has been reached.\n"); } -#define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1)) - static inline int update_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pte_t *pte; - int r, w, x, prot, ret = 0; + int ret = 0; pte = pte_offset_kernel(pmd, addr); do { - if ((addr >= STUB_START) && (addr < STUB_END)) + if (!pte_needsync(*pte)) continue; - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) - w = 0; - - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? UM_PROT_EXEC : 0)); - if (hvc->force || pte_newpage(*pte)) { - if (pte_present(*pte)) - ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, prot, hvc); - else - ret = add_munmap(addr, PAGE_SIZE, hvc); - } else if (pte_newprot(*pte)) - ret = add_mprotect(addr, PAGE_SIZE, prot, hvc); + if (pte_present(*pte)) { + __u64 offset; + unsigned long phys = pte_val(*pte) & PAGE_MASK; + int fd = phys_mapping(phys, &offset); + int r, w, x, prot; + + r = pte_read(*pte); + w = pte_write(*pte); + x = pte_exec(*pte); + if (!pte_young(*pte)) { + r = 0; + w = 0; + } else if (!pte_dirty(*pte)) + w = 0; + + prot = (r ? UM_PROT_READ : 0) | + (w ? UM_PROT_WRITE : 0) | + (x ? UM_PROT_EXEC : 0); + + ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, + prot, fd, offset); + } else + ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); + *pte = pte_mkuptodate(*pte); } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; @@ -214,7 +91,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, static inline int update_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pmd_t *pmd; unsigned long next; @@ -224,314 +101,125 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { - if (hvc->force || pmd_newpage(*pmd)) { - ret = add_munmap(addr, next - addr, hvc); + if (pmd_needsync(*pmd)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pmd_mkuptodate(*pmd); } } - else ret = update_pte_range(pmd, addr, next, hvc); + else ret = update_pte_range(pmd, addr, next, ops); } while (pmd++, addr = next, ((addr < end) && !ret)); return ret; } -static inline int update_pud_range(pgd_t *pgd, unsigned long addr, +static inline int update_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pud_t *pud; unsigned long next; int ret = 0; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (!pud_present(*pud)) { - if (hvc->force || pud_newpage(*pud)) { - ret = add_munmap(addr, next - addr, hvc); + if (pud_needsync(*pud)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pud_mkuptodate(*pud); } } - else ret = update_pmd_range(pud, addr, next, hvc); + else ret = update_pmd_range(pud, addr, next, ops); } while (pud++, addr = next, ((addr < end) && !ret)); return ret; } -void fix_range_common(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) +static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct vm_ops *ops) { - pgd_t *pgd; - struct host_vm_change hvc; - unsigned long addr = start_addr, next; + p4d_t *p4d; + unsigned long next; int ret = 0; - hvc = INIT_HVC(mm, force); - pgd = pgd_offset(mm, addr); + p4d = p4d_offset(pgd, addr); do { - next = pgd_addr_end(addr, end_addr); - if (!pgd_present(*pgd)) { - if (force || pgd_newpage(*pgd)) { - ret = add_munmap(addr, next - addr, &hvc); - pgd_mkuptodate(*pgd); + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + if (p4d_needsync(*p4d)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); + p4d_mkuptodate(*p4d); } - } - else ret = update_pud_range(pgd, addr, next, &hvc); - } while (pgd++, addr = next, ((addr < end_addr) && !ret)); - - if (!ret) - ret = do_ops(&hvc, hvc.index, 1); - - /* This is not an else because ret is modified above */ - if (ret) { - printk(KERN_ERR "fix_range_common: failed, killing current " - "process\n"); - force_sig(SIGKILL, current); - } + } else + ret = update_pud_range(p4d, addr, next, ops); + } while (p4d++, addr = next, ((addr < end) && !ret)); + return ret; } -static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) +int um_tlb_sync(struct mm_struct *mm) { - struct mm_struct *mm; pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long addr, last; - int updated = 0, err; - - mm = &init_mm; - for (addr = start; addr < end;) { - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) { - last = ADD_ROUND(addr, PGDIR_SIZE); - if (last > end) - last = end; - if (pgd_newpage(*pgd)) { - updated = 1; - err = os_unmap_memory((void *) addr, - last - addr); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - last = ADD_ROUND(addr, PUD_SIZE); - if (last > end) - last = end; - if (pud_newpage(*pud)) { - updated = 1; - err = os_unmap_memory((void *) addr, - last - addr); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - last = ADD_ROUND(addr, PMD_SIZE); - if (last > end) - last = end; - if (pmd_newpage(*pmd)) { - updated = 1; - err = os_unmap_memory((void *) addr, - last - addr); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pte = pte_offset_kernel(pmd, addr); - if (!pte_present(*pte) || pte_newpage(*pte)) { - updated = 1; - err = os_unmap_memory((void *) addr, - PAGE_SIZE); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - if (pte_present(*pte)) - map_memory(addr, - pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, 1, 1, 1); - } - else if (pte_newprot(*pte)) { - updated = 1; - os_protect_memory((void *) addr, PAGE_SIZE, 1, 1, 1); - } - addr += PAGE_SIZE; - } - return updated; -} + struct vm_ops ops; + unsigned long addr, next; + int ret = 0; -void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - struct mm_struct *mm = vma->vm_mm; - void *flush = NULL; - int r, w, x, prot, err = 0; - struct mm_id *mm_id; - - address &= PAGE_MASK; - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto kill; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto kill; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - goto kill; - - pte = pte_offset_kernel(pmd, address); - - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) { - w = 0; - } + guard(spinlock_irqsave)(&mm->context.sync_tlb_lock); - mm_id = &mm->context.id; - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? UM_PROT_EXEC : 0)); - if (pte_newpage(*pte)) { - if (pte_present(*pte)) { - unsigned long long offset; - int fd; + if (mm->context.sync_tlb_range_to == 0) + return 0; - fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); - err = map(mm_id, address, PAGE_SIZE, prot, fd, offset, - 1, &flush); - } - else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush); + ops.mm_idp = &mm->context.id; + if (mm == &init_mm) { + ops.mmap = kern_map; + ops.unmap = kern_unmap; + } else { + ops.mmap = map; + ops.unmap = unmap; } - else if (pte_newprot(*pte)) - err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush); - - if (err) - goto kill; - *pte = pte_mkuptodate(*pte); - - return; - -kill: - printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); - force_sig(SIGKILL, current); -} - -pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address) -{ - return pgd_offset(mm, address); -} - -pud_t *pud_offset_proc(pgd_t *pgd, unsigned long address) -{ - return pud_offset(pgd, address); -} - -pmd_t *pmd_offset_proc(pud_t *pud, unsigned long address) -{ - return pmd_offset(pud, address); -} + addr = mm->context.sync_tlb_range_from; + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); + if (!pgd_present(*pgd)) { + if (pgd_needsync(*pgd)) { + ret = ops.unmap(ops.mm_idp, addr, + next - addr); + pgd_mkuptodate(*pgd); + } + } else + ret = update_p4d_range(pgd, addr, next, &ops); + } while (pgd++, addr = next, + ((addr < mm->context.sync_tlb_range_to) && !ret)); -pte_t *pte_offset_proc(pmd_t *pmd, unsigned long address) -{ - return pte_offset_kernel(pmd, address); -} + if (ret == -ENOMEM) + report_enomem(); -pte_t *addr_pte(struct task_struct *task, unsigned long addr) -{ - pgd_t *pgd = pgd_offset(task->mm, addr); - pud_t *pud = pud_offset(pgd, addr); - pmd_t *pmd = pmd_offset(pud, addr); + mm->context.sync_tlb_range_from = 0; + mm->context.sync_tlb_range_to = 0; - return pte_offset_map(pmd, addr); + return ret; } void flush_tlb_all(void) { - flush_tlb_mm(current->mm); -} - -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_tlb_kernel_range_common(start, end); -} - -void flush_tlb_kernel_vm(void) -{ - flush_tlb_kernel_range_common(start_vm, end_vm); -} - -void __flush_tlb_one(unsigned long addr) -{ - flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE); -} - -static void fix_range(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) -{ - fix_range_common(mm, start_addr, end_addr, force); -} - -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_mm == NULL) - flush_tlb_kernel_range_common(start, end); - else fix_range(vma->vm_mm, start, end, 0); -} -EXPORT_SYMBOL(flush_tlb_range); - -void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ /* * Don't bother flushing if this address space is about to be * destroyed. */ - if (atomic_read(&mm->mm_users) == 0) + if (atomic_read(¤t->mm->mm_users) == 0) return; - fix_range(mm, start, end, 0); + flush_tlb_mm(current->mm); } void flush_tlb_mm(struct mm_struct *mm) { - struct vm_area_struct *vma = mm->mmap; - - while (vma != NULL) { - fix_range(mm, vma->vm_start, vma->vm_end, 0); - vma = vma->vm_next; - } -} + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); -void force_flush_all(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = mm->mmap; - - while (vma != NULL) { - fix_range(mm, vma->vm_start, vma->vm_end, 1); - vma = vma->vm_next; - } + for_each_vma(vmi, vma) + um_tlb_mark_sync(mm, vma->vm_start, vma->vm_end); } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 089f3987e273..177615820a4c 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -1,14 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <linux/mm.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/hardirq.h> #include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/sched/debug.h> #include <asm/current.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <arch.h> #include <as-layout.h> @@ -17,7 +18,123 @@ #include <skas.h> /* - * Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by + * NOTE: UML does not have exception tables. As such, this is almost a copy + * of the code in mm/memory.c, only adjusting the logic to simply check whether + * we are coming from the kernel instead of doing an additional lookup in the + * exception table. + * We can do this simplification because we never get here if the exception was + * fixable. + */ +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (!is_user) + return false; + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + mmap_read_unlock(mm); + if (!is_user) + return false; + + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalend to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +static struct vm_area_struct * +um_lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, bool is_user) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} + +/* + * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by * segv(). */ int handle_page_fault(unsigned long address, unsigned long ip, @@ -25,79 +142,68 @@ int handle_page_fault(unsigned long address, unsigned long ip, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; int err = -EFAULT; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (is_write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_DEFAULT; *code_out = SEGV_MAPERR; /* - * If the fault was during atomic operation, don't take the fault, just + * If the fault was with pagefaults disabled, don't take the fault, just * fail. */ - if (in_atomic()) + if (faulthandler_disabled()) goto out_nosemaphore; + if (is_user) + flags |= FAULT_FLAG_USER; retry: - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); + vma = um_lock_mm_and_find_vma(mm, address, is_user); if (!vma) - goto out; - else if (vma->vm_start <= address) - goto good_area; - else if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - else if (is_user && !ARCH_IS_STACKGROW(address)) - goto out; - else if (expand_stack(vma, address)) - goto out; + goto out_nosemaphore; -good_area: *code_out = SEGV_ACCERR; - if (is_write && !(vma->vm_flags & VM_WRITE)) - goto out; - - /* Don't require VM_READ|VM_EXEC for write faults! */ - if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) - goto out; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out; + flags |= FAULT_FLAG_WRITE; + } else { + /* Don't require VM_READ|VM_EXEC for write faults! */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out; + } do { - int fault; + vm_fault_t fault; - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; + } else if (fault & VM_FAULT_SIGSEGV) { + goto out; } else if (fault & VM_FAULT_SIGBUS) { err = -EACCES; goto out; } BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - if (fault & VM_FAULT_RETRY) { - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - - goto retry; - } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + + goto retry; } - pgd = pgd_offset(mm, address); - pud = pud_offset(pgd, address); - pmd = pmd_offset(pud, address); + pmd = pmd_off(mm, address); pte = pte_offset_kernel(pmd, address); } while (!pte_present(*pte)); err = 0; @@ -112,9 +218,9 @@ good_area: #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif - flush_tlb_page(vma, address); + out: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out_nosemaphore: return err; @@ -123,11 +229,12 @@ out_of_memory: * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); + if (!is_user) + goto out_nosemaphore; pagefault_out_of_memory(); return 0; } -EXPORT_SYMBOL(handle_page_fault); static void show_segv_info(struct uml_pt_regs *regs) { @@ -140,7 +247,7 @@ static void show_segv_info(struct uml_pt_regs *regs) if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi), (void *)UPT_IP(regs), (void *)UPT_SP(regs), @@ -152,19 +259,14 @@ static void show_segv_info(struct uml_pt_regs *regs) static void bad_segv(struct faultinfo fi, unsigned long ip) { - struct siginfo si; - - si.si_signo = SIGSEGV; - si.si_code = SEGV_ACCERR; - si.si_addr = (void __user *) FAULT_ADDRESS(fi); current->thread.arch.faultinfo = fi; - force_sig_info(SIGSEGV, &si, current); + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi)); } void fatal_sigsegv(void) { - force_sigsegv(SIGSEGV, current); - do_signal(); + force_fatal_sig(SIGSEGV); + do_signal(¤t->thread.regs); /* * This is to tell gcc that we're not returning - do_signal * can, in general, return, but in this case, it's not, since @@ -173,7 +275,19 @@ void fatal_sigsegv(void) os_dump_core(); } -void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +/** + * segv_handler() - the SIGSEGV handler + * @sig: the signal number + * @unused_si: the signal info struct; unused in this handler + * @regs: the ptrace register information + * @mc: the mcontext of the signal + * + * The handler first extracts the faultinfo from the UML ptrace regs struct. + * If the userfault did not happen in an UML userspace process, bad_segv is called. + * Otherwise the signal did happen in a cloned userspace process, handle it. + */ +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -182,7 +296,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) bad_segv(*fi, UPT_IP(regs)); return; } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); } /* @@ -192,26 +306,55 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * give us bad data! */ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, void *mc) { - struct siginfo si; - jmp_buf *catcher; + int si_code; int err; int is_write = FAULT_WRITE(fi); unsigned long address = FAULT_ADDRESS(fi); - if (!is_user && (address >= start_vm) && (address < end_vm)) { - flush_tlb_kernel_vm(); - return 0; + if (!is_user && regs) + current->thread.segv_regs = container_of(regs, struct pt_regs, regs); + + if (!is_user && address >= start_vm && address < end_vm) { + /* + * Kernel has pending updates from set_ptes that were not + * flushed yet. Syncing them should fix the pagefault (if not + * we'll get here again and panic). + */ + err = um_tlb_sync(&init_mm); + if (err == -ENOMEM) + report_enomem(); + if (err) + panic("Failed to sync kernel TLBs: %d", err); + goto out; + } + else if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); + } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; } else if (current->mm == NULL) { show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } + else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx", + address, ip); + } - if (SEGV_IS_FIXABLE(&fi) || SEGV_MAYBE_FIXABLE(&fi)) + if (SEGV_IS_FIXABLE(&fi)) err = handle_page_fault(address, ip, is_write, is_user, - &si.si_code); + &si_code); else { err = -EFAULT; /* @@ -222,17 +365,10 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, address = 0; } - catcher = current->thread.fault_catcher; if (!err) - return 0; - else if (catcher != NULL) { - current->thread.fault_addr = (void *) address; - UML_LONGJMP(catcher, 1); - } - else if (current->thread.fault_addr != NULL) - panic("fault_addr set but no fault catcher"); + goto out; else if (!is_user && arch_fixup(ip, regs)) - return 0; + goto out; if (!is_user) { show_regs(container_of(regs, struct pt_regs, regs)); @@ -243,27 +379,25 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, show_segv_info(regs); if (err == -EACCES) { - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_ADRERR; - si.si_addr = (void __user *)address; current->thread.arch.faultinfo = fi; - force_sig_info(SIGBUS, &si, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } else { BUG_ON(err != -EFAULT); - si.si_signo = SIGSEGV; - si.si_addr = (void __user *) address; current->thread.arch.faultinfo = fi; - force_sig_info(SIGSEGV, &si, current); + force_sig_fault(SIGSEGV, si_code, (void __user *) address); } + +out: + if (regs) + current->thread.segv_regs = NULL; + return 0; } -void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) { - struct faultinfo *fi; - struct siginfo clean_si; - + int code, err; if (!UPT_IS_USER(regs)) { if (sig == SIGBUS) printk(KERN_ERR "Bus error - the host /dev/shm or /tmp " @@ -273,44 +407,24 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) arch_examine_signal(sig, regs); - memset(&clean_si, 0, sizeof(clean_si)); - clean_si.si_signo = si->si_signo; - clean_si.si_errno = si->si_errno; - clean_si.si_code = si->si_code; - switch (sig) { - case SIGILL: - case SIGFPE: - case SIGSEGV: - case SIGBUS: - case SIGTRAP: - fi = UPT_FAULTINFO(regs); - clean_si.si_addr = (void __user *) FAULT_ADDRESS(*fi); + /* Is the signal layout for the signal known? + * Signal data must be scrubbed to prevent information leaks. + */ + code = si->si_code; + err = si->si_errno; + if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) { + struct faultinfo *fi = UPT_FAULTINFO(regs); current->thread.arch.faultinfo = *fi; -#ifdef __ARCH_SI_TRAPNO - clean_si.si_trapno = si->si_trapno; -#endif - break; - default: - printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d)\n", - sig, si->si_code); + force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi)); + } else { + printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n", + sig, code, err); + force_sig(sig); } - - force_sig_info(sig, &clean_si, current); -} - -void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs) -{ - if (current->thread.fault_catcher != NULL) - UML_LONGJMP(current->thread.fault_catcher, 1); - else - relay_signal(sig, si, regs); } -void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { do_IRQ(WINCH_IRQ, regs); } - -void trap_init(void) -{ -} diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 87df5e3acc26..e2b24e1ecfa6 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -1,19 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ +#include <linux/cpu.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/ctype.h> #include <linux/module.h> +#include <linux/panic_notifier.h> #include <linux/seq_file.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/utsname.h> #include <linux/sched.h> -#include <asm/pgtable.h> +#include <linux/sched/task.h> +#include <linux/kmsg_dump.h> +#include <linux/suspend.h> +#include <linux/random.h> +#include <linux/smp-internal.h> + #include <asm/processor.h> +#include <asm/cpufeature.h> +#include <asm/sections.h> #include <asm/setup.h> +#include <asm/text-patching.h> #include <as-layout.h> #include <arch.h> #include <init.h> @@ -22,7 +34,10 @@ #include <mem_user.h> #include <os.h> -#define DEFAULT_COMMAND_LINE "root=98:0" +#include "um_arch.h" + +#define DEFAULT_COMMAND_LINE_ROOT "root=98:0" +#define DEFAULT_COMMAND_LINE_CONSOLE "console=tty0" /* Changed in add_arg and setup_arch, which run before SMP is started */ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 }; @@ -30,7 +45,7 @@ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 }; static void __init add_arg(char *arg) { if (strlen(command_line) + strlen(arg) + 1 > COMMAND_LINE_SIZE) { - printf("add_arg: Too many command line arguments!\n"); + os_warn("add_arg: Too many command line arguments!\n"); exit(1); } if (strlen(command_line) > 0) @@ -40,43 +55,42 @@ static void __init add_arg(char *arg) /* * These fields are initialized at boot time and not changed. - * XXX This structure is used only in the non-SMP case. Maybe this - * should be moved to smp.c. */ struct cpuinfo_um boot_cpu_data = { .loops_per_jiffy = 0, - .ipi_pipe = { -1, -1 } + .cache_alignment = L1_CACHE_BYTES, + .x86_capability = { 0 } }; -union thread_union cpu0_irqstack - __attribute__((__section__(".data..init_irqstack"))) = - { INIT_THREAD_INFO(init_task) }; +EXPORT_SYMBOL(boot_cpu_data); -unsigned long thread_saved_pc(struct task_struct *task) -{ - /* FIXME: Need to look up userspace_pid by cpu */ - return os_process_pc(userspace_pid[0]); -} /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; static int show_cpuinfo(struct seq_file *m, void *v) { - int index = 0; + int i = 0; -#ifdef CONFIG_SMP - index = (struct cpuinfo_um *) v - cpu_data; - if (!cpu_online(index)) +#if IS_ENABLED(CONFIG_SMP) + i = (uintptr_t) v - 1; + if (!cpu_online(i)) return 0; #endif - seq_printf(m, "processor\t: %d\n", index); + seq_printf(m, "processor\t: %d\n", i); seq_printf(m, "vendor_id\t: User Mode Linux\n"); seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "bogomips\t: %lu.%02lu\n\n", + seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU))); + seq_printf(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\n"); + seq_printf(m, "cache_alignment\t: %d\n", boot_cpu_data.cache_alignment); + seq_printf(m, "bogomips\t: %lu.%02lu\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100); @@ -85,7 +99,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < NR_CPUS ? cpu_data + *pos : NULL; + if (*pos < nr_cpu_ids) + return (void *)(uintptr_t)(*pos + 1); + return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) @@ -113,14 +129,13 @@ unsigned long uml_reserved; /* Also modified in mem_init */ unsigned long start_vm; unsigned long end_vm; -/* Set in uml_ncpus_setup */ -int ncpus = 1; - /* Set in early boot */ -static int have_root __initdata = 0; +static int have_root __initdata; +static int have_console __initdata; /* Set in uml_mem_setup and modified in linux_main */ -long long physmem_size = 32 * 1024 * 1024; +unsigned long long physmem_size = 64 * 1024 * 1024; +EXPORT_SYMBOL(physmem_size); static const char *usage_string = "User Mode Linux v%s\n" @@ -128,6 +143,7 @@ static const char *usage_string = static int __init uml_version_setup(char *line, int *add) { + /* Explicitly use printf() to show version in stdout */ printf("%s\n", init_utsname()->release); exit(0); @@ -154,42 +170,24 @@ __uml_setup("root=", uml_root_setup, " root=/dev/ubd5\n\n" ); -static int __init no_skas_debug_setup(char *line, int *add) +static int __init uml_console_setup(char *line, int *add) { - printf("'debug' is not necessary to gdb UML in skas mode - run \n"); - printf("'gdb linux'\n"); - + have_console = 1; return 0; } -__uml_setup("debug", no_skas_debug_setup, -"debug\n" -" this flag is not needed to run gdb on UML in skas mode\n\n" +__uml_setup("console=", uml_console_setup, +"console=<preferred console>\n" +" Specify the preferred console output driver\n\n" ); -#ifdef CONFIG_SMP -static int __init uml_ncpus_setup(char *line, int *add) -{ - if (!sscanf(line, "%d", &ncpus)) { - printf("Couldn't parse [%s]\n", line); - return -1; - } - - return 0; -} - -__uml_setup("ncpus=", uml_ncpus_setup, -"ncpus=<# of desired CPUs>\n" -" This tells an SMP kernel how many virtual processors to start.\n\n" -); -#endif - static int __init Usage(char *line, int *add) { const char **p; printf(usage_string, init_utsname()->release); p = &__uml_help_start; + /* Explicitly use printf() to show help in stdout */ while (p < &__uml_help_end) { printf("%s", *p); p++; @@ -233,42 +231,86 @@ static void __init uml_postsetup(void) static int panic_exit(struct notifier_block *self, unsigned long unused1, void *unused2) { + kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(1); - show_regs(&(current->thread.regs)); bust_spinlocks(0); uml_exitcode = 1; os_dump_core(); - return 0; + + return NOTIFY_DONE; } static struct notifier_block panic_exit_notifier = { - .notifier_call = panic_exit, - .next = NULL, - .priority = 0 + .notifier_call = panic_exit, + .priority = INT_MAX - 1, /* run as 2nd notifier, won't return */ }; +void uml_finishsetup(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &panic_exit_notifier); + + uml_postsetup(); + + new_thread_handler(); +} + /* Set during early boot */ +unsigned long stub_start; unsigned long task_size; EXPORT_SYMBOL(task_size); -unsigned long host_task_size; - unsigned long brk_start; -unsigned long end_iomem; -EXPORT_SYMBOL(end_iomem); #define MIN_VMALLOC (32 * 1024 * 1024) -extern char __binary_start; +static void __init parse_host_cpu_flags(char *line) +{ + int i; + for (i = 0; i < 32*NCAPINTS; i++) { + if ((x86_cap_flags[i] != NULL) && strstr(line, x86_cap_flags[i])) + set_cpu_cap(&boot_cpu_data, i); + } +} + +static void __init parse_cache_line(char *line) +{ + long res; + char *to_parse = strstr(line, ":"); + if (to_parse) { + to_parse++; + while (*to_parse != 0 && isspace(*to_parse)) { + to_parse++; + } + if (kstrtoul(to_parse, 10, &res) == 0 && is_power_of_2(res)) + boot_cpu_data.cache_alignment = res; + else + boot_cpu_data.cache_alignment = L1_CACHE_BYTES; + } +} -int __init linux_main(int argc, char **argv) +static unsigned long __init get_top_address(char **envp) +{ + unsigned long top_addr = (unsigned long) &top_addr; + int i; + + /* The earliest variable should be after the program name in ELF */ + for (i = 0; envp[i]; i++) { + if ((unsigned long) envp[i] > top_addr) + top_addr = (unsigned long) envp[i]; + } + + return PAGE_ALIGN(top_addr + 1); +} + +int __init linux_main(int argc, char **argv, char **envp) { unsigned long avail, diff; unsigned long virtmem_size, max_physmem; + unsigned long host_task_size; unsigned long stack; unsigned int i; int add; - char * mode; for (i = 1; i < argc; i++) { if ((i == 1) && (argv[i][0] == ' ')) @@ -279,26 +321,31 @@ int __init linux_main(int argc, char **argv) add_arg(argv[i]); } if (have_root == 0) - add_arg(DEFAULT_COMMAND_LINE); + add_arg(DEFAULT_COMMAND_LINE_ROOT); + + if (have_console == 0) + add_arg(DEFAULT_COMMAND_LINE_CONSOLE); + + host_task_size = get_top_address(envp); + /* reserve a few pages for the stubs */ + stub_start = host_task_size - STUB_SIZE; + host_task_size = stub_start; + + /* Limit TASK_SIZE to what is addressable by the page table */ + task_size = host_task_size; + if (task_size > (unsigned long long) PTRS_PER_PGD * PGDIR_SIZE) + task_size = PTRS_PER_PGD * PGDIR_SIZE; - host_task_size = os_get_top_address(); /* * TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps * out */ - task_size = host_task_size & PGDIR_MASK; + task_size = task_size & PGDIR_MASK; /* OS sanity checks that need to happen before the kernel runs */ os_early_checks(); - can_do_skas(); - - if (proc_mm && ptrace_faultinfo) - mode = "SKAS3"; - else - mode = "SKAS0"; - - printf("UML running in %s mode\n", mode); + get_host_cpu_features(parse_host_cpu_flags, parse_cache_line); brk_start = (unsigned long) sbrk(0); @@ -307,54 +354,32 @@ int __init linux_main(int argc, char **argv) * so they actually get what they asked for. This should * add zero for non-exec shield users */ - - diff = UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end); + diff = PAGE_ALIGN(brk_start) - PAGE_ALIGN((unsigned long) &_end); if (diff > 1024 * 1024) { - printf("Adding %ld bytes to physical memory to account for " - "exec-shield gap\n", diff); - physmem_size += UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end); + os_info("Adding %ld bytes to physical memory to account for " + "exec-shield gap\n", diff); + physmem_size += diff; } - uml_physmem = (unsigned long) &__binary_start & PAGE_MASK; + uml_physmem = (unsigned long) __binary_start & PAGE_MASK; /* Reserve up to 4M after the current brk */ uml_reserved = ROUND_4M(brk_start) + (1 << 22); setup_machinename(init_utsname()->machine); - highmem = 0; - iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; - max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; - - /* - * Zones have to begin on a 1 << MAX_ORDER page boundary, - * so this makes sure that's true for highmem - */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1); - if (physmem_size + iomem_size > max_physmem) { - highmem = physmem_size + iomem_size - max_physmem; - physmem_size -= highmem; -#ifndef CONFIG_HIGHMEM - highmem = 0; - printf("CONFIG_HIGHMEM not enabled - physical memory shrunk " - "to %Lu bytes\n", physmem_size); -#endif + physmem_size = PAGE_ALIGN(physmem_size); + max_physmem = TASK_SIZE - uml_physmem - MIN_VMALLOC; + if (physmem_size > max_physmem) { + physmem_size = max_physmem; + os_info("Physical memory size shrunk to %llu bytes\n", + physmem_size); } high_physmem = uml_physmem + physmem_size; - end_iomem = high_physmem + iomem_size; - high_memory = (void *) end_iomem; start_vm = VMALLOC_START; - setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); - if (init_maps(physmem_size, iomem_size, highmem)) { - printf("Failed to allocate mem_map for %Lu bytes of physical " - "memory and %Lu bytes of highmem\n", physmem_size, - highmem); - exit(1); - } - virtmem_size = physmem_size; stack = (unsigned long) argv; stack &= ~(1024 * 1024 - 1); @@ -364,39 +389,70 @@ int __init linux_main(int argc, char **argv) end_vm = start_vm + virtmem_size; if (virtmem_size < physmem_size) - printf("Kernel virtual memory size shrunk to %lu bytes\n", - virtmem_size); + os_info("Kernel virtual memory size shrunk to %lu bytes\n", + virtmem_size); - atomic_notifier_chain_register(&panic_notifier_list, - &panic_exit_notifier); - - uml_postsetup(); + arch_task_struct_size = sizeof(struct task_struct) + host_fp_size; - stack_protections((unsigned long) &init_thread_info); os_flush_stdout(); return start_uml(); } +int __init __weak read_initrd(void) +{ + return 0; +} + void __init setup_arch(char **cmdline_p) { + u8 rng_seed[32]; + + stack_protections((unsigned long) init_task.stack); + setup_physmem(uml_physmem, uml_reserved, physmem_size); + uml_dtb_init(); + read_initrd(); + paging_init(); - strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; setup_hostinfo(host_info, sizeof host_info); + prefill_possible_map(); + + if (os_getrandom(rng_seed, sizeof(rng_seed), 0) == sizeof(rng_seed)) { + add_bootloader_randomness(rng_seed, sizeof(rng_seed)); + memzero_explicit(rng_seed, sizeof(rng_seed)); + } } -void __init check_bugs(void) +void __init arch_cpu_finalize_init(void) { arch_check_bugs(); os_check_bugs(); } +void apply_seal_endbr(s32 *start, s32 *end) +{ +} + +void apply_retpolines(s32 *start, s32 *end) +{ +} + +void apply_returns(s32 *start, s32 *end) +{ +} + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } -#ifdef CONFIG_SMP +#if IS_ENABLED(CONFIG_SMP) void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) @@ -407,3 +463,90 @@ void alternatives_smp_module_del(struct module *mod) { } #endif + +void *text_poke(void *addr, const void *opcode, size_t len) +{ + /* + * In UML, the only reference to this function is in + * apply_relocate_add(), which shouldn't ever actually call this + * because UML doesn't have live patching. + */ + WARN_ON(1); + + return memcpy(addr, opcode, len); +} + +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + return text_poke(addr, opcode, len); +} + +void smp_text_poke_sync_each_cpu(void) +{ +} + +void uml_pm_wake(void) +{ + pm_system_wakeup(); +} + +#ifdef CONFIG_PM_SLEEP +static int um_suspend_valid(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} + +static int um_suspend_prepare(void) +{ + um_irqs_suspend(); + return 0; +} + +static int um_suspend_enter(suspend_state_t state) +{ + if (WARN_ON(state != PM_SUSPEND_MEM)) + return -EINVAL; + + /* + * This is identical to the idle sleep, but we've just + * (during suspend) turned off all interrupt sources + * except for the ones we want, so now we can only wake + * up on something we actually want to wake up on. All + * timing has also been suspended. + */ + um_idle_sleep(); + return 0; +} + +static void um_suspend_finish(void) +{ + um_irqs_resume(); +} + +const struct platform_suspend_ops um_suspend_ops = { + .valid = um_suspend_valid, + .prepare = um_suspend_prepare, + .enter = um_suspend_enter, + .finish = um_suspend_finish, +}; + +static int init_pm_wake_signal(void) +{ + /* + * In external time-travel mode we can't use signals to wake up + * since that would mess with the scheduling. We'll have to do + * some additional work to support wakeup on virtio devices or + * similar, perhaps implementing a fake RTC controller that can + * trigger wakeup (and request the appropriate scheduling from + * the external scheduler when going to suspend.) + */ + if (time_travel_mode != TT_MODE_EXTERNAL) + register_pm_wake_signal(); + + suspend_set_ops(&um_suspend_ops); + + return 0; +} + +late_initcall(init_pm_wake_signal); +#endif diff --git a/arch/um/kernel/um_arch.h b/arch/um/kernel/um_arch.h new file mode 100644 index 000000000000..46e731ab9dfc --- /dev/null +++ b/arch/um/kernel/um_arch.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __UML_ARCH_H__ +#define __UML_ARCH_H__ + +extern void * __init uml_load_file(const char *filename, unsigned long long *size); + +#ifdef CONFIG_OF +extern void __init uml_dtb_init(void); +#else +static inline void uml_dtb_init(void) { } +#endif + +extern int __init read_initrd(void); + +#endif diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c index f6cc3bd61781..72bc60ade347 100644 --- a/arch/um/kernel/umid.c +++ b/arch/um/kernel/umid.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <asm/errno.h> @@ -9,21 +9,21 @@ #include <os.h> /* Changed by set_umid_arg */ -static int umid_inited = 0; +static int umid_inited; static int __init set_umid_arg(char *name, int *add) { int err; if (umid_inited) { - printf("umid already set\n"); + os_warn("umid already set\n"); return 0; } *add = 0; err = set_umid(name); if (err == -EEXIST) - printf("umid '%s' already in use\n", name); + os_warn("umid '%s' already in use\n", name); else if (!err) umid_inited = 1; diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 6899195602b7..a409d4b66114 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -1,4 +1,5 @@ -#include <asm-generic/vmlinux.lds.h> +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/vmlinux.lds.h> #include <asm/page.h> OUTPUT_FORMAT(ELF_FORMAT) @@ -6,6 +7,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { /* This must contain the right address - not quite the default ELF one.*/ @@ -18,10 +25,10 @@ SECTIONS __binary_start = START; . = START + SIZEOF_HEADERS; + . = ALIGN(PAGE_SIZE); _text = .; INIT_TEXT_SECTION(0) - . = ALIGN(PAGE_SIZE); .text : { @@ -29,6 +36,8 @@ SECTIONS TEXT_TEXT SCHED_TEXT LOCK_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning) @@ -68,8 +77,6 @@ SECTIONS .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.gnu.linkonce.d*) CONSTRUCTORS @@ -85,6 +92,7 @@ SECTIONS } .got : { *(.got.plt) *(.got) } + .eh_frame : { KEEP (*(.eh_frame)) } .dynamic : { *(.dynamic) } .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } @@ -104,8 +112,8 @@ SECTIONS PROVIDE (end = .); STABS_DEBUG - DWARF_DEBUG + ELF_DETAILS DISCARDS } diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S index 16e49bfa2b42..53d719c04ba9 100644 --- a/arch/um/kernel/vmlinux.lds.S +++ b/arch/um/kernel/vmlinux.lds.S @@ -1,4 +1,4 @@ - +#define RUNTIME_DISCARD_EXIT KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER); #ifdef CONFIG_LD_SCRIPT_STATIC diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 08ff5094fcdd..f8d672d570d9 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -1,20 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) -# Licensed under the GPL # -obj-y = aio.o execvp.o file.o helper.o irq.o main.o mem.o process.o \ +# Don't instrument UML-specific code +KCOV_INSTRUMENT := n + +obj-y = elf_aux.o execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ - umid.o user_syms.o util.o drivers/ skas/ + umid.o user_syms.o util.o skas/ -obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o +CFLAGS_signal.o += -Wframe-larger-than=4096 -USER_OBJS := $(user-objs-y) aio.o elf_aux.o execvp.o file.o helper.o irq.o \ - main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ - tty.o umid.o util.o +CFLAGS_main.o += -Wno-frame-larger-than -HAVE_AIO_ABI := $(shell [ -r /usr/include/linux/aio_abi.h ] && \ - echo -DHAVE_AIO_ABI ) -CFLAGS_aio.o += $(HAVE_AIO_ABI) +obj-$(CONFIG_SMP) += smp.o + +USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ + main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ + tty.o umid.o util.o smp.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c deleted file mode 100644 index 3a6bc2af0961..000000000000 --- a/arch/um/os-Linux/aio.c +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <unistd.h> -#include <sched.h> -#include <signal.h> -#include <errno.h> -#include <sys/time.h> -#include <asm/unistd.h> -#include <aio.h> -#include <init.h> -#include <kern_util.h> -#include <os.h> - -struct aio_thread_req { - enum aio_type type; - int io_fd; - unsigned long long offset; - char *buf; - int len; - struct aio_context *aio; -}; - -#if defined(HAVE_AIO_ABI) -#include <linux/aio_abi.h> - -/* - * If we have the headers, we are going to build with AIO enabled. - * If we don't have aio in libc, we define the necessary stubs here. - */ - -#if !defined(HAVE_AIO_LIBC) - -static long io_setup(int n, aio_context_t *ctxp) -{ - return syscall(__NR_io_setup, n, ctxp); -} - -static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp) -{ - return syscall(__NR_io_submit, ctx, nr, iocbpp); -} - -static long io_getevents(aio_context_t ctx_id, long min_nr, long nr, - struct io_event *events, struct timespec *timeout) -{ - return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout); -} - -#endif - -/* - * The AIO_MMAP cases force the mmapped page into memory here - * rather than in whatever place first touches the data. I used - * to do this by touching the page, but that's delicate because - * gcc is prone to optimizing that away. So, what's done here - * is we read from the descriptor from which the page was - * mapped. The caller is required to pass an offset which is - * inside the page that was mapped. Thus, when the read - * returns, we know that the page is in the page cache, and - * that it now backs the mmapped area. - */ - -static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf, - int len, unsigned long long offset, struct aio_context *aio) -{ - struct iocb *iocbp = & ((struct iocb) { - .aio_data = (unsigned long) aio, - .aio_fildes = fd, - .aio_buf = (unsigned long) buf, - .aio_nbytes = len, - .aio_offset = offset - }); - char c; - - switch (type) { - case AIO_READ: - iocbp->aio_lio_opcode = IOCB_CMD_PREAD; - break; - case AIO_WRITE: - iocbp->aio_lio_opcode = IOCB_CMD_PWRITE; - break; - case AIO_MMAP: - iocbp->aio_lio_opcode = IOCB_CMD_PREAD; - iocbp->aio_buf = (unsigned long) &c; - iocbp->aio_nbytes = sizeof(c); - break; - default: - printk(UM_KERN_ERR "Bogus op in do_aio - %d\n", type); - return -EINVAL; - } - - return (io_submit(ctx, 1, &iocbp) > 0) ? 0 : -errno; -} - -/* Initialized in an initcall and unchanged thereafter */ -static aio_context_t ctx = 0; - -static int aio_thread(void *arg) -{ - struct aio_thread_reply reply; - struct io_event event; - int err, n, reply_fd; - - signal(SIGWINCH, SIG_IGN); - - while (1) { - n = io_getevents(ctx, 1, 1, &event, NULL); - if (n < 0) { - if (errno == EINTR) - continue; - printk(UM_KERN_ERR "aio_thread - io_getevents failed, " - "errno = %d\n", errno); - } - else { - reply = ((struct aio_thread_reply) - { .data = (void *) (long) event.data, - .err = event.res }); - reply_fd = ((struct aio_context *) reply.data)->reply_fd; - err = write(reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) - printk(UM_KERN_ERR "aio_thread - write failed, " - "fd = %d, err = %d\n", reply_fd, errno); - } - } - return 0; -} - -#endif - -static int do_not_aio(struct aio_thread_req *req) -{ - char c; - unsigned long long actual; - int n; - - actual = lseek64(req->io_fd, req->offset, SEEK_SET); - if (actual != req->offset) - return -errno; - - switch (req->type) { - case AIO_READ: - n = read(req->io_fd, req->buf, req->len); - break; - case AIO_WRITE: - n = write(req->io_fd, req->buf, req->len); - break; - case AIO_MMAP: - n = read(req->io_fd, &c, sizeof(c)); - break; - default: - printk(UM_KERN_ERR "do_not_aio - bad request type : %d\n", - req->type); - return -EINVAL; - } - - if (n < 0) - return -errno; - return 0; -} - -/* These are initialized in initcalls and not changed */ -static int aio_req_fd_r = -1; -static int aio_req_fd_w = -1; -static int aio_pid = -1; -static unsigned long aio_stack; - -static int not_aio_thread(void *arg) -{ - struct aio_thread_req req; - struct aio_thread_reply reply; - int err; - - signal(SIGWINCH, SIG_IGN); - while (1) { - err = read(aio_req_fd_r, &req, sizeof(req)); - if (err != sizeof(req)) { - if (err < 0) - printk(UM_KERN_ERR "not_aio_thread - " - "read failed, fd = %d, err = %d\n", - aio_req_fd_r, - errno); - else { - printk(UM_KERN_ERR "not_aio_thread - short " - "read, fd = %d, length = %d\n", - aio_req_fd_r, err); - } - continue; - } - err = do_not_aio(&req); - reply = ((struct aio_thread_reply) { .data = req.aio, - .err = err }); - err = write(req.aio->reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) - printk(UM_KERN_ERR "not_aio_thread - write failed, " - "fd = %d, err = %d\n", req.aio->reply_fd, errno); - } - - return 0; -} - -static int init_aio_24(void) -{ - int fds[2], err; - - err = os_pipe(fds, 1, 1); - if (err) - goto out; - - aio_req_fd_w = fds[0]; - aio_req_fd_r = fds[1]; - - err = os_set_fd_block(aio_req_fd_w, 0); - if (err) - goto out_close_pipe; - - err = run_helper_thread(not_aio_thread, NULL, - CLONE_FILES | CLONE_VM, &aio_stack); - if (err < 0) - goto out_close_pipe; - - aio_pid = err; - goto out; - -out_close_pipe: - close(fds[0]); - close(fds[1]); - aio_req_fd_w = -1; - aio_req_fd_r = -1; -out: -#ifndef HAVE_AIO_ABI - printk(UM_KERN_INFO "/usr/include/linux/aio_abi.h not present during " - "build\n"); -#endif - printk(UM_KERN_INFO "2.6 host AIO support not used - falling back to " - "I/O thread\n"); - return 0; -} - -#ifdef HAVE_AIO_ABI -#define DEFAULT_24_AIO 0 -static int init_aio_26(void) -{ - int err; - - if (io_setup(256, &ctx)) { - err = -errno; - printk(UM_KERN_ERR "aio_thread failed to initialize context, " - "err = %d\n", errno); - return err; - } - - err = run_helper_thread(aio_thread, NULL, - CLONE_FILES | CLONE_VM, &aio_stack); - if (err < 0) - return err; - - aio_pid = err; - - printk(UM_KERN_INFO "Using 2.6 host AIO\n"); - return 0; -} - -static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - struct aio_thread_reply reply; - int err; - - err = do_aio(ctx, type, io_fd, buf, len, offset, aio); - if (err) { - reply = ((struct aio_thread_reply) { .data = aio, - .err = err }); - err = write(aio->reply_fd, &reply, sizeof(reply)); - if (err != sizeof(reply)) { - err = -errno; - printk(UM_KERN_ERR "submit_aio_26 - write failed, " - "fd = %d, err = %d\n", aio->reply_fd, -err); - } - else err = 0; - } - - return err; -} - -#else -#define DEFAULT_24_AIO 1 -static int init_aio_26(void) -{ - return -ENOSYS; -} - -static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - return -ENOSYS; -} -#endif - -/* Initialized in an initcall and unchanged thereafter */ -static int aio_24 = DEFAULT_24_AIO; - -static int __init set_aio_24(char *name, int *add) -{ - aio_24 = 1; - return 0; -} - -__uml_setup("aio=2.4", set_aio_24, -"aio=2.4\n" -" This is used to force UML to use 2.4-style AIO even when 2.6 AIO is\n" -" available. 2.4 AIO is a single thread that handles one request at a\n" -" time, synchronously. 2.6 AIO is a thread which uses the 2.6 AIO \n" -" interface to handle an arbitrary number of pending requests. 2.6 AIO \n" -" is not available in tt mode, on 2.4 hosts, or when UML is built with\n" -" /usr/include/linux/aio_abi.h not available. Many distributions don't\n" -" include aio_abi.h, so you will need to copy it from a kernel tree to\n" -" your /usr/include/linux in order to build an AIO-capable UML\n\n" -); - -static int init_aio(void) -{ - int err; - - if (!aio_24) { - err = init_aio_26(); - if (err && (errno == ENOSYS)) { - printk(UM_KERN_INFO "2.6 AIO not supported on the " - "host - reverting to 2.4 AIO\n"); - aio_24 = 1; - } - else return err; - } - - if (aio_24) - return init_aio_24(); - - return 0; -} - -/* - * The reason for the __initcall/__uml_exitcall asymmetry is that init_aio - * needs to be called when the kernel is running because it calls run_helper, - * which needs get_free_page. exit_aio is a __uml_exitcall because the generic - * kernel does not run __exitcalls on shutdown, and can't because many of them - * break when called outside of module unloading. - */ -__initcall(init_aio); - -static void exit_aio(void) -{ - if (aio_pid != -1) { - os_kill_process(aio_pid, 1); - free_stack(aio_stack, 0); - } -} - -__uml_exitcall(exit_aio); - -static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, struct aio_context *aio) -{ - struct aio_thread_req req = { .type = type, - .io_fd = io_fd, - .offset = offset, - .buf = buf, - .len = len, - .aio = aio, - }; - int err; - - err = write(aio_req_fd_w, &req, sizeof(req)); - if (err == sizeof(req)) - err = 0; - else err = -errno; - - return err; -} - -int submit_aio(enum aio_type type, int io_fd, char *buf, int len, - unsigned long long offset, int reply_fd, - struct aio_context *aio) -{ - aio->reply_fd = reply_fd; - if (aio_24) - return submit_aio_24(type, io_fd, buf, len, offset, aio); - else - return submit_aio_26(type, io_fd, buf, len, offset, aio); -} diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile deleted file mode 100644 index 6c546dc9222b..000000000000 --- a/arch/um/os-Linux/drivers/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# -# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) -# Licensed under the GPL -# - -ethertap-objs := ethertap_kern.o ethertap_user.o -tuntap-objs := tuntap_kern.o tuntap_user.o - -obj-y = -obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o -obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o - -include arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/drivers/etap.h b/arch/um/os-Linux/drivers/etap.h deleted file mode 100644 index 54183a679fdd..000000000000 --- a/arch/um/os-Linux/drivers/etap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __DRIVERS_ETAP_H -#define __DRIVERS_ETAP_H - -#include <net_user.h> - -struct ethertap_data { - char *dev_name; - char *gate_addr; - int data_fd; - int control_fd; - void *dev; -}; - -extern const struct net_user_info ethertap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c deleted file mode 100644 index f424600a583f..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_kern.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include "etap.h" -#include <net_kern.h> - -struct ethertap_init { - char *dev_name; - char *gate_addr; -}; - -static void etap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct ethertap_data *epri; - struct ethertap_init *init = data; - - pri = netdev_priv(dev); - epri = (struct ethertap_data *) pri->user; - epri->dev_name = init->dev_name; - epri->gate_addr = init->gate_addr; - epri->data_fd = -1; - epri->control_fd = -1; - epri->dev = dev; - - printk(KERN_INFO "ethertap backend - %s", epri->dev_name); - if (epri->gate_addr != NULL) - printk(KERN_CONT ", IP = %s", epri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int etap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - int len; - - len = net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + 2 + ETH_HEADER_ETHERTAP); - if (len <= 0) - return(len); - - skb_pull(skb, 2); - len -= 2; - return len; -} - -static int etap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - skb_push(skb, 2); - return net_send(fd, skb->data, skb->len); -} - -const struct net_kern_info ethertap_kern_info = { - .init = etap_init, - .protocol = eth_protocol, - .read = etap_read, - .write = etap_write, -}; - -int ethertap_setup(char *str, char **mac_out, void *data) -{ - struct ethertap_init *init = data; - - *init = ((struct ethertap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "ethertap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - if (init->dev_name == NULL) { - printk(KERN_ERR "ethertap_setup : Missing tap device name\n"); - return 0; - } - - return 1; -} - -static struct transport ethertap_transport = { - .list = LIST_HEAD_INIT(ethertap_transport.list), - .name = "ethertap", - .setup = ethertap_setup, - .user = ðertap_user_info, - .kern = ðertap_kern_info, - .private_size = sizeof(struct ethertap_data), - .setup_size = sizeof(struct ethertap_init), -}; - -static int register_ethertap(void) -{ - register_transport(ðertap_transport); - return 0; -} - -late_initcall(register_ethertap); diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c deleted file mode 100644 index b39b6696ac58..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_user.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include <stdio.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include "etap.h" -#include <os.h> -#include <net_user.h> -#include <um_malloc.h> - -#define MAX_PACKET ETH_MAX_PACKET - -static int etap_user_init(void *data, void *dev) -{ - struct ethertap_data *pri = data; - - pri->dev = dev; - return 0; -} - -struct addr_change { - enum { ADD_ADDR, DEL_ADDR } what; - unsigned char addr[4]; - unsigned char netmask[4]; -}; - -static void etap_change(int op, unsigned char *addr, unsigned char *netmask, - int fd) -{ - struct addr_change change; - char *output; - int n; - - change.what = op; - memcpy(change.addr, addr, sizeof(change.addr)); - memcpy(change.netmask, netmask, sizeof(change.netmask)); - CATCH_EINTR(n = write(fd, &change, sizeof(change))); - if (n != sizeof(change)) { - printk(UM_KERN_ERR "etap_change - request failed, err = %d\n", - errno); - return; - } - - output = uml_kmalloc(UM_KERN_PAGE_SIZE, UM_GFP_KERNEL); - if (output == NULL) - printk(UM_KERN_ERR "etap_change : Failed to allocate output " - "buffer\n"); - read_output(fd, output, UM_KERN_PAGE_SIZE); - if (output != NULL) { - printk("%s", output); - kfree(output); - } -} - -static void etap_open_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(ADD_ADDR, addr, netmask, *((int *) arg)); -} - -static void etap_close_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(DEL_ADDR, addr, netmask, *((int *) arg)); -} - -struct etap_pre_exec_data { - int control_remote; - int control_me; - int data_me; -}; - -static void etap_pre_exec(void *arg) -{ - struct etap_pre_exec_data *data = arg; - - dup2(data->control_remote, 1); - close(data->data_me); - close(data->control_me); -} - -static int etap_tramp(char *dev, char *gate, int control_me, - int control_remote, int data_me, int data_remote) -{ - struct etap_pre_exec_data pe_data; - int pid, err, n; - char version_buf[sizeof("nnnnn\0")]; - char data_fd_buf[sizeof("nnnnnn\0")]; - char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; - char *setup_args[] = { "uml_net", version_buf, "ethertap", dev, - data_fd_buf, gate_buf, NULL }; - char *nosetup_args[] = { "uml_net", version_buf, "ethertap", - dev, data_fd_buf, NULL }; - char **args, c; - - sprintf(data_fd_buf, "%d", data_remote); - sprintf(version_buf, "%d", UML_NET_VERSION); - if (gate != NULL) { - strcpy(gate_buf, gate); - args = setup_args; - } - else args = nosetup_args; - - err = 0; - pe_data.control_remote = control_remote; - pe_data.control_me = control_me; - pe_data.data_me = data_me; - pid = run_helper(etap_pre_exec, &pe_data, args); - - if (pid < 0) - err = pid; - close(data_remote); - close(control_remote); - CATCH_EINTR(n = read(control_me, &c, sizeof(c))); - if (n != sizeof(c)) { - err = -errno; - printk(UM_KERN_ERR "etap_tramp : read of status failed, " - "err = %d\n", -err); - return err; - } - if (c != 1) { - printk(UM_KERN_ERR "etap_tramp : uml_net failed\n"); - err = helper_wait(pid); - } - return err; -} - -static int etap_open(void *data) -{ - struct ethertap_data *pri = data; - char *output; - int data_fds[2], control_fds[2], err, output_len; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err) - return err; - - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, data_fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "etap_open - data socketpair failed - " - "err = %d\n", errno); - return err; - } - - err = socketpair(AF_UNIX, SOCK_STREAM, 0, control_fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "etap_open - control socketpair failed - " - "err = %d\n", errno); - goto out_close_data; - } - - err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0], - control_fds[1], data_fds[0], data_fds[1]); - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - read_output(control_fds[0], output, output_len); - - if (output == NULL) - printk(UM_KERN_ERR "etap_open : failed to allocate output " - "buffer\n"); - else { - printk("%s", output); - kfree(output); - } - - if (err < 0) { - printk(UM_KERN_ERR "etap_tramp failed - err = %d\n", -err); - goto out_close_control; - } - - pri->data_fd = data_fds[0]; - pri->control_fd = control_fds[0]; - iter_addresses(pri->dev, etap_open_addr, &pri->control_fd); - return data_fds[0]; - -out_close_control: - close(control_fds[0]); - close(control_fds[1]); -out_close_data: - close(data_fds[0]); - close(data_fds[1]); - return err; -} - -static void etap_close(int fd, void *data) -{ - struct ethertap_data *pri = data; - - iter_addresses(pri->dev, etap_close_addr, &pri->control_fd); - close(fd); - - if (shutdown(pri->data_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown data socket failed, " - "errno = %d\n", errno); - - if (shutdown(pri->control_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown control socket " - "failed, errno = %d\n", errno); - - close(pri->data_fd); - pri->data_fd = -1; - close(pri->control_fd); - pri->control_fd = -1; -} - -static void etap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if (pri->control_fd == -1) - return; - etap_open_addr(addr, netmask, &pri->control_fd); -} - -static void etap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - if (pri->control_fd == -1) - return; - - etap_close_addr(addr, netmask, &pri->control_fd); -} - -const struct net_user_info ethertap_user_info = { - .init = etap_user_init, - .open = etap_open, - .close = etap_close, - .remove = NULL, - .add_address = etap_add_addr, - .delete_address = etap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_ETHERTAP, -}; diff --git a/arch/um/os-Linux/drivers/tuntap.h b/arch/um/os-Linux/drivers/tuntap.h deleted file mode 100644 index 7367354ac8df..000000000000 --- a/arch/um/os-Linux/drivers/tuntap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#ifndef __UM_TUNTAP_H -#define __UM_TUNTAP_H - -#include <net_user.h> - -struct tuntap_data { - char *dev_name; - int fixed_config; - char *gate_addr; - int fd; - void *dev; -}; - -extern const struct net_user_info tuntap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c deleted file mode 100644 index d9d56e5810fe..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_kern.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/netdevice.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <asm/errno.h> -#include <net_kern.h> -#include "tuntap.h" - -struct tuntap_init { - char *dev_name; - char *gate_addr; -}; - -static void tuntap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct tuntap_data *tpri; - struct tuntap_init *init = data; - - pri = netdev_priv(dev); - tpri = (struct tuntap_data *) pri->user; - tpri->dev_name = init->dev_name; - tpri->fixed_config = (init->dev_name != NULL); - tpri->gate_addr = init->gate_addr; - tpri->fd = -1; - tpri->dev = dev; - - printk(KERN_INFO "TUN/TAP backend - "); - if (tpri->gate_addr != NULL) - printk(KERN_CONT "IP = %s", tpri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int tuntap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_read(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int tuntap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_write(fd, skb->data, skb->len); -} - -const struct net_kern_info tuntap_kern_info = { - .init = tuntap_init, - .protocol = eth_protocol, - .read = tuntap_read, - .write = tuntap_write, -}; - -int tuntap_setup(char *str, char **mac_out, void *data) -{ - struct tuntap_init *init = data; - - *init = ((struct tuntap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "tuntap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - - return 1; -} - -static struct transport tuntap_transport = { - .list = LIST_HEAD_INIT(tuntap_transport.list), - .name = "tuntap", - .setup = tuntap_setup, - .user = &tuntap_user_info, - .kern = &tuntap_kern_info, - .private_size = sizeof(struct tuntap_data), - .setup_size = sizeof(struct tuntap_init), -}; - -static int register_tuntap(void) -{ - register_transport(&tuntap_transport); - return 0; -} - -late_initcall(register_tuntap); diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c deleted file mode 100644 index 14126d9176aa..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_user.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <stdio.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <linux/if_tun.h> -#include <net/if.h> -#include <sys/ioctl.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include <sys/uio.h> -#include <kern_util.h> -#include <os.h> -#include "tuntap.h" - -static int tuntap_user_init(void *data, void *dev) -{ - struct tuntap_data *pri = data; - - pri->dev = dev; - return 0; -} - -static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if ((pri->fd == -1) || pri->fixed_config) - return; - open_addr(addr, netmask, pri->dev_name); -} - -static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - if ((pri->fd == -1) || pri->fixed_config) - return; - close_addr(addr, netmask, pri->dev_name); -} - -struct tuntap_pre_exec_data { - int stdout; - int close_me; -}; - -static void tuntap_pre_exec(void *arg) -{ - struct tuntap_pre_exec_data *data = arg; - - dup2(data->stdout, 1); - close(data->close_me); -} - -static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote, - char *buffer, int buffer_len, int *used_out) -{ - struct tuntap_pre_exec_data data; - char version_buf[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate, - NULL }; - char buf[CMSG_SPACE(sizeof(*fd_out))]; - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int pid, n, err; - - sprintf(version_buf, "%d", UML_NET_VERSION); - - data.stdout = remote; - data.close_me = me; - - pid = run_helper(tuntap_pre_exec, &data, argv); - - if (pid < 0) - return -pid; - - close(remote); - - msg.msg_name = NULL; - msg.msg_namelen = 0; - if (buffer != NULL) { - iov = ((struct iovec) { buffer, buffer_len }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - } - else { - msg.msg_iov = NULL; - msg.msg_iovlen = 0; - } - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; - n = recvmsg(me, &msg, 0); - *used_out = n; - if (n < 0) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open_tramp : recvmsg failed - " - "errno = %d\n", errno); - return err; - } - helper_wait(pid); - - cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "message\n"); - return -EINVAL; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "descriptor\n"); - return -EINVAL; - } - *fd_out = ((int *) CMSG_DATA(cmsg))[0]; - os_set_exec_close(*fd_out); - return 0; -} - -static int tuntap_open(void *data) -{ - struct ifreq ifr; - struct tuntap_data *pri = data; - char *output, *buffer; - int err, fds[2], len, used; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err < 0) - return err; - - if (pri->fixed_config) { - pri->fd = os_open_file("/dev/net/tun", - of_cloexec(of_rdwr(OPENFLAGS())), 0); - if (pri->fd < 0) { - printk(UM_KERN_ERR "Failed to open /dev/net/tun, " - "err = %d\n", -pri->fd); - return pri->fd; - } - memset(&ifr, 0, sizeof(ifr)); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); - if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) { - err = -errno; - printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n", - errno); - close(pri->fd); - return err; - } - } - else { - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open : socketpair failed - " - "errno = %d\n", errno); - return err; - } - - buffer = get_output_buffer(&len); - if (buffer != NULL) - len--; - used = 0; - - err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0], - fds[1], buffer, len, &used); - - output = buffer; - if (err < 0) { - printk("%s", output); - free_output_buffer(buffer); - printk(UM_KERN_ERR "tuntap_open_tramp failed - " - "err = %d\n", -err); - return err; - } - - pri->dev_name = uml_strdup(buffer); - output += IFNAMSIZ; - printk("%s", output); - free_output_buffer(buffer); - - close(fds[0]); - iter_addresses(pri->dev, open_addr, pri->dev_name); - } - - return pri->fd; -} - -static void tuntap_close(int fd, void *data) -{ - struct tuntap_data *pri = data; - - if (!pri->fixed_config) - iter_addresses(pri->dev, close_addr, pri->dev_name); - close(fd); - pri->fd = -1; -} - -const struct net_user_info tuntap_user_info = { - .init = tuntap_user_init, - .open = tuntap_open, - .close = tuntap_close, - .remove = NULL, - .add_address = tuntap_add_addr, - .delete_address = tuntap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/os-Linux/elf_aux.c b/arch/um/os-Linux/elf_aux.c index 1a365ddc4d02..72f416edf252 100644 --- a/arch/um/os-Linux/elf_aux.c +++ b/arch/um/os-Linux/elf_aux.c @@ -1,7 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * arch/um/kernel/elf_aux.c * - * Scan the Elf auxiliary vector provided by the host to extract + * Scan the ELF auxiliary vector provided by the host to extract * information about vsyscall-page, etc. * * Copyright (C) 2004 Fujitsu Siemens Computers GmbH @@ -12,37 +13,27 @@ #include <init.h> #include <elf_user.h> #include <mem_user.h> +#include "internal.h" +#include <linux/swab.h> +#if __BITS_PER_LONG == 64 +typedef Elf64_auxv_t elf_auxv_t; +#else typedef Elf32_auxv_t elf_auxv_t; +#endif /* These are initialized very early in boot and never changed */ char * elf_aux_platform; -extern long elf_aux_hwcap; -unsigned long vsyscall_ehdr; -unsigned long vsyscall_end; -unsigned long __kernel_vsyscall; +long elf_aux_hwcap; __init void scan_elf_aux( char **envp) { - long page_size = 0; elf_auxv_t * auxv; while ( *envp++ != NULL) ; for ( auxv = (elf_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) { switch ( auxv->a_type ) { - case AT_SYSINFO: - __kernel_vsyscall = auxv->a_un.a_val; - /* See if the page is under TASK_SIZE */ - if (__kernel_vsyscall < (unsigned long) envp) - __kernel_vsyscall = 0; - break; - case AT_SYSINFO_EHDR: - vsyscall_ehdr = auxv->a_un.a_val; - /* See if the page is under TASK_SIZE */ - if (vsyscall_ehdr < (unsigned long) envp) - vsyscall_ehdr = 0; - break; case AT_HWCAP: elf_aux_hwcap = auxv->a_un.a_val; break; @@ -54,20 +45,6 @@ __init void scan_elf_aux( char **envp) elf_aux_platform = (char *) (long) auxv->a_un.a_val; break; - case AT_PAGESZ: - page_size = auxv->a_un.a_val; - break; } } - if ( ! __kernel_vsyscall || ! vsyscall_ehdr || - ! elf_aux_hwcap || ! elf_aux_platform || - ! page_size || (vsyscall_ehdr % page_size) ) { - __kernel_vsyscall = 0; - vsyscall_ehdr = 0; - elf_aux_hwcap = 0; - elf_aux_platform = "i586"; - } - else { - vsyscall_end = vsyscall_ehdr + page_size; - } } diff --git a/arch/um/os-Linux/execvp.c b/arch/um/os-Linux/execvp.c index 8fb25ca07c46..c09a5fd5e225 100644 --- a/arch/um/os-Linux/execvp.c +++ b/arch/um/os-Linux/execvp.c @@ -93,6 +93,7 @@ int execvp_noalloc(char *buf, const char *file, char *const argv[]) up finding no executable we can use, we want to diagnose that we did find one but were denied access. */ got_eacces = 1; + break; case ENOENT: case ESTALE: case ENOTDIR: @@ -136,7 +137,7 @@ int main(int argc, char**argv) int ret; argc--; if (!argc) { - fprintf(stderr, "Not enough arguments\n"); + os_warn("Not enough arguments\n"); return 1; } argv++; diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index c17bd6f7d674..21f0e50fb1df 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -1,18 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> #include <unistd.h> +#include <stdlib.h> +#include <string.h> #include <errno.h> #include <fcntl.h> #include <signal.h> +#include <linux/falloc.h> #include <sys/ioctl.h> #include <sys/mount.h> #include <sys/socket.h> #include <sys/stat.h> +#include <sys/sysmacros.h> #include <sys/un.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/eventfd.h> +#include <poll.h> #include <os.h> static void copy_stat(struct uml_stat *dst, const struct stat64 *src) @@ -98,21 +106,6 @@ int os_get_ifname(int fd, char* namebuf) return 0; } -int os_set_slip(int fd) -{ - int disc, sencap; - - disc = N_SLIP; - if (ioctl(fd, TIOCSETD, &disc) < 0) - return -errno; - - sencap = 0; - if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0) - return -errno; - - return 0; -} - int os_mode_fd(int fd, int mode) { int err; @@ -233,6 +226,16 @@ out: return err; } +int os_dup_file(int fd) +{ + int new_fd = dup(fd); + + if (new_fd < 0) + return -errno; + + return new_fd; +} + void os_close_file(int fd) { close(fd); @@ -257,6 +260,15 @@ int os_read_file(int fd, void *buf, int len) return n; } +int os_pread_file(int fd, void *buf, int len, unsigned long long offset) +{ + int n = pread(fd, buf, len, offset); + + if (n < 0) + return -errno; + return n; +} + int os_write_file(int fd, const void *buf, int len) { int n = write(fd, (void *) buf, len); @@ -266,6 +278,25 @@ int os_write_file(int fd, const void *buf, int len) return n; } +int os_sync_file(int fd) +{ + int n = fdatasync(fd); + + if (n < 0) + return -errno; + return n; +} + +int os_pwrite_file(int fd, const void *buf, int len, unsigned long long offset) +{ + int n = pwrite(fd, (void *) buf, len, offset); + + if (n < 0) + return -errno; + return n; +} + + int os_file_size(const char *file, unsigned long long *size_out) { struct uml_stat buf; @@ -304,7 +335,7 @@ int os_file_size(const char *file, unsigned long long *size_out) return 0; } -int os_file_modtime(const char *file, unsigned long *modtime) +int os_file_modtime(const char *file, long long *modtime) { struct uml_stat buf; int err; @@ -461,44 +492,51 @@ int os_shutdown_socket(int fd, int r, int w) return 0; } -int os_rcv_fd(int fd, int *helper_pid_out) +/** + * os_rcv_fd_msg - receive message with (optional) FDs + * @fd: the FD to receive from + * @fds: the array for FDs to write to + * @n_fds: number of FDs to receive (@fds array size) + * @data: the message buffer + * @data_len: the size of the message to receive + * + * Receive a message with FDs. + * + * Returns: the size of the received message, or an error code + */ +ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, + void *data, size_t data_len) { - int new, n; - char buf[CMSG_SPACE(sizeof(new))]; - struct msghdr msg; +#define MAX_RCV_FDS 2 + char buf[CMSG_SPACE(sizeof(*fds) * MAX_RCV_FDS)]; struct cmsghdr *cmsg; - struct iovec iov; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - iov = ((struct iovec) { .iov_base = helper_pid_out, - .iov_len = sizeof(*helper_pid_out) }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; + struct iovec iov = { + .iov_base = data, + .iov_len = data_len, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = CMSG_SPACE(sizeof(*fds) * n_fds), + }; + int n; + + if (n_fds > MAX_RCV_FDS) + return -EINVAL; n = recvmsg(fd, &msg, 0); if (n < 0) return -errno; - else if (n != iov.iov_len) - *helper_pid_out = -1; cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "rcv_fd didn't receive anything, " - "error = %d\n", errno); - return -1; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "rcv_fd didn't receive a descriptor\n"); - return -1; - } + if (!cmsg || + cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return n; - new = ((int *) CMSG_DATA(cmsg))[0]; - return new; + memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0)); + return n; } int os_create_unix_socket(const char *file, int len, int close_on_exec) @@ -574,3 +612,115 @@ unsigned long long os_makedev(unsigned major, unsigned minor) { return makedev(major, minor); } + +int os_falloc_punch(int fd, unsigned long long offset, int len) +{ + int n = fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, offset, len); + + if (n < 0) + return -errno; + return n; +} + +int os_falloc_zeroes(int fd, unsigned long long offset, int len) +{ + int n = fallocate(fd, FALLOC_FL_ZERO_RANGE|FALLOC_FL_KEEP_SIZE, offset, len); + + if (n < 0) + return -errno; + return n; +} + +int os_eventfd(unsigned int initval, int flags) +{ + int fd = eventfd(initval, flags); + + if (fd < 0) + return -errno; + return fd; +} + +int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, + unsigned int fds_num) +{ + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = len, + }; + union { + char control[CMSG_SPACE(sizeof(*fds) * OS_SENDMSG_MAX_FDS)]; + struct cmsghdr align; + } u; + unsigned int fds_size = sizeof(*fds) * fds_num; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = u.control, + .msg_controllen = CMSG_SPACE(fds_size), + }; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + int err; + + if (fds_num > OS_SENDMSG_MAX_FDS) + return -EINVAL; + memset(u.control, 0, sizeof(u.control)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fds_size); + memcpy(CMSG_DATA(cmsg), fds, fds_size); + err = sendmsg(fd, &msg, 0); + + if (err < 0) + return -errno; + return err; +} + +int os_poll(unsigned int n, const int *fds) +{ + /* currently need 2 FDs at most so avoid dynamic allocation */ + struct pollfd pollfds[2] = {}; + unsigned int i; + int ret; + + if (n > ARRAY_SIZE(pollfds)) + return -EINVAL; + + for (i = 0; i < n; i++) { + pollfds[i].fd = fds[i]; + pollfds[i].events = POLLIN; + } + + ret = poll(pollfds, n, -1); + if (ret < 0) + return -errno; + + /* Return the index of the available FD */ + for (i = 0; i < n; i++) { + if (pollfds[i].revents) + return i; + } + + return -EIO; +} + +void *os_mmap_rw_shared(int fd, size_t size) +{ + void *res = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (res == MAP_FAILED) + return NULL; + + return res; +} + +void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size) +{ + void *res; + + res = mremap(old_addr, old_size, new_size, MREMAP_MAYMOVE, NULL); + + if (res == MAP_FAILED) + return NULL; + + return res; +} diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index e3ee4a51ef63..89c2ad2a4e3a 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -1,12 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> +#include <string.h> #include <unistd.h> #include <errno.h> #include <sched.h> +#include <pthread.h> #include <linux/limits.h> #include <sys/socket.h> #include <sys/wait.h> @@ -45,7 +47,7 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) unsigned long stack, sp; int pid, fds[2], ret, n; - stack = alloc_stack(0, __cant_sleep()); + stack = alloc_stack(0, __uml_cant_sleep()); if (stack == 0) return -ENOMEM; @@ -64,12 +66,12 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) goto out_close; } - sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *); + sp = stack + UM_KERN_PAGE_SIZE; data.pre_exec = pre_exec; data.pre_data = pre_data; data.argv = argv; data.fd = fds[1]; - data.buf = __cant_sleep() ? uml_kmalloc(PATH_MAX, UM_GFP_ATOMIC) : + data.buf = __uml_cant_sleep() ? uml_kmalloc(PATH_MAX, UM_GFP_ATOMIC) : uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); pid = clone(helper_child, (void *) sp, CLONE_VM, &data); if (pid < 0) { @@ -96,9 +98,13 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) "ret = %d\n", -n); ret = n; } - CATCH_EINTR(waitpid(pid, NULL, __WCLONE)); + CATCH_EINTR(waitpid(pid, NULL, __WALL)); } + if (ret < 0) + printk(UM_KERN_ERR "run_helper : failed to exec %s on host: %s\n", + argv[0], strerror(-ret)); + out_free2: kfree(data.buf); out_close: @@ -116,11 +122,15 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long stack, sp; int pid, status, err; - stack = alloc_stack(0, __cant_sleep()); + /* To share memory space, use os_run_helper_thread() instead. */ + if (flags & CLONE_VM) + return -EINVAL; + + stack = alloc_stack(0, __uml_cant_sleep()); if (stack == 0) return -ENOMEM; - sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *); + sp = stack + UM_KERN_PAGE_SIZE; pid = clone(proc, (void *) sp, flags, arg); if (pid < 0) { err = -errno; @@ -129,7 +139,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, return err; } if (stack_out == NULL) { - CATCH_EINTR(pid = waitpid(pid, &status, __WCLONE)); + CATCH_EINTR(pid = waitpid(pid, &status, __WALL)); if (pid < 0) { err = -errno; printk(UM_KERN_ERR "run_helper_thread - wait failed, " @@ -148,7 +158,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, int helper_wait(int pid) { int ret, status; - int wflags = __WCLONE; + int wflags = __WALL; CATCH_EINTR(ret = waitpid(pid, &status, wflags)); if (ret < 0) { @@ -162,3 +172,65 @@ int helper_wait(int pid) } else return 0; } + +struct os_helper_thread { + pthread_t handle; +}; + +int os_run_helper_thread(struct os_helper_thread **td_out, + void *(*routine)(void *), void *arg) +{ + struct os_helper_thread *td; + sigset_t sigset, oset; + int err, flags; + + flags = __uml_cant_sleep() ? UM_GFP_ATOMIC : UM_GFP_KERNEL; + td = uml_kmalloc(sizeof(*td), flags); + if (!td) + return -ENOMEM; + + sigfillset(&sigset); + if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) { + err = -errno; + kfree(td); + return err; + } + + err = pthread_create(&td->handle, NULL, routine, arg); + + if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0) + panic("Failed to restore the signal mask: %d", errno); + + if (err != 0) + kfree(td); + else + *td_out = td; + + return -err; +} + +void os_kill_helper_thread(struct os_helper_thread *td) +{ + pthread_cancel(td->handle); + pthread_join(td->handle, NULL); + kfree(td); +} + +void os_fix_helper_thread_signals(void) +{ + sigset_t sigset; + + sigemptyset(&sigset); + + sigaddset(&sigset, SIGWINCH); + sigaddset(&sigset, SIGPIPE); + sigaddset(&sigset, SIGPROF); + sigaddset(&sigset, SIGINT); + sigaddset(&sigset, SIGTERM); + sigaddset(&sigset, SIGCHLD); + sigaddset(&sigset, SIGALRM); + sigaddset(&sigset, SIGIO); + sigaddset(&sigset, SIGUSR1); + + pthread_sigmask(SIG_SETMASK, &sigset, NULL); +} diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h index 0dc2c9f135f6..bac9fcc8c14c 100644 --- a/arch/um/os-Linux/internal.h +++ b/arch/um/os-Linux/internal.h @@ -1 +1,36 @@ -void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc); +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_OS_LINUX_INTERNAL_H +#define __UM_OS_LINUX_INTERNAL_H + +#include <mm_id.h> +#include <stub-data.h> +#include <signal.h> + +/* + * elf_aux.c + */ +void scan_elf_aux(char **envp); + +/* + * mem.c + */ +void check_tmpexec(void); + +/* + * signal.c + */ +extern __thread int signals_enabled; +int timer_alarm_pending(void); + +/* + * skas/process.c + */ +void wait_stub_done(int pid); +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys); + +/* + * smp.c + */ +#define IPI_SIGNAL SIGRTMIN + +#endif /* __UM_OS_LINUX_INTERNAL_H */ diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c index b9afb74b79ad..cf7e49c08b21 100644 --- a/arch/um/os-Linux/irq.c +++ b/arch/um/os-Linux/irq.c @@ -1,135 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2017 - Cambridge Greys Ltd + * Copyright (C) 2011 - 2014 Cisco Systems Inc * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> #include <errno.h> -#include <poll.h> +#include <sys/epoll.h> #include <signal.h> #include <string.h> #include <irq_user.h> #include <os.h> #include <um_malloc.h> +/* Epoll support */ + +static int epollfd = -1; + +#define MAX_EPOLL_EVENTS 64 + +static struct epoll_event epoll_events[MAX_EPOLL_EVENTS]; + +/* Helper to return an Epoll data pointer from an epoll event structure. + * We need to keep this one on the userspace side to keep includes separate + */ + +void *os_epoll_get_data_pointer(int index) +{ + return epoll_events[index].data.ptr; +} + +/* Helper to compare events versus the events in the epoll structure. + * Same as above - needs to be on the userspace side + */ + + +int os_epoll_triggered(int index, int events) +{ + return epoll_events[index].events & events; +} +/* Helper to set the event mask. + * The event mask is opaque to the kernel side, because it does not have + * access to the right includes/defines for EPOLL constants. + */ + +int os_event_mask(enum um_irq_type irq_type) +{ + if (irq_type == IRQ_READ) + return EPOLLIN | EPOLLPRI | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if (irq_type == IRQ_WRITE) + return EPOLLOUT; + return 0; +} + /* - * Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd - * and os_free_irq_by_cb, which are called under irq_lock. + * Initial Epoll Setup */ -static struct pollfd *pollfds = NULL; -static int pollfds_num = 0; -static int pollfds_size = 0; +int os_setup_epoll(void) +{ + epollfd = epoll_create(MAX_EPOLL_EVENTS); + return epollfd; +} -int os_waiting_for_events(struct irq_fd *active_fds) +/* + * Helper to run the actual epoll_wait + */ +int os_waiting_for_events_epoll(void) { - struct irq_fd *irq_fd; - int i, n, err; + int n, err; - n = poll(pollfds, pollfds_num, 0); + n = epoll_wait(epollfd, + (struct epoll_event *) &epoll_events, MAX_EPOLL_EVENTS, 0); if (n < 0) { err = -errno; if (errno != EINTR) - printk(UM_KERN_ERR "os_waiting_for_events:" - " poll returned %d, errno = %d\n", n, errno); + printk( + UM_KERN_ERR "os_waiting_for_events:" + " epoll returned %d, error = %s\n", n, + strerror(errno) + ); return err; } - - if (n == 0) - return 0; - - irq_fd = active_fds; - - for (i = 0; i < pollfds_num; i++) { - if (pollfds[i].revents != 0) { - irq_fd->current_events = pollfds[i].revents; - pollfds[i].fd = -1; - } - irq_fd = irq_fd->next; - } return n; } -int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds) -{ - if (pollfds_num == pollfds_size) { - if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) { - /* return min size needed for new pollfds area */ - return (pollfds_size + 1) * sizeof(pollfds[0]); - } - - if (pollfds != NULL) { - memcpy(tmp_pfd, pollfds, - sizeof(pollfds[0]) * pollfds_size); - /* remove old pollfds */ - kfree(pollfds); - } - pollfds = tmp_pfd; - pollfds_size++; - } else - kfree(tmp_pfd); /* remove not used tmp_pfd */ - - pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, - .events = events, - .revents = 0 }); - pollfds_num++; - - return 0; -} -void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, - struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2) +/* + * Helper to add a fd to epoll + */ +int os_add_epoll_fd(int events, int fd, void *data) { - struct irq_fd **prev; - int i = 0; - - prev = &active_fds; - while (*prev != NULL) { - if ((*test)(*prev, arg)) { - struct irq_fd *old_fd = *prev; - if ((pollfds[i].fd != -1) && - (pollfds[i].fd != (*prev)->fd)) { - printk(UM_KERN_ERR "os_free_irq_by_cb - " - "mismatch between active_fds and " - "pollfds, fd %d vs %d\n", - (*prev)->fd, pollfds[i].fd); - goto out; - } - - pollfds_num--; - - /* - * This moves the *whole* array after pollfds[i] - * (though it doesn't spot as such)! - */ - memmove(&pollfds[i], &pollfds[i + 1], - (pollfds_num - i) * sizeof(pollfds[0])); - if (*last_irq_ptr2 == &old_fd->next) - *last_irq_ptr2 = prev; - - *prev = (*prev)->next; - if (old_fd->type == IRQ_WRITE) - ignore_sigio_fd(old_fd->fd); - kfree(old_fd); - continue; - } - prev = &(*prev)->next; - i++; - } - out: - return; + struct epoll_event event; + int result; + + event.data.ptr = data; + event.events = events | EPOLLET; + result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event); + if ((result) && (errno == EEXIST)) + result = os_mod_epoll_fd(events, fd, data); + if (result) + printk("epollctl add err fd %d, %s\n", fd, strerror(errno)); + return result; } -int os_get_pollfd(int i) +/* + * Helper to mod the fd event mask and/or data backreference + */ +int os_mod_epoll_fd(int events, int fd, void *data) { - return pollfds[i].fd; + struct epoll_event event; + int result; + + event.data.ptr = data; + event.events = events; + result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event); + if (result) + printk(UM_KERN_ERR + "epollctl mod err fd %d, %s\n", fd, strerror(errno)); + return result; } -void os_set_pollfd(int i, int fd) +/* + * Helper to delete the epoll fd + */ +int os_del_epoll_fd(int fd) { - pollfds[i].fd = fd; + struct epoll_event event; + /* This is quiet as we use this as IO ON/OFF - so it is often + * invoked on a non-existent fd + */ + return epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event); } void os_set_ioignore(void) { signal(SIGIO, SIG_IGN); } + +void os_close_epoll_fd(void) +{ + /* Needed so we do not leak an fd when rebooting */ + os_close_file(epollfd); +} diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 749c96da7b99..7e114862a723 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -10,19 +11,17 @@ #include <signal.h> #include <string.h> #include <sys/resource.h> +#include <sys/personality.h> #include <as-layout.h> #include <init.h> #include <kern_util.h> #include <os.h> #include <um_malloc.h> +#include "internal.h" -#define PGD_BOUND (4 * 1024 * 1024) #define STACKSIZE (8 * 1024 * 1024) -#define THREAD_NAME_LEN (256) -long elf_aux_hwcap; - -static void set_stklim(void) +static void __init set_stklim(void) { struct rlimit lim; @@ -39,24 +38,13 @@ static void set_stklim(void) } } -static __init void do_uml_initcalls(void) -{ - initcall_t *call; - - call = &__uml_initcall_start; - while (call < &__uml_initcall_end) { - (*call)(); - call++; - } -} - static void last_ditch_exit(int sig) { uml_cleanup(); exit(1); } -static void install_fatal_handler(int sig) +static void __init install_fatal_handler(int sig) { struct sigaction action; @@ -73,15 +61,15 @@ static void install_fatal_handler(int sig) action.sa_restorer = NULL; action.sa_handler = last_ditch_exit; if (sigaction(sig, &action, NULL) < 0) { - printf("failed to install handler for signal %d - errno = %d\n", - sig, errno); + os_warn("failed to install handler for signal %d " + "- errno = %d\n", sig, errno); exit(1); } } #define UML_LIB_PATH ":" OS_LIB_PATH "/uml" -static void setup_env_path(void) +static void __init setup_env_path(void) { char *new_path = NULL; char *old_path = NULL; @@ -112,17 +100,32 @@ static void setup_env_path(void) } } -extern void scan_elf_aux( char **envp); - int __init main(int argc, char **argv, char **envp) { char **new_argv; int ret, i, err; + /* Disable randomization and re-exec if it was changed successfully */ + ret = personality(PER_LINUX | ADDR_NO_RANDOMIZE); + if (ret >= 0 && (ret & (PER_LINUX | ADDR_NO_RANDOMIZE)) != + (PER_LINUX | ADDR_NO_RANDOMIZE)) { + char buf[4096] = {}; + ssize_t ret; + + ret = readlink("/proc/self/exe", buf, sizeof(buf)); + if (ret < 0 || ret >= sizeof(buf)) { + perror("readlink failure"); + exit(1); + } + execve(buf, argv, envp); + } + set_stklim(); setup_env_path(); + setsid(); + new_argv = malloc((argc + 1) * sizeof(char *)); if (new_argv == NULL) { perror("Mallocing argv"); @@ -144,12 +147,10 @@ int __init main(int argc, char **argv, char **envp) install_fatal_handler(SIGINT); install_fatal_handler(SIGTERM); -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA scan_elf_aux(envp); -#endif - do_uml_initcalls(); - ret = linux_main(argc, argv); + change_sig(SIGPIPE, 0); + ret = linux_main(argc, argv, envp); /* * Disable SIGPROF - I have no idea why libc doesn't do this or turn @@ -160,18 +161,18 @@ int __init main(int argc, char **argv, char **envp) /* * This signal stuff used to be in the reboot case. However, - * sometimes a SIGVTALRM can come in when we're halting (reproducably + * sometimes a timer signal can come in when we're halting (reproducably * when writing out gcov information, presumably because that takes * some time) and cause a segfault. */ - /* stop timers and set SIGVTALRM to be ignored */ - disable_timer(); + /* stop timers and set timer signal to be ignored */ + os_timer_disable(0); /* disable SIGIO for the fds and set SIGIO to be ignored */ err = deactivate_all_fds(); if (err) - printf("deactivate_all_fds failed, errno = %d\n", -err); + os_warn("deactivate_all_fds failed, errno = %d\n", -err); /* * Let any pending signals fire now. This ensures @@ -180,18 +181,23 @@ int __init main(int argc, char **argv, char **envp) */ unblock_signals(); + os_info("\n"); /* Reboot */ if (ret) { - printf("\n"); execvp(new_argv[0], new_argv); perror("Failed to exec kernel"); ret = 1; } - printf("\n"); return uml_exitcode; } extern void *__real_malloc(int); +extern void __real_free(void *); + +/* workaround for -Wmissing-prototypes warnings */ +void *__wrap_malloc(int size); +void *__wrap_calloc(int n, int size); +void __wrap_free(void *ptr); void *__wrap_malloc(int size) { @@ -224,10 +230,6 @@ void *__wrap_calloc(int n, int size) return ptr; } -extern void __real_free(void *); - -extern unsigned long high_physmem; - void __wrap_free(void *ptr) { unsigned long addr = (unsigned long) ptr; diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index ba4398056fe9..72f302f4d197 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -12,189 +12,164 @@ #include <string.h> #include <sys/stat.h> #include <sys/mman.h> -#include <sys/param.h> +#include <sys/vfs.h> +#include <linux/magic.h> #include <init.h> +#include <kern_util.h> #include <os.h> - -/* Modified by which_tmpdir, which is called during early boot */ -static char *default_tmpdir = "/tmp"; +#include "internal.h" /* - * Modified when creating the physical memory file and when checking - * the tmp filesystem for usability, both happening during early boot. + * kasan_map_memory - maps memory from @start with a size of @len. + * The allocated memory is filled with zeroes upon success. + * @start: the start address of the memory to be mapped + * @len: the length of the memory to be mapped + * + * This function is used to map shadow memory for KASAN in uml */ -static char *tempdir = NULL; - -static void __init find_tempdir(void) +void kasan_map_memory(void *start, size_t len) { - const char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL }; - int i; - char *dir = NULL; + if (mmap(start, + len, + PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE|MAP_NORESERVE, + -1, + 0) == MAP_FAILED) { + os_info("Couldn't allocate shadow memory: %s\n.", + strerror(errno)); + exit(1); + } - if (tempdir != NULL) - /* We've already been called */ - return; - for (i = 0; dirs[i]; i++) { - dir = getenv(dirs[i]); - if ((dir != NULL) && (*dir != '\0')) - break; + if (madvise(start, len, MADV_DONTDUMP)) { + os_info("Couldn't set MAD_DONTDUMP on shadow memory: %s\n.", + strerror(errno)); + exit(1); } - if ((dir == NULL) || (*dir == '\0')) - dir = default_tmpdir; - tempdir = malloc(strlen(dir) + 2); - if (tempdir == NULL) { - fprintf(stderr, "Failed to malloc tempdir, " - "errno = %d\n", errno); - return; + if (madvise(start, len, MADV_DONTFORK)) { + os_info("Couldn't set MADV_DONTFORK on shadow memory: %s\n.", + strerror(errno)); + exit(1); } - strcpy(tempdir, dir); - strcat(tempdir, "/"); } -/* - * This will return 1, with the first character in buf being the - * character following the next instance of c in the file. This will - * read the file as needed. If there's an error, -errno is returned; - * if the end of the file is reached, 0 is returned. - */ -static int next(int fd, char *buf, size_t size, char c) -{ - ssize_t n; - size_t len; - char *ptr; +/* Set by make_tempfile() during early boot. */ +char *tempdir = NULL; - while ((ptr = strchr(buf, c)) == NULL) { - n = read(fd, buf, size - 1); - if (n == 0) - return 0; - else if (n < 0) - return -errno; - - buf[n] = '\0'; +/* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error. */ +static int __init check_tmpfs(const char *dir) +{ + struct statfs st; + + os_info("Checking if %s is on tmpfs...", dir); + if (statfs(dir, &st) < 0) { + os_info("%s\n", strerror(errno)); + } else if (st.f_type != TMPFS_MAGIC) { + os_info("no\n"); + } else { + os_info("OK\n"); + return 0; } - - ptr++; - len = strlen(ptr); - memmove(buf, ptr, len + 1); - - /* - * Refill the buffer so that if there's a partial string that we care - * about, it will be completed, and we can recognize it. - */ - n = read(fd, &buf[len], size - len - 1); - if (n < 0) - return -errno; - - buf[len + n] = '\0'; - return 1; + return -1; } -/* which_tmpdir is called only during early boot */ -static int checked_tmpdir = 0; - /* - * Look for a tmpfs mounted at /dev/shm. I couldn't find a cleaner - * way to do this than to parse /proc/mounts. statfs will return the - * same filesystem magic number and fs id for both /dev and /dev/shm - * when they are both tmpfs, so you can't tell if they are different - * filesystems. Also, there seems to be no other way of finding the - * mount point of a filesystem from within it. - * - * If a /dev/shm tmpfs entry is found, then we switch to using it. - * Otherwise, we stay with the default /tmp. + * Choose the tempdir to use. We want something on tmpfs so that our memory is + * not subject to the host's vm.dirty_ratio. If a tempdir is specified in the + * environment, we use that even if it's not on tmpfs, but we warn the user. + * Otherwise, we try common tmpfs locations, and if no tmpfs directory is found + * then we fall back to /tmp. */ -static void which_tmpdir(void) +static char * __init choose_tempdir(void) { - int fd, found; - char buf[128] = { '\0' }; - - if (checked_tmpdir) - return; - - checked_tmpdir = 1; - - printf("Checking for tmpfs mount on /dev/shm..."); - - fd = open("/proc/mounts", O_RDONLY); - if (fd < 0) { - printf("failed to open /proc/mounts, errno = %d\n", errno); - return; + static const char * const vars[] = { + "TMPDIR", + "TMP", + "TEMP", + NULL + }; + static const char fallback_dir[] = "/tmp"; + static const char * const tmpfs_dirs[] = { + "/dev/shm", + fallback_dir, + NULL + }; + int i; + const char *dir; + + os_info("Checking environment variables for a tempdir..."); + for (i = 0; vars[i]; i++) { + dir = getenv(vars[i]); + if ((dir != NULL) && (*dir != '\0')) { + os_info("%s\n", dir); + if (check_tmpfs(dir) >= 0) + goto done; + else + goto warn; + } } + os_info("none found\n"); - while (1) { - found = next(fd, buf, ARRAY_SIZE(buf), ' '); - if (found != 1) - break; - - if (!strncmp(buf, "/dev/shm", strlen("/dev/shm"))) - goto found; - - found = next(fd, buf, ARRAY_SIZE(buf), '\n'); - if (found != 1) - break; + for (i = 0; tmpfs_dirs[i]; i++) { + dir = tmpfs_dirs[i]; + if (check_tmpfs(dir) >= 0) + goto done; } -err: - if (found == 0) - printf("nothing mounted on /dev/shm\n"); - else if (found < 0) - printf("read returned errno %d\n", -found); - -out: - close(fd); - - return; - -found: - found = next(fd, buf, ARRAY_SIZE(buf), ' '); - if (found != 1) - goto err; - - if (strncmp(buf, "tmpfs", strlen("tmpfs"))) { - printf("not tmpfs\n"); - goto out; - } - - printf("OK\n"); - default_tmpdir = "/dev/shm"; - goto out; + dir = fallback_dir; +warn: + os_warn("Warning: tempdir %s is not on tmpfs\n", dir); +done: + /* Make a copy since getenv results may not remain valid forever. */ + return strdup(dir); } -static int __init make_tempfile(const char *template, char **out_tempname, - int do_unlink) +/* + * Create an unlinked tempfile in a suitable tempdir. template must be the + * basename part of the template with a leading '/'. + */ +static int __init make_tempfile(const char *template) { char *tempname; int fd; - which_tmpdir(); - tempname = malloc(MAXPATHLEN); + if (tempdir == NULL) { + tempdir = choose_tempdir(); + if (tempdir == NULL) { + os_warn("Failed to choose tempdir: %s\n", + strerror(errno)); + return -1; + } + } + +#ifdef O_TMPFILE + fd = open(tempdir, O_CLOEXEC | O_RDWR | O_EXCL | O_TMPFILE, 0700); + /* + * If the running system does not support O_TMPFILE flag then retry + * without it. + */ + if (fd != -1 || (errno != EINVAL && errno != EISDIR && + errno != EOPNOTSUPP)) + return fd; +#endif + + tempname = malloc(strlen(tempdir) + strlen(template) + 1); if (tempname == NULL) return -1; - find_tempdir(); - if ((tempdir == NULL) || (strlen(tempdir) >= MAXPATHLEN)) - goto out; - - if (template[0] != '/') - strcpy(tempname, tempdir); - else - tempname[0] = '\0'; - strncat(tempname, template, MAXPATHLEN-1-strlen(tempname)); + strcpy(tempname, tempdir); + strcat(tempname, template); fd = mkstemp(tempname); if (fd < 0) { - fprintf(stderr, "open - cannot create %s: %s\n", tempname, + os_warn("open - cannot create %s: %s\n", tempname, strerror(errno)); goto out; } - if (do_unlink && (unlink(tempname) < 0)) { + if (unlink(tempname) < 0) { perror("unlink"); goto close; } - if (out_tempname) { - *out_tempname = tempname; - } else - free(tempname); + free(tempname); return fd; close: close(fd); @@ -203,23 +178,17 @@ out: return -1; } -#define TEMPNAME_TEMPLATE "vm_file-XXXXXX" +#define TEMPNAME_TEMPLATE "/vm_file-XXXXXX" static int __init create_tmp_file(unsigned long long len) { int fd, err; char zero; - fd = make_tempfile(TEMPNAME_TEMPLATE, NULL, 1); + fd = make_tempfile(TEMPNAME_TEMPLATE); if (fd < 0) exit(1); - err = fchmod(fd, 0777); - if (err < 0) { - perror("fchmod"); - exit(1); - } - /* * Seek to len - 1 because writing a character there will * increase the file size by one byte, to the desired length. @@ -254,7 +223,6 @@ int __init create_mem_file(unsigned long long len) return fd; } - void __init check_tmpexec(void) { void *addr; @@ -262,17 +230,16 @@ void __init check_tmpexec(void) addr = mmap(NULL, UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, fd, 0); - printf("Checking PROT_EXEC mmap in %s...",tempdir); - fflush(stdout); + os_info("Checking PROT_EXEC mmap in %s...", tempdir); if (addr == MAP_FAILED) { err = errno; - perror("failed"); + os_warn("%s\n", strerror(err)); close(fd); if (err == EPERM) - printf("%s must be not mounted noexec\n",tempdir); + os_warn("%s must be not mounted noexec\n", tempdir); exit(1); } - printf("OK\n"); + os_info("OK\n"); munmap(addr, UM_KERN_PAGE_SIZE); close(fd); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index b8f34c9e53ae..3a2a84ab9325 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -1,119 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> +#include <stdlib.h> #include <unistd.h> #include <errno.h> #include <signal.h> #include <fcntl.h> +#include <limits.h> +#include <linux/futex.h> #include <sys/mman.h> #include <sys/ptrace.h> +#include <sys/prctl.h> #include <sys/wait.h> #include <asm/unistd.h> #include <init.h> #include <longjmp.h> #include <os.h> -#include <skas_ptrace.h> +#include <skas/skas.h> -#define ARBITRARY_ADDR -1 -#define FAILURE_PID -1 - -#define STAT_PATH_LEN sizeof("/proc/#######/stat\0") -#define COMM_SCANF "%*[^)])" - -unsigned long os_process_pc(int pid) +void os_alarm_process(int pid) { - char proc_stat[STAT_PATH_LEN], buf[256]; - unsigned long pc = ARBITRARY_ADDR; - int fd, err; + if (pid <= 0) + return; - sprintf(proc_stat, "/proc/%d/stat", pid); - fd = open(proc_stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't open '%s', " - "errno = %d\n", proc_stat, errno); - goto out; - } - CATCH_EINTR(err = read(fd, buf, sizeof(buf))); - if (err < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't read '%s', " - "err = %d\n", proc_stat, errno); - goto out_close; - } - os_close_file(fd); - pc = ARBITRARY_ADDR; - if (sscanf(buf, "%*d " COMM_SCANF " %*c %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %lu", &pc) != 1) - printk(UM_KERN_ERR "os_process_pc - couldn't find pc in '%s'\n", - buf); - out_close: - close(fd); - out: - return pc; + kill(pid, SIGALRM); } -int os_process_parent(int pid) +void os_kill_process(int pid, int reap_child) { - char stat[STAT_PATH_LEN]; - char data[256]; - int parent = FAILURE_PID, n, fd; - - if (pid == -1) - return parent; - - snprintf(stat, sizeof(stat), "/proc/%d/stat", pid); - fd = open(stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "Couldn't open '%s', errno = %d\n", stat, - errno); - return parent; - } + if (pid <= 0) + return; - CATCH_EINTR(n = read(fd, data, sizeof(data))); - close(fd); - - if (n < 0) { - printk(UM_KERN_ERR "Couldn't read '%s', errno = %d\n", stat, - errno); - return parent; - } - - parent = FAILURE_PID; - n = sscanf(data, "%*d " COMM_SCANF " %*c %d", &parent); - if (n != 1) - printk(UM_KERN_ERR "Failed to scan '%s'\n", data); + /* Block signals until child is reaped */ + block_signals(); - return parent; -} - -void os_stop_process(int pid) -{ - kill(pid, SIGSTOP); -} - -void os_kill_process(int pid, int reap_child) -{ kill(pid, SIGKILL); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); -} - -/* This is here uniquely to have access to the userspace errno, i.e. the one - * used by ptrace in case of error. - */ - -long os_ptrace_ldt(long pid, long addr, long data) -{ - int ret; - - ret = ptrace(PTRACE_LDT, pid, addr, data); - if (ret < 0) - return -errno; - return ret; + unblock_signals(); } /* Kill off a ptraced child by all means available. kill it normally first, @@ -123,11 +52,27 @@ long os_ptrace_ldt(long pid, long addr, long data) void os_kill_ptraced_process(int pid, int reap_child) { + if (pid <= 0) + return; + + /* Block signals until child is reaped */ + block_signals(); + kill(pid, SIGKILL); ptrace(PTRACE_KILL, pid); ptrace(PTRACE_CONT, pid); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); + + unblock_signals(); +} + +pid_t os_reap_child(void) +{ + int status; + + /* Try to reap a child */ + return waitpid(-1, &status, WNOHANG); } /* Don't use the glibc version, which caches the result in TLS. It misses some @@ -139,11 +84,6 @@ int os_getpid(void) return syscall(__NR_getpid); } -int os_getpgrp(void) -{ - return getpgrp(); -} - int os_map_memory(void *virt, int fd, unsigned long long off, unsigned long len, int r, int w, int x) { @@ -241,6 +181,31 @@ void init_new_thread_signals(void) set_handler(SIGBUS); signal(SIGHUP, SIG_IGN); set_handler(SIGIO); + /* We (currently) only use the child reaper IRQ in seccomp mode */ + if (using_seccomp) + set_handler(SIGCHLD); signal(SIGWINCH, SIG_IGN); - signal(SIGTERM, SIG_DFL); +} + +void os_set_pdeathsig(void) +{ + prctl(PR_SET_PDEATHSIG, SIGKILL); +} + +int os_futex_wait(void *uaddr, unsigned int val) +{ + int r; + + CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAIT, val, + NULL, NULL, 0)); + return r < 0 ? -errno : r; +} + +int os_futex_wake(void *uaddr) +{ + int r; + + CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAKE, INT_MAX, + NULL, NULL, 0)); + return r < 0 ? -errno : r; } diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c index 2ff8d4fe83c4..bfba2cbc9478 100644 --- a/arch/um/os-Linux/registers.c +++ b/arch/um/os-Linux/registers.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2004 PathScale, Inc * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <errno.h> @@ -10,33 +10,14 @@ #include <sysdep/ptrace.h> #include <sysdep/ptrace_user.h> #include <registers.h> - -int save_registers(int pid, struct uml_pt_regs *regs) -{ - int err; - - err = ptrace(PTRACE_GETREGS, pid, 0, regs->gp); - if (err < 0) - return -errno; - return 0; -} - -int restore_registers(int pid, struct uml_pt_regs *regs) -{ - int err; - - err = ptrace(PTRACE_SETREGS, pid, 0, regs->gp); - if (err < 0) - return -errno; - return 0; -} +#include <stdlib.h> /* This is set once at boot time and not changed thereafter */ -static unsigned long exec_regs[MAX_REG_NR]; -static unsigned long exec_fp_regs[FP_SIZE]; +unsigned long exec_regs[MAX_REG_NR]; +unsigned long *exec_fp_regs; -int init_registers(int pid) +int init_pid_registers(int pid) { int err; @@ -44,7 +25,11 @@ int init_registers(int pid) if (err < 0) return -errno; - arch_init_registers(pid); + err = arch_init_registers(pid); + if (err < 0) + return err; + + exec_fp_regs = malloc(host_fp_size); get_fp_registers(pid, exec_fp_regs); return 0; } @@ -54,5 +39,5 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs) memcpy(regs, exec_regs, sizeof(exec_regs)); if (fp_regs) - memcpy(fp_regs, exec_fp_regs, sizeof(exec_fp_regs)); + memcpy(fp_regs, exec_fp_regs, host_fp_size); } diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 8b61cc0e82c8..6de145f8fe3d 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <unistd.h> @@ -11,6 +11,8 @@ #include <sched.h> #include <signal.h> #include <string.h> +#include <sys/epoll.h> +#include <asm/unistd.h> #include <kern_util.h> #include <init.h> #include <os.h> @@ -21,366 +23,136 @@ * Protected by sigio_lock(), also used by sigio_cleanup, which is an * exitcall. */ -static int write_sigio_pid = -1; -static unsigned long write_sigio_stack; +static struct os_helper_thread *write_sigio_td; -/* - * These arrays are initialized before the sigio thread is started, and - * the descriptors closed after it is killed. So, it can't see them change. - * On the UML side, they are changed under the sigio_lock. - */ -#define SIGIO_FDS_INIT {-1, -1} - -static int write_sigio_fds[2] = SIGIO_FDS_INIT; -static int sigio_private[2] = SIGIO_FDS_INIT; +static int epollfd = -1; -struct pollfds { - struct pollfd *poll; - int size; - int used; -}; +#define MAX_EPOLL_EVENTS 64 -/* - * Protected by sigio_lock(). Used by the sigio thread, but the UML thread - * synchronizes with it. - */ -static struct pollfds current_poll; -static struct pollfds next_poll; -static struct pollfds all_sigio_fds; +static struct epoll_event epoll_events[MAX_EPOLL_EVENTS]; -static int write_sigio_thread(void *unused) +static void *write_sigio_thread(void *unused) { - struct pollfds *fds, tmp; - struct pollfd *p; - int i, n, respond_fd; - char c; + int pid = getpid(); + int r; + + os_fix_helper_thread_signals(); - signal(SIGWINCH, SIG_IGN); - fds = ¤t_poll; while (1) { - n = poll(fds->poll, fds->used, -1); - if (n < 0) { + r = epoll_wait(epollfd, epoll_events, MAX_EPOLL_EVENTS, -1); + if (r < 0) { if (errno == EINTR) continue; - printk(UM_KERN_ERR "write_sigio_thread : poll returned " - "%d, errno = %d\n", n, errno); - } - for (i = 0; i < fds->used; i++) { - p = &fds->poll[i]; - if (p->revents == 0) - continue; - if (p->fd == sigio_private[1]) { - CATCH_EINTR(n = read(sigio_private[1], &c, - sizeof(c))); - if (n != sizeof(c)) - printk(UM_KERN_ERR - "write_sigio_thread : " - "read on socket failed, " - "err = %d\n", errno); - tmp = current_poll; - current_poll = next_poll; - next_poll = tmp; - respond_fd = sigio_private[1]; - } - else { - respond_fd = write_sigio_fds[1]; - fds->used--; - memmove(&fds->poll[i], &fds->poll[i + 1], - (fds->used - i) * sizeof(*fds->poll)); - } - - CATCH_EINTR(n = write(respond_fd, &c, sizeof(c))); - if (n != sizeof(c)) - printk(UM_KERN_ERR "write_sigio_thread : " - "write on socket failed, err = %d\n", - errno); + printk(UM_KERN_ERR "%s: epoll_wait failed, errno = %d\n", + __func__, errno); } - } - - return 0; -} - -static int need_poll(struct pollfds *polls, int n) -{ - struct pollfd *new; - if (n <= polls->size) - return 0; - - new = uml_kmalloc(n * sizeof(struct pollfd), UM_GFP_ATOMIC); - if (new == NULL) { - printk(UM_KERN_ERR "need_poll : failed to allocate new " - "pollfds\n"); - return -ENOMEM; + CATCH_EINTR(r = syscall(__NR_tgkill, pid, pid, SIGIO)); + if (r < 0) + printk(UM_KERN_ERR "%s: tgkill failed, errno = %d\n", + __func__, errno); } - memcpy(new, polls->poll, polls->used * sizeof(struct pollfd)); - kfree(polls->poll); - - polls->poll = new; - polls->size = n; - return 0; + return NULL; } -/* - * Must be called with sigio_lock held, because it's needed by the marked - * critical section. - */ -static void update_thread(void) +int __add_sigio_fd(int fd) { - unsigned long flags; - int n; - char c; - - flags = set_signals(0); - CATCH_EINTR(n = write(sigio_private[0], &c, sizeof(c))); - if (n != sizeof(c)) { - printk(UM_KERN_ERR "update_thread : write failed, err = %d\n", - errno); - goto fail; - } - - CATCH_EINTR(n = read(sigio_private[0], &c, sizeof(c))); - if (n != sizeof(c)) { - printk(UM_KERN_ERR "update_thread : read failed, err = %d\n", - errno); - goto fail; - } - - set_signals(flags); - return; - fail: - /* Critical section start */ - if (write_sigio_pid != -1) { - os_kill_process(write_sigio_pid, 1); - free_stack(write_sigio_stack, 0); - } - write_sigio_pid = -1; - close(sigio_private[0]); - close(sigio_private[1]); - close(write_sigio_fds[0]); - close(write_sigio_fds[1]); - /* Critical section end */ - set_signals(flags); + struct epoll_event event = { + .data.fd = fd, + .events = EPOLLIN | EPOLLET, + }; + int r; + + CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event)); + return r < 0 ? -errno : 0; } int add_sigio_fd(int fd) { - struct pollfd *p; - int err = 0, i, n; + int err; sigio_lock(); - for (i = 0; i < all_sigio_fds.used; i++) { - if (all_sigio_fds.poll[i].fd == fd) - break; - } - if (i == all_sigio_fds.used) - goto out; - - p = &all_sigio_fds.poll[i]; + err = __add_sigio_fd(fd); + sigio_unlock(); - for (i = 0; i < current_poll.used; i++) { - if (current_poll.poll[i].fd == fd) - goto out; - } + return err; +} - n = current_poll.used; - err = need_poll(&next_poll, n + 1); - if (err) - goto out; +int __ignore_sigio_fd(int fd) +{ + struct epoll_event event; + int r; - memcpy(next_poll.poll, current_poll.poll, - current_poll.used * sizeof(struct pollfd)); - next_poll.poll[n] = *p; - next_poll.used = n + 1; - update_thread(); - out: - sigio_unlock(); - return err; + CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event)); + return r < 0 ? -errno : 0; } int ignore_sigio_fd(int fd) { - struct pollfd *p; - int err = 0, i, n = 0; - - /* - * This is called from exitcalls elsewhere in UML - if - * sigio_cleanup has already run, then update_thread will hang - * or fail because the thread is no longer running. - */ - if (write_sigio_pid == -1) - return -EIO; + int err; sigio_lock(); - for (i = 0; i < current_poll.used; i++) { - if (current_poll.poll[i].fd == fd) - break; - } - if (i == current_poll.used) - goto out; - - err = need_poll(&next_poll, current_poll.used - 1); - if (err) - goto out; - - for (i = 0; i < current_poll.used; i++) { - p = ¤t_poll.poll[i]; - if (p->fd != fd) - next_poll.poll[n++] = *p; - } - next_poll.used = current_poll.used - 1; - - update_thread(); - out: + err = __ignore_sigio_fd(fd); sigio_unlock(); - return err; -} - -static struct pollfd *setup_initial_poll(int fd) -{ - struct pollfd *p; - p = uml_kmalloc(sizeof(struct pollfd), UM_GFP_KERNEL); - if (p == NULL) { - printk(UM_KERN_ERR "setup_initial_poll : failed to allocate " - "poll\n"); - return NULL; - } - *p = ((struct pollfd) { .fd = fd, - .events = POLLIN, - .revents = 0 }); - return p; + return err; } static void write_sigio_workaround(void) { - struct pollfd *p; int err; - int l_write_sigio_fds[2]; - int l_sigio_private[2]; - int l_write_sigio_pid; - /* We call this *tons* of times - and most ones we must just fail. */ sigio_lock(); - l_write_sigio_pid = write_sigio_pid; - sigio_unlock(); - - if (l_write_sigio_pid != -1) - return; + if (write_sigio_td) + goto out; - err = os_pipe(l_write_sigio_fds, 1, 1); - if (err < 0) { - printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 1 failed, " - "err = %d\n", -err); - return; + epollfd = epoll_create(MAX_EPOLL_EVENTS); + if (epollfd < 0) { + printk(UM_KERN_ERR "%s: epoll_create failed, errno = %d\n", + __func__, errno); + goto out; } - err = os_pipe(l_sigio_private, 1, 1); + + err = os_run_helper_thread(&write_sigio_td, write_sigio_thread, NULL); if (err < 0) { - printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 2 failed, " - "err = %d\n", -err); - goto out_close1; + printk(UM_KERN_ERR "%s: os_run_helper_thread failed, errno = %d\n", + __func__, -err); + close(epollfd); + epollfd = -1; + goto out; } - p = setup_initial_poll(l_sigio_private[1]); - if (!p) - goto out_close2; - - sigio_lock(); - - /* - * Did we race? Don't try to optimize this, please, it's not so likely - * to happen, and no more than once at the boot. - */ - if (write_sigio_pid != -1) - goto out_free; - - current_poll = ((struct pollfds) { .poll = p, - .used = 1, - .size = 1 }); - - if (write_sigio_irq(l_write_sigio_fds[0])) - goto out_clear_poll; - - memcpy(write_sigio_fds, l_write_sigio_fds, sizeof(l_write_sigio_fds)); - memcpy(sigio_private, l_sigio_private, sizeof(l_sigio_private)); - - write_sigio_pid = run_helper_thread(write_sigio_thread, NULL, - CLONE_FILES | CLONE_VM, - &write_sigio_stack); - - if (write_sigio_pid < 0) - goto out_clear; - - sigio_unlock(); - return; - -out_clear: - write_sigio_pid = -1; - write_sigio_fds[0] = -1; - write_sigio_fds[1] = -1; - sigio_private[0] = -1; - sigio_private[1] = -1; -out_clear_poll: - current_poll = ((struct pollfds) { .poll = NULL, - .size = 0, - .used = 0 }); -out_free: +out: sigio_unlock(); - kfree(p); -out_close2: - close(l_sigio_private[0]); - close(l_sigio_private[1]); -out_close1: - close(l_write_sigio_fds[0]); - close(l_write_sigio_fds[1]); } -void sigio_broken(int fd, int read) +void sigio_broken(void) { - int err; - write_sigio_workaround(); - - sigio_lock(); - err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1); - if (err) { - printk(UM_KERN_ERR "maybe_sigio_broken - failed to add pollfd " - "for descriptor %d\n", fd); - goto out; - } - - all_sigio_fds.poll[all_sigio_fds.used++] = - ((struct pollfd) { .fd = fd, - .events = read ? POLLIN : POLLOUT, - .revents = 0 }); -out: - sigio_unlock(); } /* Changed during early boot */ static int pty_output_sigio; -static int pty_close_sigio; -void maybe_sigio_broken(int fd, int read) +void maybe_sigio_broken(int fd) { if (!isatty(fd)) return; - if ((read || pty_output_sigio) && (!read || pty_close_sigio)) + if (pty_output_sigio) return; - sigio_broken(fd, read); + sigio_broken(); } static void sigio_cleanup(void) { - if (write_sigio_pid == -1) + if (!write_sigio_td) return; - os_kill_process(write_sigio_pid, 1); - free_stack(write_sigio_stack, 0); - write_sigio_pid = -1; + os_kill_helper_thread(write_sigio_td); + write_sigio_td = NULL; } __uml_exitcall(sigio_cleanup); @@ -514,19 +286,6 @@ static void tty_output(int master, int slave) printk(UM_KERN_CONT "tty_output : read failed, err = %d\n", n); } -static void tty_close(int master, int slave) -{ - printk(UM_KERN_INFO "Checking that host ptys support SIGIO on " - "close..."); - - close(slave); - if (got_sigio) { - printk(UM_KERN_CONT "Yes\n"); - pty_close_sigio = 1; - } else - printk(UM_KERN_CONT "No, enabling workaround\n"); -} - static void __init check_sigio(void) { if ((access("/dev/ptmx", R_OK) < 0) && @@ -536,7 +295,6 @@ static void __init check_sigio(void) return; } check_one_sigio(tty_output); - check_one_sigio(tty_close); } /* Here because it only does the SIGIO testing for now */ diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 9d9f1b4bf826..327fb3c52fc7 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -1,31 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2004 PathScale, Inc * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> #include <stdarg.h> +#include <stdbool.h> #include <errno.h> #include <signal.h> +#include <string.h> #include <strings.h> #include <as-layout.h> #include <kern_util.h> #include <os.h> #include <sysdep/mcontext.h> +#include <um_malloc.h> +#include <sys/ucontext.h> +#include <timetravel.h> #include "internal.h" -void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { +void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = { [SIGTRAP] = relay_signal, [SIGFPE] = relay_signal, [SIGILL] = relay_signal, [SIGWINCH] = winch, - [SIGBUS] = bus_handler, + [SIGBUS] = relay_signal, [SIGSEGV] = segv_handler, [SIGIO] = sigio_handler, - [SIGVTALRM] = timer_handler }; + [SIGCHLD] = sigchld_handler, +}; -static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc) +static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) { struct uml_pt_regs r; int save_errno = errno; @@ -38,10 +46,10 @@ static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc) } /* enable signals if sig isn't IRQ signal */ - if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM)) - unblock_signals(); + if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGCHLD)) + unblock_signals_trace(); - (*sig_info[sig])(sig, si, &r); + (*sig_info[sig])(sig, si, &r, mc); errno = save_errno; } @@ -55,72 +63,131 @@ static void sig_handler_common(int sig, siginfo_t *si, mcontext_t *mc) #define SIGIO_BIT 0 #define SIGIO_MASK (1 << SIGIO_BIT) -#define SIGVTALRM_BIT 1 -#define SIGVTALRM_MASK (1 << SIGVTALRM_BIT) +#define SIGALRM_BIT 1 +#define SIGALRM_MASK (1 << SIGALRM_BIT) -static int signals_enabled; -static unsigned int signals_pending; +#define SIGCHLD_BIT 2 +#define SIGCHLD_MASK (1 << SIGCHLD_BIT) -void sig_handler(int sig, siginfo_t *si, mcontext_t *mc) +__thread int signals_enabled; +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) +static int signals_blocked, signals_blocked_pending; +#endif +static __thread unsigned int signals_pending; +static __thread unsigned int signals_active; + +static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) { - int enabled; + int enabled = signals_enabled; + +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) + if ((signals_blocked || + __atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) && + (sig == SIGIO)) { + /* increment so unblock will do another round */ + __atomic_add_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST); + return; + } +#endif - enabled = signals_enabled; if (!enabled && (sig == SIGIO)) { - signals_pending |= SIGIO_MASK; + /* + * In TT_MODE_EXTERNAL, need to still call time-travel + * handlers. This will mark signals_pending by itself + * (only if necessary.) + * Note we won't get here if signals are hard-blocked + * (which is handled above), in that case the hard- + * unblock will handle things. + */ + if (time_travel_mode == TT_MODE_EXTERNAL) + sigio_run_timetravel_handlers(); + else + signals_pending |= SIGIO_MASK; return; } - block_signals(); + if (!enabled && (sig == SIGCHLD)) { + signals_pending |= SIGCHLD_MASK; + return; + } + + block_signals_trace(); sig_handler_common(sig, si, mc); - set_signals(enabled); + um_set_signals_trace(enabled); } -static void real_alarm_handler(mcontext_t *mc) +static void timer_real_alarm_handler(mcontext_t *mc) { struct uml_pt_regs regs; if (mc != NULL) get_regs_from_mc(®s, mc); - regs.is_user = 0; - unblock_signals(); - timer_handler(SIGVTALRM, NULL, ®s); + else + memset(®s, 0, sizeof(regs)); + timer_handler(SIGALRM, NULL, ®s); } -void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) +static void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) { int enabled; enabled = signals_enabled; if (!signals_enabled) { - signals_pending |= SIGVTALRM_MASK; + signals_pending |= SIGALRM_MASK; return; } - block_signals(); + block_signals_trace(); + + signals_active |= SIGALRM_MASK; + + timer_real_alarm_handler(mc); - real_alarm_handler(mc); - set_signals(enabled); + signals_active &= ~SIGALRM_MASK; + + um_set_signals_trace(enabled); +} + +void deliver_alarm(void) { + timer_alarm_handler(SIGALRM, NULL, NULL); } -void timer_init(void) +void timer_set_signal_handler(void) { - set_handler(SIGVTALRM); + set_handler(SIGALRM); +} + +int timer_alarm_pending(void) +{ + return !!(signals_pending & SIGALRM_MASK); } void set_sigstack(void *sig_stack, int size) { - stack_t stack = ((stack_t) { .ss_flags = 0, - .ss_sp = (__ptr_t) sig_stack, - .ss_size = size - sizeof(void *) }); + stack_t stack = { + .ss_flags = 0, + .ss_sp = sig_stack, + .ss_size = size + }; if (sigaltstack(&stack, NULL) != 0) panic("enabling signal stack failed, errno = %d\n", errno); } -static void (*handlers[_NSIG])(int sig, siginfo_t *si, mcontext_t *mc) = { +static void sigusr1_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) +{ + uml_pm_wake(); +} + +void register_pm_wake_signal(void) +{ + set_handler(SIGUSR1); +} + +static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGSEGV] = sig_handler, [SIGBUS] = sig_handler, [SIGILL] = sig_handler, @@ -129,51 +196,19 @@ static void (*handlers[_NSIG])(int sig, siginfo_t *si, mcontext_t *mc) = { [SIGIO] = sig_handler, [SIGWINCH] = sig_handler, - [SIGVTALRM] = alarm_handler -}; + /* SIGCHLD is only actually registered in seccomp mode. */ + [SIGCHLD] = sig_handler, + [SIGALRM] = timer_alarm_handler, + [SIGUSR1] = sigusr1_handler, +}; static void hard_handler(int sig, siginfo_t *si, void *p) { - struct ucontext *uc = p; + ucontext_t *uc = p; mcontext_t *mc = &uc->uc_mcontext; - unsigned long pending = 1UL << sig; - do { - int nested, bail; - - /* - * pending comes back with one bit set for each - * interrupt that arrived while setting up the stack, - * plus a bit for this interrupt, plus the zero bit is - * set if this is a nested interrupt. - * If bail is true, then we interrupted another - * handler setting up the stack. In this case, we - * have to return, and the upper handler will deal - * with this interrupt. - */ - bail = to_irq_stack(&pending); - if (bail) - return; - - nested = pending & 1; - pending &= ~1; - - while ((sig = ffs(pending)) != 0){ - sig--; - pending &= ~(1 << sig); - (*handlers[sig])(sig, si, mc); - } - - /* - * Again, pending comes back with a mask of signals - * that arrived while tearing down the stack. If this - * is non-zero, we just go back, set up the stack - * again, and handle the new interrupts. - */ - if (!nested) - pending = from_irq_stack(nested); - } while (pending); + (*handlers[sig])(sig, (struct siginfo *)si, mc); } void set_handler(int sig) @@ -186,9 +221,9 @@ void set_handler(int sig) /* block irq ones */ sigemptyset(&action.sa_mask); - sigaddset(&action.sa_mask, SIGVTALRM); sigaddset(&action.sa_mask, SIGIO); sigaddset(&action.sa_mask, SIGWINCH); + sigaddset(&action.sa_mask, SIGALRM); if (sig == SIGSEGV) flags |= SA_NODEFER; @@ -207,6 +242,11 @@ void set_handler(int sig) panic("sigprocmask failed - errno = %d\n", errno); } +void send_sigio_to_self(void) +{ + kill(os_getpid(), SIGIO); +} + int change_sig(int signal, int on) { sigset_t sigset; @@ -219,9 +259,29 @@ int change_sig(int signal, int on) return 0; } -void block_signals(void) +static inline void __block_signals(void) { + if (!signals_enabled) + return; + + os_local_ipi_disable(); + barrier(); signals_enabled = 0; +} + +static inline void __unblock_signals(void) +{ + if (signals_enabled) + return; + + signals_enabled = 1; + barrier(); + os_local_ipi_enable(); +} + +void block_signals(void) +{ + __block_signals(); /* * This must return with signals disabled, so this barrier * ensures that writes are flushed out before the return. @@ -238,6 +298,12 @@ void unblock_signals(void) if (signals_enabled == 1) return; + __unblock_signals(); + +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) + deliver_time_travel_irqs(); +#endif + /* * We loop because the IRQ handler returns with interrupts off. So, * interrupts may have arrived and we need to re-enable them and @@ -247,12 +313,9 @@ void unblock_signals(void) /* * Save and reset save_pending after enabling signals. This * way, signals_pending won't be changed while we're reading it. - */ - signals_enabled = 1; - - /* + * * Setting signals_enabled and reading signals_pending must - * happen in this order. + * happen in this order, so have the barrier here. */ barrier(); @@ -265,10 +328,13 @@ void unblock_signals(void) /* * We have pending interrupts, so disable signals, as the * handlers expect them off when they are called. They will - * be enabled again above. + * be enabled again above. We need to trace this, as we're + * expected to be enabling interrupts already, but any more + * tracing that happens inside the handlers we call for the + * pending signals will mess up the tracing state. */ - - signals_enabled = 0; + __block_signals(); + um_trace_signals_off(); /* * Deal with SIGIO first because the alarm handler might @@ -281,17 +347,34 @@ void unblock_signals(void) if (save_pending & SIGIO_MASK) sig_handler_common(SIGIO, NULL, NULL); - if (save_pending & SIGVTALRM_MASK) - real_alarm_handler(NULL); + if (save_pending & SIGCHLD_MASK) { + struct uml_pt_regs regs = {}; + + sigchld_handler(SIGCHLD, NULL, ®s, NULL); + } + + /* Do not reenter the handler */ + + if ((save_pending & SIGALRM_MASK) && (!(signals_active & SIGALRM_MASK))) + timer_real_alarm_handler(NULL); + + /* Rerun the loop only if there is still pending SIGIO and not in TIMER handler */ + + if (!(signals_pending & SIGIO_MASK) && (signals_active & SIGALRM_MASK)) + return; + + /* Re-enable signals and trace that we're doing so. */ + um_trace_signals_on(); + __unblock_signals(); } } -int get_signals(void) +int um_get_signals(void) { return signals_enabled; } -int set_signals(int enable) +int um_set_signals(int enable) { int ret; if (signals_enabled == enable) @@ -304,3 +387,117 @@ int set_signals(int enable) return ret; } + +int um_set_signals_trace(int enable) +{ + int ret; + if (signals_enabled == enable) + return enable; + + ret = signals_enabled; + if (enable) + unblock_signals_trace(); + else + block_signals_trace(); + + return ret; +} + +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) +void mark_sigio_pending(void) +{ + /* + * It would seem that this should be atomic so + * it isn't a read-modify-write with a signal + * that could happen in the middle, losing the + * value set by the signal. + * + * However, this function is only called when in + * time-travel=ext simulation mode, in which case + * the only signal ever pending is SIGIO, which + * is blocked while this can be called, and the + * timer signal (SIGALRM) cannot happen. + */ + signals_pending |= SIGIO_MASK; +} + +void block_signals_hard(void) +{ + signals_blocked++; + barrier(); +} + +void unblock_signals_hard(void) +{ + static bool unblocking; + + if (!signals_blocked) + panic("unblocking signals while not blocked"); + + if (--signals_blocked) + return; + /* + * Must be set to 0 before we check pending so the + * SIGIO handler will run as normal unless we're still + * going to process signals_blocked_pending. + */ + barrier(); + + /* + * Note that block_signals_hard()/unblock_signals_hard() can be called + * within the unblock_signals()/sigio_run_timetravel_handlers() below. + * This would still be prone to race conditions since it's actually a + * call _within_ e.g. vu_req_read_message(), where we observed this + * issue, which loops. Thus, if the inner call handles the recorded + * pending signals, we can get out of the inner call with the real + * signal hander no longer blocked, and still have a race. Thus don't + * handle unblocking in the inner call, if it happens, but only in + * the outermost call - 'unblocking' serves as an ownership for the + * signals_blocked_pending decrement. + */ + if (unblocking) + return; + unblocking = true; + + while (__atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) { + if (signals_enabled) { + /* signals are enabled so we can touch this */ + signals_pending |= SIGIO_MASK; + /* + * this is a bit inefficient, but that's + * not really important + */ + block_signals(); + unblock_signals(); + } else { + /* + * we need to run time-travel handlers even + * if not enabled + */ + sigio_run_timetravel_handlers(); + } + + /* + * The decrement of signals_blocked_pending must be atomic so + * that the signal handler will either happen before or after + * the decrement, not during a read-modify-write: + * - If it happens before, it can increment it and we'll + * decrement it and do another round in the loop. + * - If it happens after it'll see 0 for both signals_blocked + * and signals_blocked_pending and thus run the handler as + * usual (subject to signals_enabled, but that's unrelated.) + * + * Note that a call to unblock_signals_hard() within the calls + * to unblock_signals() or sigio_run_timetravel_handlers() above + * will do nothing due to the 'unblocking' state, so this cannot + * underflow as the only one decrementing will be the outermost + * one. + */ + if (__atomic_sub_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST) < 0) + panic("signals_blocked_pending underflow"); + } + + unblocking = false; +} +#endif diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile index d2ea3409e072..75f11989d2e9 100644 --- a/arch/um/os-Linux/skas/Makefile +++ b/arch/um/os-Linux/skas/Makefile @@ -1,10 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) -# Licensed under the GPL # obj-y := mem.o process.o USER_OBJS := $(obj-y) -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 689b18db798f..8b9921ac3ef8 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stddef.h> @@ -12,16 +13,47 @@ #include <as-layout.h> #include <mm_id.h> #include <os.h> -#include <proc_mm.h> #include <ptrace_user.h> #include <registers.h> #include <skas.h> #include <sysdep/ptrace.h> #include <sysdep/stub.h> +#include "../internal.h" -extern unsigned long batch_syscall_stub, __syscall_stub_start; +extern char __syscall_stub_start[]; -extern void wait_stub_done(int pid); +void syscall_stub_dump_error(struct mm_id *mm_idp) +{ + struct stub_data *proc_data = (void *)mm_idp->stack; + struct stub_syscall *sc; + + if (proc_data->syscall_data_len < 0 || + proc_data->syscall_data_len >= ARRAY_SIZE(proc_data->syscall_data)) + panic("Syscall data was corrupted by stub (len is: %d, expected maximum: %d)!", + proc_data->syscall_data_len, + mm_idp->syscall_data_len); + + sc = &proc_data->syscall_data[proc_data->syscall_data_len]; + + printk(UM_KERN_ERR "%s : length = %d, last offset = %d", + __func__, mm_idp->syscall_data_len, + proc_data->syscall_data_len); + printk(UM_KERN_ERR "%s : stub syscall type %d failed, return value = 0x%lx\n", + __func__, sc->syscall, proc_data->err); + + print_hex_dump(UM_KERN_ERR, " syscall data: ", 0, + 16, 4, sc, sizeof(*sc), 0); + + if (using_seccomp) { + printk(UM_KERN_ERR "%s: FD map num: %d", __func__, + mm_idp->syscall_fd_num); + print_hex_dump(UM_KERN_ERR, + " FD map: ", 0, 16, + sizeof(mm_idp->syscall_fd_map[0]), + mm_idp->syscall_fd_map, + sizeof(mm_idp->syscall_fd_map), 0); + } +} static inline unsigned long *check_init_stack(struct mm_id * mm_idp, unsigned long *stack) @@ -38,246 +70,215 @@ static unsigned long syscall_regs[MAX_REG_NR]; static int __init init_syscall_regs(void) { get_safe_registers(syscall_regs, NULL); + syscall_regs[REGS_IP_INDEX] = STUB_CODE + - ((unsigned long) &batch_syscall_stub - - (unsigned long) &__syscall_stub_start); + ((unsigned long) stub_syscall_handler - + (unsigned long) __syscall_stub_start); + syscall_regs[REGS_SP_INDEX] = STUB_DATA + + offsetof(struct stub_data, sigstack) + + sizeof(((struct stub_data *) 0)->sigstack) - + sizeof(void *); + return 0; } __initcall(init_syscall_regs); -extern int proc_mm; - -static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) +static inline long do_syscall_stub(struct mm_id *mm_idp) { + struct stub_data *proc_data = (void *)mm_idp->stack; int n, i; - long ret, offset; - unsigned long * data; - unsigned long * syscall; - int err, pid = mm_idp->u.pid; - - if (proc_mm) - /* FIXME: Need to look up userspace_pid by cpu */ - pid = userspace_pid[0]; - - n = ptrace_setregs(pid, syscall_regs); - if (n < 0) { - printk(UM_KERN_ERR "Registers - \n"); - for (i = 0; i < MAX_REG_NR; i++) - printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); - panic("do_syscall_stub : PTRACE_SETREGS failed, errno = %d\n", - -n); - } + int err, pid = mm_idp->pid; + + /* Inform process how much we have filled in. */ + proc_data->syscall_data_len = mm_idp->syscall_data_len; + + if (using_seccomp) { + proc_data->restart_wait = 1; + wait_stub_done_seccomp(mm_idp, 0, 1); + } else { + n = ptrace_setregs(pid, syscall_regs); + if (n < 0) { + printk(UM_KERN_ERR "Registers -\n"); + for (i = 0; i < MAX_REG_NR; i++) + printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); + panic("%s : PTRACE_SETREGS failed, errno = %d\n", + __func__, -n); + } - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) - panic("Failed to continue stub, pid = %d, errno = %d\n", pid, - errno); + err = ptrace(PTRACE_CONT, pid, 0, 0); + if (err) + panic("Failed to continue stub, pid = %d, errno = %d\n", + pid, errno); - wait_stub_done(pid); + wait_stub_done(pid); + } /* - * When the stub stops, we find the following values on the - * beginning of the stack: - * (long )return_value - * (long )offset to failed sycall-data (0, if no error) + * proc_data->err will be negative if there was an (unexpected) error. + * In that case, syscall_data_len points to the last executed syscall, + * otherwise it will be zero (but we do not need to rely on that). */ - ret = *((unsigned long *) mm_idp->stack); - offset = *((unsigned long *) mm_idp->stack + 1); - if (offset) { - data = (unsigned long *)(mm_idp->stack + offset - STUB_DATA); - printk(UM_KERN_ERR "do_syscall_stub : ret = %ld, offset = %ld, " - "data = %p\n", ret, offset, data); - syscall = (unsigned long *)((unsigned long)data + data[0]); - printk(UM_KERN_ERR "do_syscall_stub: syscall %ld failed, " - "return value = 0x%lx, expected return value = 0x%lx\n", - syscall[0], ret, syscall[7]); - printk(UM_KERN_ERR " syscall parameters: " - "0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - syscall[1], syscall[2], syscall[3], - syscall[4], syscall[5], syscall[6]); - for (n = 1; n < data[0]/sizeof(long); n++) { - if (n == 1) - printk(UM_KERN_ERR " additional syscall " - "data:"); - if (n % 4 == 1) - printk("\n" UM_KERN_ERR " "); - printk(" 0x%lx", data[n]); - } - if (n > 1) - printk("\n"); + if (proc_data->err < 0) { + syscall_stub_dump_error(mm_idp); + + /* Store error code in case someone tries to add more syscalls */ + mm_idp->syscall_data_len = proc_data->err; + } else { + mm_idp->syscall_data_len = 0; } - else ret = 0; - *addr = check_init_stack(mm_idp, NULL); + if (using_seccomp) + mm_idp->syscall_fd_num = 0; - return ret; + return mm_idp->syscall_data_len; } -long run_syscall_stub(struct mm_id * mm_idp, int syscall, - unsigned long *args, long expected, void **addr, - int done) +int syscall_stub_flush(struct mm_id *mm_idp) { - unsigned long *stack = check_init_stack(mm_idp, *addr); - - *stack += sizeof(long); - stack += *stack / sizeof(long); - - *stack++ = syscall; - *stack++ = args[0]; - *stack++ = args[1]; - *stack++ = args[2]; - *stack++ = args[3]; - *stack++ = args[4]; - *stack++ = args[5]; - *stack++ = expected; - *stack = 0; - - if (!done && ((((unsigned long) stack) & ~UM_KERN_PAGE_MASK) < - UM_KERN_PAGE_SIZE - 10 * sizeof(long))) { - *addr = stack; + int res; + + if (mm_idp->syscall_data_len == 0) return 0; + + /* If an error happened already, report it and reset the state. */ + if (mm_idp->syscall_data_len < 0) { + res = mm_idp->syscall_data_len; + mm_idp->syscall_data_len = 0; + return res; } - return do_syscall_stub(mm_idp, addr); + res = do_syscall_stub(mm_idp); + mm_idp->syscall_data_len = 0; + + return res; } -long syscall_stub_data(struct mm_id * mm_idp, - unsigned long *data, int data_count, - void **addr, void **stub_addr) +struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp) { - unsigned long *stack; - int ret = 0; - - /* - * If *addr still is uninitialized, it *must* contain NULL. - * Thus in this case do_syscall_stub correctly won't be called. - */ - if ((((unsigned long) *addr) & ~UM_KERN_PAGE_MASK) >= - UM_KERN_PAGE_SIZE - (10 + data_count) * sizeof(long)) { - ret = do_syscall_stub(mm_idp, addr); - /* in case of error, don't overwrite data on stack */ - if (ret) - return ret; + struct stub_syscall *sc; + struct stub_data *proc_data = (struct stub_data *) mm_idp->stack; + + if (mm_idp->syscall_data_len > 0 && + mm_idp->syscall_data_len == ARRAY_SIZE(proc_data->syscall_data)) + do_syscall_stub(mm_idp); + + if (mm_idp->syscall_data_len < 0) { + /* Return dummy to retain error state. */ + sc = &proc_data->syscall_data[0]; + } else { + sc = &proc_data->syscall_data[mm_idp->syscall_data_len]; + mm_idp->syscall_data_len += 1; } + memset(sc, 0, sizeof(*sc)); - stack = check_init_stack(mm_idp, *addr); - *addr = stack; + return sc; +} - *stack = data_count * sizeof(long); +static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp, + int syscall_type, + unsigned long virt) +{ + if (mm_idp->syscall_data_len > 0) { + struct stub_data *proc_data = (void *) mm_idp->stack; + struct stub_syscall *sc; - memcpy(stack + 1, data, data_count * sizeof(long)); + sc = &proc_data->syscall_data[mm_idp->syscall_data_len - 1]; - *stub_addr = (void *)(((unsigned long)(stack + 1) & - ~UM_KERN_PAGE_MASK) + STUB_DATA); + if (sc->syscall == syscall_type && + sc->mem.addr + sc->mem.length == virt) + return sc; + } - return 0; + return NULL; } -int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, int prot, - int phys_fd, unsigned long long offset, int done, void **data) +static int get_stub_fd(struct mm_id *mm_idp, int fd) { - int ret; - - if (proc_mm) { - struct proc_mm_op map; - int fd = mm_idp->u.mm_fd; - - map = ((struct proc_mm_op) { .op = MM_MMAP, - .u = - { .mmap = - { .addr = virt, - .len = len, - .prot = prot, - .flags = MAP_SHARED | - MAP_FIXED, - .fd = phys_fd, - .offset= offset - } } } ); - CATCH_EINTR(ret = write(fd, &map, sizeof(map))); - if (ret != sizeof(map)) { - ret = -errno; - printk(UM_KERN_ERR "map : /proc/mm map failed, " - "err = %d\n", -ret); + int i; + + /* Find an FD slot (or flush and use first) */ + if (!using_seccomp) + return fd; + + /* Already crashed, value does not matter */ + if (mm_idp->syscall_data_len < 0) + return 0; + + /* Find existing FD in map if we can allocate another syscall */ + if (mm_idp->syscall_data_len < + ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) { + for (i = 0; i < mm_idp->syscall_fd_num; i++) { + if (mm_idp->syscall_fd_map[i] == fd) + return i; } - else ret = 0; - } - else { - unsigned long args[] = { virt, len, prot, - MAP_SHARED | MAP_FIXED, phys_fd, - MMAP_OFFSET(offset) }; - ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt, - data, done); + if (mm_idp->syscall_fd_num < STUB_MAX_FDS) { + i = mm_idp->syscall_fd_num; + mm_idp->syscall_fd_map[i] = fd; + + mm_idp->syscall_fd_num++; + + return i; + } } - return ret; + /* FD map full or no syscall space available, continue after flush */ + do_syscall_stub(mm_idp); + mm_idp->syscall_fd_map[0] = fd; + mm_idp->syscall_fd_num = 1; + + return 0; } -int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - int done, void **data) +int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - int ret; - - if (proc_mm) { - struct proc_mm_op unmap; - int fd = mm_idp->u.mm_fd; - - unmap = ((struct proc_mm_op) { .op = MM_MUNMAP, - .u = - { .munmap = - { .addr = - (unsigned long) addr, - .len = len } } } ); - CATCH_EINTR(ret = write(fd, &unmap, sizeof(unmap))); - if (ret != sizeof(unmap)) { - ret = -errno; - printk(UM_KERN_ERR "unmap - proc_mm write returned " - "%d\n", ret); + struct stub_syscall *sc; + + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt); + if (sc && sc->mem.prot == prot && + sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) { + int prev_fd = sc->mem.fd; + + if (using_seccomp) + prev_fd = mm_idp->syscall_fd_map[sc->mem.fd]; + + if (phys_fd == prev_fd) { + sc->mem.length += len; + return 0; } - else ret = 0; } - else { - unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0, - 0 }; - ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0, - data, done); - } + phys_fd = get_stub_fd(mm_idp, phys_fd); - return ret; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MMAP; + sc->mem.addr = virt; + sc->mem.length = len; + sc->mem.prot = prot; + sc->mem.fd = phys_fd; + sc->mem.offset = MMAP_OFFSET(offset); + + return 0; } -int protect(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - unsigned int prot, int done, void **data) +int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) { - struct proc_mm_op protect; - int ret; - - if (proc_mm) { - int fd = mm_idp->u.mm_fd; - - protect = ((struct proc_mm_op) { .op = MM_MPROTECT, - .u = - { .mprotect = - { .addr = - (unsigned long) addr, - .len = len, - .prot = prot } } } ); - - CATCH_EINTR(ret = write(fd, &protect, sizeof(protect))); - if (ret != sizeof(protect)) { - ret = -errno; - printk(UM_KERN_ERR "protect failed, err = %d", -ret); - } - else ret = 0; - } - else { - unsigned long args[] = { addr, len, prot, 0, 0, 0 }; + struct stub_syscall *sc; - ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0, - data, done); + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MUNMAP, addr); + if (sc) { + sc->mem.length += len; + return 0; } - return ret; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MUNMAP; + sc->mem.addr = addr; + sc->mem.length = len; + + return 0; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 4625949bf1e4..d6c22f8aa06d 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -1,33 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> +#include <stdbool.h> #include <unistd.h> #include <sched.h> #include <errno.h> #include <string.h> +#include <fcntl.h> +#include <mem_user.h> #include <sys/mman.h> #include <sys/wait.h> +#include <sys/stat.h> +#include <sys/socket.h> #include <asm/unistd.h> #include <as-layout.h> #include <init.h> #include <kern_util.h> #include <mem.h> #include <os.h> -#include <proc_mm.h> #include <ptrace_user.h> #include <registers.h> #include <skas.h> -#include <skas_ptrace.h> #include <sysdep/stub.h> +#include <sysdep/mcontext.h> +#include <linux/futex.h> +#include <linux/threads.h> +#include <timetravel.h> +#include <asm-generic/rwonce.h> +#include "../internal.h" int is_skas_winch(int pid, int fd, void *data) { return pid == getpgrp(); } +static const char *ptrace_reg_name(int idx) +{ +#define R(n) case HOST_##n: return #n + + switch (idx) { +#ifdef __x86_64__ + R(BX); + R(CX); + R(DI); + R(SI); + R(DX); + R(BP); + R(AX); + R(R8); + R(R9); + R(R10); + R(R11); + R(R12); + R(R13); + R(R14); + R(R15); + R(ORIG_AX); + R(CS); + R(SS); + R(EFLAGS); +#elif defined(__i386__) + R(IP); + R(SP); + R(EFLAGS); + R(AX); + R(BX); + R(CX); + R(DX); + R(SI); + R(DI); + R(BP); + R(CS); + R(SS); + R(DS); + R(FS); + R(ES); + R(GS); + R(ORIG_AX); +#endif + } + return ""; +} + static int ptrace_dump_regs(int pid) { unsigned long regs[MAX_REG_NR]; @@ -37,8 +96,11 @@ static int ptrace_dump_regs(int pid) return -errno; printk(UM_KERN_ERR "Stub registers -\n"); - for (i = 0; i < ARRAY_SIZE(regs); i++) - printk(UM_KERN_ERR "\t%d - %lx\n", i, regs[i]); + for (i = 0; i < ARRAY_SIZE(regs); i++) { + const char *regname = ptrace_reg_name(i); + + printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); + } return 0; } @@ -47,7 +109,7 @@ static int ptrace_dump_regs(int pid) * Signals that are OK to receive in the stub - we'll just continue it. * SIGWINCH will happen when UML is inside a detached screen. */ -#define STUB_SIG_MASK ((1 << SIGVTALRM) | (1 << SIGWINCH)) +#define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) /* Signals that the stub will finish with - anything else is an error */ #define STUB_DONE_MASK (1 << SIGTRAP) @@ -66,8 +128,8 @@ void wait_stub_done(int pid) err = ptrace(PTRACE_CONT, pid, 0, 0); if (err) { - printk(UM_KERN_ERR "wait_stub_done : continue failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", + __func__, errno); fatal_sigsegv(); } } @@ -78,384 +140,647 @@ void wait_stub_done(int pid) bad_wait: err = ptrace_dump_regs(pid); if (err) - printk(UM_KERN_ERR "Failed to get registers from stub, " - "errno = %d\n", -err); - printk(UM_KERN_ERR "wait_stub_done : failed to wait for SIGTRAP, " - "pid = %d, n = %d, errno = %d, status = 0x%x\n", pid, n, errno, - status); + printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", + -err); + printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", + __func__, pid, n, errno, status); fatal_sigsegv(); } -extern unsigned long current_stub_stack(void); - -static void get_skas_faultinfo(int pid, struct faultinfo *fi) +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys) { - int err; + struct stub_data *data = (void *)mm_idp->stack; + int ret; - if (ptrace_faultinfo) { - err = ptrace(PTRACE_FAULTINFO, pid, 0, fi); - if (err) { - printk(UM_KERN_ERR "get_skas_faultinfo - " - "PTRACE_FAULTINFO failed, errno = %d\n", errno); - fatal_sigsegv(); + do { + const char byte = 0; + struct iovec iov = { + .iov_base = (void *)&byte, + .iov_len = sizeof(byte), + }; + union { + char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))]; + struct cmsghdr align; + } ctrl; + struct msghdr msgh = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + if (!running) { + if (mm_idp->syscall_fd_num) { + unsigned int fds_size = + sizeof(int) * mm_idp->syscall_fd_num; + struct cmsghdr *cmsg; + + msgh.msg_control = ctrl.data; + msgh.msg_controllen = CMSG_SPACE(fds_size); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fds_size); + memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map, + fds_size); + + CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock, + &msgh, 0)); + } + + data->signal = 0; + data->futex = FUTEX_IN_CHILD; + CATCH_EINTR(syscall(__NR_futex, &data->futex, + FUTEX_WAKE, 1, NULL, NULL, 0)); } - /* Special handling for i386, which has different structs */ - if (sizeof(struct ptrace_faultinfo) < sizeof(struct faultinfo)) - memset((char *)fi + sizeof(struct ptrace_faultinfo), 0, - sizeof(struct faultinfo) - - sizeof(struct ptrace_faultinfo)); - } - else { - unsigned long fpregs[FP_SIZE]; + do { + /* + * We need to check whether the child is still alive + * before and after the FUTEX_WAIT call. Before, in + * case it just died but we still updated data->futex + * to FUTEX_IN_CHILD. And after, in case it died while + * we were waiting (and SIGCHLD woke us up, see the + * IRQ handler in mmu.c). + * + * Either way, if PID is negative, then we have no + * choice but to kill the task. + */ + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + ret = syscall(__NR_futex, &data->futex, + FUTEX_WAIT, FUTEX_IN_CHILD, + NULL, NULL, 0); + if (ret < 0 && errno != EINTR && errno != EAGAIN) { + printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (data->futex == FUTEX_IN_CHILD); - err = get_fp_registers(pid, fpregs); - if (err < 0) { - printk(UM_KERN_ERR "save_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } - err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); - if (err) { - printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " - "errno = %d\n", pid, errno); - fatal_sigsegv(); - } - wait_stub_done(pid); + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; - /* - * faultinfo is prepared by the stub-segv-handler at start of - * the stub stack page. We just have to copy it. - */ - memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); + running = 0; - err = put_fp_registers(pid, fpregs); - if (err < 0) { - printk(UM_KERN_ERR "put_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } + /* We may receive a SIGALRM before SIGSYS, iterate again. */ + } while (wait_sigsys && data->signal == SIGALRM); + + if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) { + printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__); + goto out_kill; } + + if (wait_sigsys && data->signal != SIGSYS) { + printk(UM_KERN_ERR "%s : expected SIGSYS but got %d", + __func__, data->signal); + goto out_kill; + } + + return; + +out_kill: + printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n", + __func__, mm_idp->pid, errno); + /* This is not true inside start_userspace */ + if (current_mm_id() == mm_idp) + fatal_sigsegv(); } -static void handle_segv(int pid, struct uml_pt_regs * regs) +extern unsigned long current_stub_stack(void); + +static void get_skas_faultinfo(int pid, struct faultinfo *fi) { - get_skas_faultinfo(pid, ®s->faultinfo); - segv(regs->faultinfo, 0, 1, NULL); + int err; + + err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); + if (err) { + printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " + "errno = %d\n", pid, errno); + fatal_sigsegv(); + } + wait_stub_done(pid); + + /* + * faultinfo is prepared by the stub_segv_handler at start of + * the stub stack page. We just have to copy it. + */ + memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); } -/* - * To use the same value of using_sysemu as the caller, ask it that value - * (in local_using_sysemu - */ -static void handle_trap(int pid, struct uml_pt_regs *regs, - int local_using_sysemu) +static void handle_trap(struct uml_pt_regs *regs) { - int err, status; - if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) fatal_sigsegv(); - /* Mark this as a syscall */ - UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp); + handle_syscall(regs); +} - if (!local_using_sysemu) - { - err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, - __NR_getpid); - if (err < 0) { - printk(UM_KERN_ERR "handle_trap - nullifying syscall " - "failed, errno = %d\n", errno); - fatal_sigsegv(); - } +extern char __syscall_stub_start[]; - err = ptrace(PTRACE_SYSCALL, pid, 0, 0); - if (err < 0) { - printk(UM_KERN_ERR "handle_trap - continuing to end of " - "syscall failed, errno = %d\n", errno); - fatal_sigsegv(); - } +static int stub_exe_fd; - CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if ((err < 0) || !WIFSTOPPED(status) || - (WSTOPSIG(status) != SIGTRAP + 0x80)) { - err = ptrace_dump_regs(pid); - if (err) - printk(UM_KERN_ERR "Failed to get registers " - "from process, errno = %d\n", -err); - printk(UM_KERN_ERR "handle_trap - failed to wait at " - "end of syscall, errno = %d, status = %d\n", - errno, status); - fatal_sigsegv(); - } +struct tramp_data { + struct stub_data *stub_data; + /* 0 is inherited, 1 is the kernel side */ + int sockpair[2]; +}; + +#ifndef CLOSE_RANGE_CLOEXEC +#define CLOSE_RANGE_CLOEXEC (1U << 2) +#endif + +static int userspace_tramp(void *data) +{ + struct tramp_data *tramp_data = data; + char *const argv[] = { "uml-userspace", NULL }; + unsigned long long offset; + struct stub_init_data init_data = { + .seccomp = using_seccomp, + .stub_start = STUB_START, + }; + int ret; + + if (using_seccomp) { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_signal_interrupt - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = STUB_CODE + + (unsigned long) stub_signal_restorer - + (unsigned long) __syscall_stub_start; + } else { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_segv_handler - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = 0; } - handle_syscall(regs); + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), + &offset); + init_data.stub_code_offset = MMAP_OFFSET(offset); + + init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data), + &offset); + init_data.stub_data_offset = MMAP_OFFSET(offset); + + /* + * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs + * and then unsetting it on all memory related FDs. + * This is not strictly necessary from a safety perspective. + */ + syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); + + fcntl(init_data.stub_data_fd, F_SETFD, 0); + + /* dup2 signaling FD/socket to STDIN */ + if (dup2(tramp_data->sockpair[0], 0) < 0) + exit(3); + close(tramp_data->sockpair[0]); + + /* Write init_data and close write side */ + ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data)); + close(tramp_data->sockpair[1]); + + if (ret != sizeof(init_data)) + exit(4); + + /* Raw execveat for compatibility with older libc versions */ + syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", + (unsigned long)argv, NULL, AT_EMPTY_PATH); + + exit(5); } -extern int __syscall_stub_start; +extern char stub_exe_start[]; +extern char stub_exe_end[]; + +extern char *tempdir; -static int userspace_tramp(void *stack) +#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" + +#ifndef MFD_EXEC +#define MFD_EXEC 0x0010U +#endif + +static int __init init_stub_exe_fd(void) { - void *addr; - int err; + size_t written = 0; + char *tmpfile = NULL; - ptrace(PTRACE_TRACEME, 0, 0, 0); + stub_exe_fd = memfd_create("uml-userspace", + MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); - signal(SIGTERM, SIG_DFL); - signal(SIGWINCH, SIG_IGN); - err = set_interval(); - if (err) { - printk(UM_KERN_ERR "userspace_tramp - setting timer failed, " - "errno = %d\n", err); - exit(1); + if (stub_exe_fd < 0) { + printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); + + tmpfile = malloc(strlen(tempdir) + + strlen(STUB_EXE_NAME_TEMPLATE) + 1); + if (tmpfile == NULL) + panic("Failed to allocate memory for stub binary name"); + + strcpy(tmpfile, tempdir); + strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); + + stub_exe_fd = mkstemp(tmpfile); + if (stub_exe_fd < 0) + panic("Could not create temporary file for stub binary: %d", + -errno); } - if (!proc_mm) { - /* - * This has a pte, but it can't be mapped in with the usual - * tlb_flush mechanism because this is part of that mechanism - */ - int fd; - unsigned long long offset; - fd = phys_mapping(to_phys(&__syscall_stub_start), &offset); - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); - if (addr == MAP_FAILED) { - printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, " - "errno = %d\n", STUB_CODE, errno); - exit(1); - } + while (written < stub_exe_end - stub_exe_start) { + ssize_t res = write(stub_exe_fd, stub_exe_start + written, + stub_exe_end - stub_exe_start - written); + if (res < 0) { + if (errno == EINTR) + continue; - if (stack != NULL) { - fd = phys_mapping(to_phys(stack), &offset); - addr = mmap((void *) STUB_DATA, - UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, fd, offset); - if (addr == MAP_FAILED) { - printk(UM_KERN_ERR "mapping segfault stack " - "at 0x%lx failed, errno = %d\n", - STUB_DATA, errno); - exit(1); - } + if (tmpfile) + unlink(tmpfile); + panic("Failed write stub binary: %d", -errno); } + + written += res; } - if (!ptrace_faultinfo && (stack != NULL)) { - struct sigaction sa; - - unsigned long v = STUB_CODE + - (unsigned long) stub_segv_handler - - (unsigned long) &__syscall_stub_start; - - set_sigstack((void *) STUB_DATA, UM_KERN_PAGE_SIZE); - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; - sa.sa_sigaction = (void *) v; - sa.sa_restorer = NULL; - if (sigaction(SIGSEGV, &sa, NULL) < 0) { - printk(UM_KERN_ERR "userspace_tramp - setting SIGSEGV " - "handler failed - errno = %d\n", errno); - exit(1); + + if (!tmpfile) { + fcntl(stub_exe_fd, F_ADD_SEALS, + F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); + } else { + if (fchmod(stub_exe_fd, 00500) < 0) { + unlink(tmpfile); + panic("Could not make stub binary executable: %d", + -errno); + } + + close(stub_exe_fd); + stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); + if (stub_exe_fd < 0) { + unlink(tmpfile); + panic("Could not reopen stub binary: %d", -errno); } + + unlink(tmpfile); + free(tmpfile); } - kill(os_getpid(), SIGSTOP); return 0; } - -/* Each element set once, and only accessed by a single processor anyway */ -#undef NR_CPUS -#define NR_CPUS 1 -int userspace_pid[NR_CPUS]; - -int start_userspace(unsigned long stub_stack) +__initcall(init_stub_exe_fd); + +int using_seccomp; + +/** + * start_userspace() - prepare a new userspace process + * @mm_id: The corresponding struct mm_id + * + * Setups a new temporary stack page that is used while userspace_tramp() runs + * Clones the kernel process into a new userspace process, with FDs only. + * + * Return: When positive: the process id of the new userspace process, + * when negative: an error number. + * FIXME: can PIDs become negative?! + */ +int start_userspace(struct mm_id *mm_id) { + struct stub_data *proc_data = (void *)mm_id->stack; + struct tramp_data tramp_data = { + .stub_data = proc_data, + }; void *stack; unsigned long sp; - int pid, status, n, flags, err; + int status, n, err; + /* setup a temporary stack page */ stack = mmap(NULL, UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (stack == MAP_FAILED) { err = -errno; - printk(UM_KERN_ERR "start_userspace : mmap failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", + __func__, errno); return err; } - sp = (unsigned long) stack + UM_KERN_PAGE_SIZE - sizeof(void *); + /* set stack pointer to the end of the stack page, so it can grow downwards */ + sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; - flags = CLONE_FILES; - if (proc_mm) - flags |= CLONE_VM; - else - flags |= SIGCHLD; - - pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); - if (pid < 0) { + /* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) { err = -errno; - printk(UM_KERN_ERR "start_userspace : clone failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n", + __func__, errno); return err; } - do { - CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); - if (n < 0) { - err = -errno; - printk(UM_KERN_ERR "start_userspace : wait failed, " - "errno = %d\n", errno); - goto out_kill; - } - } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM)); + if (using_seccomp) + proc_data->futex = FUTEX_IN_CHILD; - if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { - err = -EINVAL; - printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got " - "status = %d\n", status); - goto out_kill; + mm_id->pid = clone(userspace_tramp, (void *) sp, + CLONE_VFORK | CLONE_VM | SIGCHLD, + (void *)&tramp_data); + if (mm_id->pid < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", + __func__, errno); + goto out_close; } - if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, - (void *) PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "start_userspace : PTRACE_OLDSETOPTIONS " - "failed, errno = %d\n", errno); - goto out_kill; + if (using_seccomp) { + wait_stub_done_seccomp(mm_id, 1, 1); + } else { + do { + CATCH_EINTR(n = waitpid(mm_id->pid, &status, + WUNTRACED | __WALL)); + if (n < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); + + if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { + err = -EINVAL; + printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", + __func__, status); + goto out_kill; + } + + if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL, + (void *) PTRACE_O_TRACESYSGOOD) < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", + __func__, errno); + goto out_kill; + } } if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { err = -errno; - printk(UM_KERN_ERR "start_userspace : munmap failed, " - "errno = %d\n", errno); + printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", + __func__, errno); goto out_kill; } - return pid; + close(tramp_data.sockpair[0]); + if (using_seccomp) + mm_id->sock = tramp_data.sockpair[1]; + else + close(tramp_data.sockpair[1]); + + return 0; + +out_kill: + os_kill_ptraced_process(mm_id->pid, 1); +out_close: + close(tramp_data.sockpair[0]); + close(tramp_data.sockpair[1]); + + mm_id->pid = -1; - out_kill: - os_kill_ptraced_process(pid, 1); return err; } +static int unscheduled_userspace_iterations; +extern unsigned long tt_extra_sched_jiffies; + void userspace(struct uml_pt_regs *regs) { - struct itimerval timer; - unsigned long long nsecs, now; - int err, status, op, pid = userspace_pid[0]; - /* To prevent races if using_sysemu changes under us.*/ - int local_using_sysemu; - siginfo_t si; + int err, status, op; + siginfo_t si_local; + siginfo_t *si; + int sig; /* Handle any immediate reschedules or signals */ interrupt_end(); - if (getitimer(ITIMER_VIRTUAL, &timer)) - printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno); - nsecs = timer.it_value.tv_sec * UM_NSEC_PER_SEC + - timer.it_value.tv_usec * UM_NSEC_PER_USEC; - nsecs += os_nsecs(); - while (1) { + struct mm_id *mm_id = current_mm_id(); + /* - * This can legitimately fail if the process loads a - * bogus value into a segment register. It will - * segfault and PTRACE_GETREGS will read that value - * out of the process. However, PTRACE_SETREGS will - * fail. In this case, there is nothing to do but - * just kill the process. + * At any given time, only one CPU thread can enter the + * turnstile to operate on the same stub process, including + * executing stub system calls (mmap and munmap). */ - if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) - fatal_sigsegv(); + enter_turnstile(mm_id); - if (put_fp_registers(pid, regs->fp)) - fatal_sigsegv(); + /* + * When we are in time-travel mode, userspace can theoretically + * do a *lot* of work without being scheduled. The problem with + * this is that it will prevent kernel bookkeeping (primarily + * the RCU) from running and this can for example cause OOM + * situations. + * + * This code accounts a jiffie against the scheduling clock + * after the defined userspace iterations in the same thread. + * By doing so the situation is effectively prevented. + */ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { +#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS + if (CONFIG_UML_MAX_USERSPACE_ITERATIONS && + unscheduled_userspace_iterations++ > + CONFIG_UML_MAX_USERSPACE_ITERATIONS) { + tt_extra_sched_jiffies += 1; + unscheduled_userspace_iterations = 0; + } +#endif + } - /* Now we set local_using_sysemu to be used for one loop */ - local_using_sysemu = get_using_sysemu(); + time_travel_print_bc_msg(); - op = SELECT_PTRACE_OPERATION(local_using_sysemu, - singlestepping(NULL)); + current_mm_sync(); - if (ptrace(op, pid, 0, 0)) { - printk(UM_KERN_ERR "userspace - ptrace continue " - "failed, op = %d, errno = %d\n", op, errno); - fatal_sigsegv(); - } + if (using_seccomp) { + struct stub_data *proc_data = (void *) mm_id->stack; - CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if (err < 0) { - printk(UM_KERN_ERR "userspace - wait failed, " - "errno = %d\n", errno); - fatal_sigsegv(); - } + err = set_stub_state(regs, proc_data, singlestepping()); + if (err) { + printk(UM_KERN_ERR "%s - failed to set regs: %d", + __func__, err); + fatal_sigsegv(); + } - regs->is_user = 1; - if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "userspace - PTRACE_GETREGS failed, " - "errno = %d\n", errno); - fatal_sigsegv(); - } + /* Must have been reset by the syscall caller */ + if (proc_data->restart_wait != 0) + panic("Programming error: Flag to only run syscalls in child was not cleared!"); - if (get_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "userspace - get_fp_registers failed, " - "errno = %d\n", errno); - fatal_sigsegv(); - } + /* Mark pending syscalls for flushing */ + proc_data->syscall_data_len = mm_id->syscall_data_len; - UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + wait_stub_done_seccomp(mm_id, 0, 0); - if (WIFSTOPPED(status)) { - int sig = WSTOPSIG(status); + sig = proc_data->signal; - ptrace(PTRACE_GETSIGINFO, pid, 0, &si); + if (sig == SIGTRAP && proc_data->err != 0) { + printk(UM_KERN_ERR "%s - Error flushing stub syscalls", + __func__); + syscall_stub_dump_error(mm_id); + mm_id->syscall_data_len = proc_data->err; + fatal_sigsegv(); + } - switch (sig) { - case SIGSEGV: - if (PTRACE_FULL_FAULTINFO || - !ptrace_faultinfo) { + mm_id->syscall_data_len = 0; + mm_id->syscall_fd_num = 0; + + err = get_stub_state(regs, proc_data, NULL); + if (err) { + printk(UM_KERN_ERR "%s - failed to get regs: %d", + __func__, err); + fatal_sigsegv(); + } + + if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si)) + panic("%s - Invalid siginfo offset from child", __func__); + + si = &si_local; + memcpy(si, &proc_data->sigstack[proc_data->si_offset], sizeof(*si)); + + regs->is_user = 1; + + /* Fill in ORIG_RAX and extract fault information */ + PT_SYSCALL_NR(regs->gp) = si->si_syscall; + if (sig == SIGSEGV) { + mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset]; + + GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext); + } + } else { + int pid = mm_id->pid; + + /* Flush out any pending syscalls */ + err = syscall_stub_flush(mm_id); + if (err) { + if (err == -ENOMEM) + report_enomem(); + + printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", + __func__, -err); + fatal_sigsegv(); + } + + /* + * This can legitimately fail if the process loads a + * bogus value into a segment register. It will + * segfault and PTRACE_GETREGS will read that value + * out of the process. However, PTRACE_SETREGS will + * fail. In this case, there is nothing to do but + * just kill the process. + */ + if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (put_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (singlestepping()) + op = PTRACE_SYSEMU_SINGLESTEP; + else + op = PTRACE_SYSEMU; + + if (ptrace(op, pid, 0, 0)) { + printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", + __func__, op, errno); + fatal_sigsegv(); + } + + CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); + if (err < 0) { + printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + regs->is_user = 1; + if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (get_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (WIFSTOPPED(status)) { + sig = WSTOPSIG(status); + + /* + * These signal handlers need the si argument + * and SIGSEGV needs the faultinfo. + * The SIGIO and SIGALARM handlers which constitute + * the majority of invocations, do not use it. + */ + switch (sig) { + case SIGSEGV: get_skas_faultinfo(pid, ®s->faultinfo); - (*sig_info[SIGSEGV])(SIGSEGV, &si, - regs); + fallthrough; + case SIGTRAP: + case SIGILL: + case SIGBUS: + case SIGFPE: + case SIGWINCH: + ptrace(PTRACE_GETSIGINFO, pid, 0, + (struct siginfo *)&si_local); + si = &si_local; + break; + default: + si = NULL; + break; } - else handle_segv(pid, regs); + } else { + sig = 0; + } + } + + exit_turnstile(mm_id); + + UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + + if (sig) { + switch (sig) { + case SIGSEGV: + if (using_seccomp || PTRACE_FULL_FAULTINFO) + (*sig_info[SIGSEGV])(SIGSEGV, + (struct siginfo *)si, + regs, NULL); + else + segv(regs->faultinfo, 0, 1, NULL, NULL); + + break; + case SIGSYS: + handle_syscall(regs); break; case SIGTRAP + 0x80: - handle_trap(pid, regs, local_using_sysemu); + handle_trap(regs); break; case SIGTRAP: - relay_signal(SIGTRAP, &si, regs); + relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL); break; - case SIGVTALRM: - now = os_nsecs(); - if (now < nsecs) - break; - block_signals(); - (*sig_info[sig])(sig, &si, regs); - unblock_signals(); - nsecs = timer.it_value.tv_sec * - UM_NSEC_PER_SEC + - timer.it_value.tv_usec * - UM_NSEC_PER_USEC; - nsecs += os_nsecs(); + case SIGALRM: break; case SIGIO: case SIGILL: case SIGBUS: case SIGFPE: case SIGWINCH: - block_signals(); - (*sig_info[sig])(sig, &si, regs); - unblock_signals(); + block_signals_trace(); + (*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL); + unblock_signals_trace(); break; default: - printk(UM_KERN_ERR "userspace - child stopped " - "with signal %d\n", sig); + printk(UM_KERN_ERR "%s - child stopped with signal %d\n", + __func__, sig); fatal_sigsegv(); } - pid = userspace_pid[0]; interrupt_end(); /* Avoid -ERESTARTSYS handling in host */ @@ -465,173 +790,6 @@ void userspace(struct uml_pt_regs *regs) } } -static unsigned long thread_regs[MAX_REG_NR]; -static unsigned long thread_fp_regs[FP_SIZE]; - -static int __init init_thread_regs(void) -{ - get_safe_registers(thread_regs, thread_fp_regs); - /* Set parent's instruction pointer to start of clone-stub */ - thread_regs[REGS_IP_INDEX] = STUB_CODE + - (unsigned long) stub_clone_handler - - (unsigned long) &__syscall_stub_start; - thread_regs[REGS_SP_INDEX] = STUB_DATA + UM_KERN_PAGE_SIZE - - sizeof(void *); -#ifdef __SIGNAL_FRAMESIZE - thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE; -#endif - return 0; -} - -__initcall(init_thread_regs); - -int copy_context_skas0(unsigned long new_stack, int pid) -{ - struct timeval tv = { .tv_sec = 0, .tv_usec = UM_USEC_PER_SEC / UM_HZ }; - int err; - unsigned long current_stack = current_stub_stack(); - struct stub_data *data = (struct stub_data *) current_stack; - struct stub_data *child_data = (struct stub_data *) new_stack; - unsigned long long new_offset; - int new_fd = phys_mapping(to_phys((void *)new_stack), &new_offset); - - /* - * prepare offset and fd of child's stack as argument for parent's - * and child's mmap2 calls - */ - *data = ((struct stub_data) { .offset = MMAP_OFFSET(new_offset), - .fd = new_fd, - .timer = ((struct itimerval) - { .it_value = tv, - .it_interval = tv }) }); - - err = ptrace_setregs(pid, thread_regs); - if (err < 0) { - err = -errno; - printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_SETREGS " - "failed, pid = %d, errno = %d\n", pid, -err); - return err; - } - - err = put_fp_registers(pid, thread_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "copy_context_skas0 : put_fp_registers " - "failed, pid = %d, err = %d\n", pid, err); - return err; - } - - /* set a well known return code for detection of child write failure */ - child_data->err = 12345678; - - /* - * Wait, until parent has finished its work: read child's pid from - * parent's stack, and check, if bad result. - */ - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) { - err = -errno; - printk(UM_KERN_ERR "Failed to continue new process, pid = %d, " - "errno = %d\n", pid, errno); - return err; - } - - wait_stub_done(pid); - - pid = data->err; - if (pid < 0) { - printk(UM_KERN_ERR "copy_context_skas0 - stub-parent reports " - "error %d\n", -pid); - return pid; - } - - /* - * Wait, until child has finished too: read child's result from - * child's stack and check it. - */ - wait_stub_done(pid); - if (child_data->err != STUB_DATA) { - printk(UM_KERN_ERR "copy_context_skas0 - stub-child reports " - "error %ld\n", child_data->err); - err = child_data->err; - goto out_kill; - } - - if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, - (void *)PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_OLDSETOPTIONS " - "failed, errno = %d\n", errno); - goto out_kill; - } - - return pid; - - out_kill: - os_kill_ptraced_process(pid, 1); - return err; -} - -/* - * This is used only, if stub pages are needed, while proc_mm is - * available. Opening /proc/mm creates a new mm_context, which lacks - * the stub-pages. Thus, we map them using /proc/mm-fd - */ -int map_stub_pages(int fd, unsigned long code, unsigned long data, - unsigned long stack) -{ - struct proc_mm_op mmop; - int n; - unsigned long long code_offset; - int code_fd = phys_mapping(to_phys((void *) &__syscall_stub_start), - &code_offset); - - mmop = ((struct proc_mm_op) { .op = MM_MMAP, - .u = - { .mmap = - { .addr = code, - .len = UM_KERN_PAGE_SIZE, - .prot = PROT_EXEC, - .flags = MAP_FIXED | MAP_PRIVATE, - .fd = code_fd, - .offset = code_offset - } } }); - CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop))); - if (n != sizeof(mmop)) { - n = errno; - printk(UM_KERN_ERR "mmap args - addr = 0x%lx, fd = %d, " - "offset = %llx\n", code, code_fd, - (unsigned long long) code_offset); - printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for code " - "failed, err = %d\n", n); - return -n; - } - - if (stack) { - unsigned long long map_offset; - int map_fd = phys_mapping(to_phys((void *)stack), &map_offset); - mmop = ((struct proc_mm_op) - { .op = MM_MMAP, - .u = - { .mmap = - { .addr = data, - .len = UM_KERN_PAGE_SIZE, - .prot = PROT_READ | PROT_WRITE, - .flags = MAP_FIXED | MAP_SHARED, - .fd = map_fd, - .offset = map_offset - } } }); - CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop))); - if (n != sizeof(mmop)) { - n = errno; - printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for " - "data failed, err = %d\n", n); - return -n; - } - } - - return 0; -} - void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) { (*buf)[0].JB_IP = (unsigned long) handler; @@ -646,16 +804,17 @@ void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) void switch_threads(jmp_buf *me, jmp_buf *you) { + unscheduled_userspace_iterations = 0; + if (UML_SETJMP(me) == 0) UML_LONGJMP(you, 1); } static jmp_buf initial_jmpbuf; -/* XXX Make these percpu */ -static void (*cb_proc)(void *arg); -static void *cb_arg; -static jmp_buf *cb_back; +static __thread void (*cb_proc)(void *arg); +static __thread void *cb_arg; +static __thread jmp_buf *cb_back; int start_idle_thread(void *stack, jmp_buf *switch_buf) { @@ -674,7 +833,7 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf) n = setjmp(initial_jmpbuf); switch (n) { case INIT_JMP_NEW_THREAD: - (*switch_buf)[0].JB_IP = (unsigned long) new_thread_handler; + (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; (*switch_buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - sizeof(void *); break; @@ -689,11 +848,16 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf) kmalloc_ok = 0; return 1; default: - printk(UM_KERN_ERR "Bad sigsetjmp return in " - "start_idle_thread - %d\n", n); + printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", + __func__, n); fatal_sigsegv(); } longjmp(*switch_buf, 1); + + /* unreachable */ + printk(UM_KERN_ERR "impossible long jump!"); + fatal_sigsegv(); + return 0; } void initial_thread_cb_skas(void (*proc)(void *), void *arg) @@ -704,10 +868,10 @@ void initial_thread_cb_skas(void (*proc)(void *), void *arg) cb_arg = arg; cb_back = &here; - block_signals(); + initial_jmpbuf_lock(); if (UML_SETJMP(&here) == 0) UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); - unblock_signals(); + initial_jmpbuf_unlock(); cb_proc = NULL; cb_arg = NULL; @@ -716,29 +880,29 @@ void initial_thread_cb_skas(void (*proc)(void *), void *arg) void halt_skas(void) { - block_signals(); + initial_jmpbuf_lock(); UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); + /* unreachable */ } -void reboot_skas(void) +static bool noreboot; + +static int __init noreboot_cmd_param(char *str, int *add) { - block_signals(); - UML_LONGJMP(&initial_jmpbuf, INIT_JMP_REBOOT); + *add = 0; + noreboot = true; + return 0; } -void __switch_mm(struct mm_id *mm_idp) -{ - int err; +__uml_setup("noreboot", noreboot_cmd_param, +"noreboot\n" +" Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" +" This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" +" crashes in CI\n\n"); - /* FIXME: need cpu pid in __switch_mm */ - if (proc_mm) { - err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0, - mm_idp->u.mm_fd); - if (err) { - printk(UM_KERN_ERR "__switch_mm - PTRACE_SWITCH_MM " - "failed, errno = %d\n", errno); - fatal_sigsegv(); - } - } - else userspace_pid[0] = mm_idp->u.pid; +void reboot_skas(void) +{ + initial_jmpbuf_lock(); + UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); + /* unreachable */ } diff --git a/arch/um/os-Linux/smp.c b/arch/um/os-Linux/smp.c new file mode 100644 index 000000000000..18d3858a7cd2 --- /dev/null +++ b/arch/um/os-Linux/smp.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + */ + +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <kern_util.h> +#include <um_malloc.h> +#include <init.h> +#include <os.h> +#include <smp.h> +#include "internal.h" + +struct cpu_thread_data { + int cpu; + sigset_t sigset; +}; + +static __thread int __curr_cpu; + +int uml_curr_cpu(void) +{ + return __curr_cpu; +} + +static pthread_t cpu_threads[CONFIG_NR_CPUS]; + +static void *cpu_thread(void *arg) +{ + struct cpu_thread_data *data = arg; + + __curr_cpu = data->cpu; + + uml_start_secondary(data); + + return NULL; +} + +int os_start_cpu_thread(int cpu) +{ + struct cpu_thread_data *data; + sigset_t sigset, oset; + int err; + + data = uml_kmalloc(sizeof(*data), UM_GFP_ATOMIC); + if (!data) + return -ENOMEM; + + sigfillset(&sigset); + if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) { + err = errno; + goto err; + } + + data->cpu = cpu; + data->sigset = oset; + + err = pthread_create(&cpu_threads[cpu], NULL, cpu_thread, data); + if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0) + panic("Failed to restore the signal mask, errno = %d", errno); + if (err != 0) + goto err; + + return 0; + +err: + kfree(data); + return -err; +} + +void os_start_secondary(void *arg, jmp_buf *switch_buf) +{ + struct cpu_thread_data *data = arg; + + sigaddset(&data->sigset, IPI_SIGNAL); + sigaddset(&data->sigset, SIGIO); + + if (sigprocmask(SIG_SETMASK, &data->sigset, NULL) < 0) + panic("Failed to restore the signal mask, errno = %d", errno); + + kfree(data); + longjmp(*switch_buf, 1); + + /* unreachable */ + printk(UM_KERN_ERR "impossible long jump!"); + fatal_sigsegv(); +} + +int os_send_ipi(int cpu, int vector) +{ + union sigval value = { .sival_int = vector }; + + return pthread_sigqueue(cpu_threads[cpu], IPI_SIGNAL, value); +} + +static void __local_ipi_set(int enable) +{ + sigset_t sigset; + + sigemptyset(&sigset); + sigaddset(&sigset, IPI_SIGNAL); + + if (sigprocmask(enable ? SIG_UNBLOCK : SIG_BLOCK, &sigset, NULL) < 0) + panic("%s: sigprocmask failed, errno = %d", __func__, errno); +} + +void os_local_ipi_enable(void) +{ + __local_ipi_set(1); +} + +void os_local_ipi_disable(void) +{ + __local_ipi_set(0); +} + +static void ipi_sig_handler(int sig, siginfo_t *si, void *uc) +{ + int save_errno = errno; + + signals_enabled = 0; + um_trace_signals_off(); + + uml_ipi_handler(si->si_value.sival_int); + + um_trace_signals_on(); + signals_enabled = 1; + + errno = save_errno; +} + +void __init os_init_smp(void) +{ + struct sigaction action = { + .sa_sigaction = ipi_sig_handler, + .sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART, + }; + + sigfillset(&action.sa_mask); + + if (sigaction(IPI_SIGNAL, &action, NULL) < 0) + panic("%s: sigaction failed, errno = %d", __func__, errno); + + cpu_threads[0] = pthread_self(); +} diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 337518c5042a..054ac03bbf5e 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -17,14 +18,24 @@ #include <sys/wait.h> #include <sys/time.h> #include <sys/resource.h> +#include <asm/ldt.h> #include <asm/unistd.h> #include <init.h> #include <os.h> +#include <smp.h> +#include <kern_util.h> #include <mem_user.h> #include <ptrace_user.h> +#include <stdbool.h> +#include <stub-data.h> +#include <sys/prctl.h> +#include <linux/seccomp.h> +#include <linux/filter.h> +#include <sysdep/mcontext.h> +#include <sysdep/stub.h> #include <registers.h> #include <skas.h> -#include <skas_ptrace.h> +#include "internal.h" static void ptrace_child(void) { @@ -95,6 +106,8 @@ static int start_ptraced_child(void) { int pid, n, status; + fflush(stdout); + pid = fork(); if (pid == 0) ptrace_child(); @@ -111,140 +124,32 @@ static int start_ptraced_child(void) return pid; } -/* When testing for SYSEMU support, if it is one of the broken versions, we - * must just avoid using sysemu, not panic, but only if SYSEMU features are - * broken. - * So only for SYSEMU features we test mustpanic, while normal host features - * must work anyway! - */ -static int stop_ptraced_child(int pid, int exitcode, int mustexit) +static void stop_ptraced_child(int pid, int exitcode) { - int status, n, ret = 0; + int status, n; + + if (ptrace(PTRACE_CONT, pid, 0, 0) < 0) + fatal_perror("stop_ptraced_child : ptrace failed"); - if (ptrace(PTRACE_CONT, pid, 0, 0) < 0) { - perror("stop_ptraced_child : ptrace failed"); - return -1; - } CATCH_EINTR(n = waitpid(pid, &status, 0)); if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) { int exit_with = WEXITSTATUS(status); - if (exit_with == 2) - non_fatal("check_ptrace : child exited with status 2. " - "\nDisabling SYSEMU support.\n"); - non_fatal("check_ptrace : child exited with exitcode %d, while " - "expecting %d; status 0x%x\n", exit_with, - exitcode, status); - if (mustexit) - exit(1); - ret = -1; + fatal("stop_ptraced_child : child exited with exitcode %d, " + "while expecting %d; status 0x%x\n", exit_with, + exitcode, status); } - - return ret; -} - -/* Changed only during early boot */ -int ptrace_faultinfo; -static int disable_ptrace_faultinfo; - -int ptrace_ldt; -static int disable_ptrace_ldt; - -int proc_mm; -static int disable_proc_mm; - -int have_switch_mm; -static int disable_switch_mm; - -int skas_needs_stub; - -static int __init skas0_cmd_param(char *str, int* add) -{ - disable_ptrace_faultinfo = 1; - disable_ptrace_ldt = 1; - disable_proc_mm = 1; - disable_switch_mm = 1; - - return 0; } -/* The two __uml_setup would conflict, without this stupid alias. */ - -static int __init mode_skas0_cmd_param(char *str, int* add) - __attribute__((alias("skas0_cmd_param"))); - -__uml_setup("skas0", skas0_cmd_param, -"skas0\n" -" Disables SKAS3 and SKAS4 usage, so that SKAS0 is used\n\n"); - -__uml_setup("mode=skas0", mode_skas0_cmd_param, -"mode=skas0\n" -" Disables SKAS3 and SKAS4 usage, so that SKAS0 is used.\n\n"); - -/* Changed only during early boot */ -static int force_sysemu_disabled = 0; - -static int __init nosysemu_cmd_param(char *str, int* add) -{ - force_sysemu_disabled = 1; - return 0; -} - -__uml_setup("nosysemu", nosysemu_cmd_param, -"nosysemu\n" -" Turns off syscall emulation patch for ptrace (SYSEMU) on.\n" -" SYSEMU is a performance-patch introduced by Laurent Vivier. It changes\n" -" behaviour of ptrace() and helps reducing host context switch rate.\n" -" To make it working, you need a kernel patch for your host, too.\n" -" See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n" -" information.\n\n"); - static void __init check_sysemu(void) { - unsigned long regs[MAX_REG_NR]; int pid, n, status, count=0; - non_fatal("Checking syscall emulation patch for ptrace..."); - sysemu_supported = 0; + os_info("Checking syscall emulation for ptrace..."); pid = start_ptraced_child(); - if (ptrace(PTRACE_SYSEMU, pid, 0, 0) < 0) - goto fail; - - CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); - if (n < 0) - fatal_perror("check_sysemu : wait failed"); - if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) - fatal("check_sysemu : expected SIGTRAP, got status = %d\n", - status); - - if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) - fatal_perror("check_sysemu : PTRACE_GETREGS failed"); - if (PT_SYSCALL_NR(regs) != __NR_getpid) { - non_fatal("check_sysemu got system call number %d, " - "expected %d...", PT_SYSCALL_NR(regs), __NR_getpid); - goto fail; - } - - n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, os_getpid()); - if (n < 0) { - non_fatal("check_sysemu : failed to modify system call " - "return"); - goto fail; - } - - if (stop_ptraced_child(pid, 0, 0) < 0) - goto fail_stopped; - - sysemu_supported = 1; - non_fatal("OK\n"); - set_using_sysemu(!force_sysemu_disabled); - - non_fatal("Checking advanced syscall emulation patch for ptrace..."); - pid = start_ptraced_child(); - - if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0, + if ((ptrace(PTRACE_SETOPTIONS, pid, 0, (void *) PTRACE_O_TRACESYSGOOD) < 0)) - fatal_perror("check_sysemu: PTRACE_OLDSETOPTIONS failed"); + fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed"); while (1) { count++; @@ -277,32 +182,27 @@ static void __init check_sysemu(void) goto fail; } } - if (stop_ptraced_child(pid, 0, 0) < 0) - goto fail_stopped; + stop_ptraced_child(pid, 0); - sysemu_supported = 2; - non_fatal("OK\n"); + os_info("OK\n"); - if (!force_sysemu_disabled) - set_using_sysemu(sysemu_supported); return; fail: - stop_ptraced_child(pid, 1, 0); -fail_stopped: - non_fatal("missing\n"); + stop_ptraced_child(pid, 1); + fatal("missing\n"); } static void __init check_ptrace(void) { int pid, syscall, n, status; - non_fatal("Checking that ptrace can change system call numbers..."); + os_info("Checking that ptrace can change system call numbers..."); pid = start_ptraced_child(); - if ((ptrace(PTRACE_OLDSETOPTIONS, pid, 0, + if ((ptrace(PTRACE_SETOPTIONS, pid, 0, (void *) PTRACE_O_TRACESYSGOOD) < 0)) - fatal_perror("check_ptrace: PTRACE_OLDSETOPTIONS failed"); + fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed"); while (1) { if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) @@ -328,215 +228,268 @@ static void __init check_ptrace(void) break; } } - stop_ptraced_child(pid, 0, 1); - non_fatal("OK\n"); + stop_ptraced_child(pid, 0); + os_info("OK\n"); check_sysemu(); } -extern void check_tmpexec(void); +extern unsigned long host_fp_size; +extern unsigned long exec_regs[MAX_REG_NR]; +extern unsigned long *exec_fp_regs; -static void __init check_coredump_limit(void) +__initdata static struct stub_data *seccomp_test_stub_data; + +static void __init sigsys_handler(int sig, siginfo_t *info, void *p) { - struct rlimit lim; - int err = getrlimit(RLIMIT_CORE, &lim); + ucontext_t *uc = p; - if (err) { - perror("Getting core dump limit"); - return; - } + /* Stow away the location of the mcontext in the stack */ + seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - + (unsigned long)&seccomp_test_stub_data->sigstack[0]; - printf("Core dump limits :\n\tsoft - "); - if (lim.rlim_cur == RLIM_INFINITY) - printf("NONE\n"); - else printf("%lu\n", lim.rlim_cur); + /* Prevent libc from clearing memory (mctx_offset in particular) */ + syscall(__NR_exit, 0); +} - printf("\thard - "); - if (lim.rlim_max == RLIM_INFINITY) - printf("NONE\n"); - else printf("%lu\n", lim.rlim_max); +static int __init seccomp_helper(void *data) +{ + static struct sock_filter filter[] = { + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + }; + static struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + struct sigaction sa; + + /* close_range is needed for the stub */ + if (stub_syscall3(__NR_close_range, 1, ~0U, 0)) + exit(1); + + set_sigstack(seccomp_test_stub_data->sigstack, + sizeof(seccomp_test_stub_data->sigstack)); + + sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; + sa.sa_sigaction = (void *) sigsys_handler; + sa.sa_restorer = NULL; + if (sigaction(SIGSYS, &sa, NULL) < 0) + exit(2); + + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) + exit(3); + + sleep(0); + + /* Never reached. */ + _exit(4); } -void __init os_early_checks(void) +static bool __init init_seccomp(void) { int pid; + int status; + int n; + unsigned long sp; - /* Print out the core dump limits early */ - check_coredump_limit(); + /* + * We check that we can install a seccomp filter and then exit(0) + * from a trapped syscall. + * + * Note that we cannot verify that no seccomp filter already exists + * for a syscall that results in the process/thread to be killed. + */ - check_ptrace(); + os_info("Checking that seccomp filters can be installed..."); - /* Need to check this early because mmapping happens before the - * kernel is running. - */ - check_tmpexec(); + seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANON, 0, 0); - pid = start_ptraced_child(); - if (init_registers(pid)) - fatal("Failed to initialize default registers"); - stop_ptraced_child(pid, 1, 1); -} + /* Use the syscall data area as stack, we just need something */ + sp = (unsigned long)&seccomp_test_stub_data->syscall_data + + sizeof(seccomp_test_stub_data->syscall_data) - + sizeof(void *); + pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); -static int __init noprocmm_cmd_param(char *str, int* add) -{ - disable_proc_mm = 1; - return 0; -} + if (pid < 0) + fatal_perror("check_seccomp : clone failed"); -__uml_setup("noprocmm", noprocmm_cmd_param, -"noprocmm\n" -" Turns off usage of /proc/mm, even if host supports it.\n" -" To support /proc/mm, the host needs to be patched using\n" -" the current skas3 patch.\n\n"); + CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); + if (n < 0) + fatal_perror("check_seccomp : waitpid failed"); -static int __init noptracefaultinfo_cmd_param(char *str, int* add) -{ - disable_ptrace_faultinfo = 1; - return 0; -} + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + struct uml_pt_regs *regs; + unsigned long fp_size; + int r; -__uml_setup("noptracefaultinfo", noptracefaultinfo_cmd_param, -"noptracefaultinfo\n" -" Turns off usage of PTRACE_FAULTINFO, even if host supports\n" -" it. To support PTRACE_FAULTINFO, the host needs to be patched\n" -" using the current skas3 patch.\n\n"); + /* Fill in the host_fp_size from the mcontext. */ + regs = calloc(1, sizeof(struct uml_pt_regs)); + get_stub_state(regs, seccomp_test_stub_data, &fp_size); + host_fp_size = fp_size; + free(regs); -static int __init noptraceldt_cmd_param(char *str, int* add) -{ - disable_ptrace_ldt = 1; - return 0; -} + /* Repeat with the correct size */ + regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); + r = get_stub_state(regs, seccomp_test_stub_data, NULL); -__uml_setup("noptraceldt", noptraceldt_cmd_param, -"noptraceldt\n" -" Turns off usage of PTRACE_LDT, even if host supports it.\n" -" To support PTRACE_LDT, the host needs to be patched using\n" -" the current skas3 patch.\n\n"); + /* Store as the default startup registers */ + exec_fp_regs = malloc(host_fp_size); + memcpy(exec_regs, regs->gp, sizeof(exec_regs)); + memcpy(exec_fp_regs, regs->fp, host_fp_size); -static inline void check_skas3_ptrace_faultinfo(void) -{ - struct ptrace_faultinfo fi; - int pid, n; + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); - non_fatal(" - PTRACE_FAULTINFO..."); - pid = start_ptraced_child(); + free(regs); + + if (r) { + os_info("failed to fetch registers: %d\n", r); + return false; + } - n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi); - if (n < 0) { - if (errno == EIO) - non_fatal("not found\n"); - else - perror("not found"); - } else if (disable_ptrace_faultinfo) - non_fatal("found but disabled on command line\n"); - else { - ptrace_faultinfo = 1; - non_fatal("found\n"); + os_info("OK\n"); + return true; } - stop_ptraced_child(pid, 1, 1); + if (WIFEXITED(status) && WEXITSTATUS(status) == 2) + os_info("missing\n"); + else + os_info("error\n"); + + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); + return false; } -static inline void check_skas3_ptrace_ldt(void) + +static void __init check_coredump_limit(void) { -#ifdef PTRACE_LDT - int pid, n; - unsigned char ldtbuf[40]; - struct ptrace_ldt ldt_op = (struct ptrace_ldt) { - .func = 2, /* read default ldt */ - .ptr = ldtbuf, - .bytecount = sizeof(ldtbuf)}; - - non_fatal(" - PTRACE_LDT..."); - pid = start_ptraced_child(); + struct rlimit lim; + int err = getrlimit(RLIMIT_CORE, &lim); - n = ptrace(PTRACE_LDT, pid, 0, (unsigned long) &ldt_op); - if (n < 0) { - if (errno == EIO) - non_fatal("not found\n"); - else - perror("not found"); - } else if (disable_ptrace_ldt) - non_fatal("found, but use is disabled\n"); - else { - ptrace_ldt = 1; - non_fatal("found\n"); + if (err) { + perror("Getting core dump limit"); + return; } - stop_ptraced_child(pid, 1, 1); -#endif + os_info("Core dump limits :\n\tsoft - "); + if (lim.rlim_cur == RLIM_INFINITY) + os_info("NONE\n"); + else + os_info("%llu\n", (unsigned long long)lim.rlim_cur); + + os_info("\thard - "); + if (lim.rlim_max == RLIM_INFINITY) + os_info("NONE\n"); + else + os_info("%llu\n", (unsigned long long)lim.rlim_max); } -static inline void check_skas3_proc_mm(void) +void __init get_host_cpu_features( + void (*flags_helper_func)(char *line), + void (*cache_helper_func)(char *line)) { - non_fatal(" - /proc/mm..."); - if (access("/proc/mm", W_OK) < 0) - perror("not found"); - else if (disable_proc_mm) - non_fatal("found but disabled on command line\n"); - else { - proc_mm = 1; - non_fatal("found\n"); + FILE *cpuinfo; + char *line = NULL; + size_t len = 0; + int done_parsing = 0; + + cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo == NULL) { + os_info("Failed to get host CPU features\n"); + } else { + while ((getline(&line, &len, cpuinfo)) != -1) { + if (strstr(line, "flags")) { + flags_helper_func(line); + done_parsing++; + } + if (strstr(line, "cache_alignment")) { + cache_helper_func(line); + done_parsing++; + } + free(line); + line = NULL; + if (done_parsing > 1) + break; + } + fclose(cpuinfo); } } -void can_do_skas(void) -{ - non_fatal("Checking for the skas3 patch in the host:\n"); +static int seccomp_config __initdata; - check_skas3_proc_mm(); - check_skas3_ptrace_faultinfo(); - check_skas3_ptrace_ldt(); +static int __init uml_seccomp_config(char *line, int *add) +{ + *add = 0; + + if (strcmp(line, "off") == 0) + seccomp_config = 0; + else if (strcmp(line, "auto") == 0) + seccomp_config = 1; + else if (strcmp(line, "on") == 0) + seccomp_config = 2; + else + fatal("Invalid seccomp option '%s', expected on/auto/off\n", + line); - if (!proc_mm || !ptrace_faultinfo || !ptrace_ldt) - skas_needs_stub = 1; + return 0; } -int __init parse_iomem(char *str, int *add) +__uml_setup("seccomp=", uml_seccomp_config, +"seccomp=<on/auto/off>\n" +" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" +" processes work collaboratively with the kernel instead of being\n" +" traced using ptrace. All syscalls from the application are caught and\n" +" redirected using a signal. This signal handler in turn is permitted to\n" +" do the selected set of syscalls to communicate with the UML kernel and\n" +" do the required memory management.\n" +"\n" +" This method is overall faster than the ptrace based userspace, primarily\n" +" because it reduces the number of context switches for (minor) page faults.\n" +"\n" +" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" +" userspace from reading and writing all physical memory. Userspace\n" +" processes could also trick the stub into disabling SIGALRM which\n" +" prevents it from being interrupted for scheduling purposes.\n" +"\n" +" This is insecure and should only be used with a trusted userspace\n\n" +); + +void __init os_early_checks(void) { - struct iomem_region *new; - struct stat64 buf; - char *file, *driver; - int fd, size; - - driver = str; - file = strchr(str,','); - if (file == NULL) { - fprintf(stderr, "parse_iomem : failed to parse iomem\n"); - goto out; - } - *file = '\0'; - file++; - fd = open(file, O_RDWR, 0); - if (fd < 0) { - perror("parse_iomem - Couldn't open io file"); - goto out; - } + int pid; - if (fstat64(fd, &buf) < 0) { - perror("parse_iomem - cannot stat_fd file"); - goto out_close; - } + /* Print out the core dump limits early */ + check_coredump_limit(); + + /* Need to check this early because mmapping happens before the + * kernel is running. + */ + check_tmpexec(); + + if (seccomp_config) { + if (init_seccomp()) { + using_seccomp = 1; + return; + } - new = malloc(sizeof(*new)); - if (new == NULL) { - perror("Couldn't allocate iomem_region struct"); - goto out_close; + if (seccomp_config == 2) + fatal("SECCOMP userspace requested but not functional!\n"); } - size = (buf.st_size + UM_KERN_PAGE_SIZE) & ~(UM_KERN_PAGE_SIZE - 1); + if (uml_ncpus > 1) + fatal("SMP is not supported with PTRACE userspace.\n"); - *new = ((struct iomem_region) { .next = iomem_regions, - .driver = driver, - .fd = fd, - .size = size, - .phys = 0, - .virt = 0 }); - iomem_regions = new; - iomem_size += new->size + UM_KERN_PAGE_SIZE; + using_seccomp = 0; + check_ptrace(); - return 0; - out_close: - close(fd); - out: - return 1; + pid = start_ptraced_child(); + if (init_pid_registers(pid)) + fatal("Failed to initialize default registers"); + stop_ptraced_child(pid, 1); } diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index e9824d5dd7d5..13ebc86918d4 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -1,186 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2012-2014 Cisco Systems * Copyright (C) 2000 - 2007 Jeff Dike (jdike{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stddef.h> +#include <unistd.h> #include <errno.h> #include <signal.h> #include <time.h> +#include <sys/signalfd.h> #include <sys/time.h> #include <kern_util.h> #include <os.h> +#include <smp.h> +#include <string.h> #include "internal.h" -int set_interval(void) -{ - int usec = UM_USEC_PER_SEC / UM_HZ; - struct itimerval interval = ((struct itimerval) { { 0, usec }, - { 0, usec } }); - - if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1) - return -errno; +static timer_t event_high_res_timer[CONFIG_NR_CPUS] = { 0 }; - return 0; +static inline long long timespec_to_ns(const struct timespec *ts) +{ + return ((long long) ts->tv_sec * UM_NSEC_PER_SEC) + ts->tv_nsec; } -int timer_one_shot(int ticks) +long long os_persistent_clock_emulation(void) { - unsigned long usec = ticks * UM_USEC_PER_SEC / UM_HZ; - unsigned long sec = usec / UM_USEC_PER_SEC; - struct itimerval interval; - - usec %= UM_USEC_PER_SEC; - interval = ((struct itimerval) { { 0, 0 }, { sec, usec } }); + struct timespec realtime_tp; - if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1) - return -errno; - - return 0; + clock_gettime(CLOCK_REALTIME, &realtime_tp); + return timespec_to_ns(&realtime_tp); } +#ifndef sigev_notify_thread_id +#define sigev_notify_thread_id _sigev_un._tid +#endif + /** - * timeval_to_ns - Convert timeval to nanoseconds - * @ts: pointer to the timeval variable to be converted - * - * Returns the scalar nanosecond representation of the timeval - * parameter. - * - * Ripped from linux/time.h because it's a kernel header, and thus - * unusable from here. + * os_timer_create() - create an new posix (interval) timer */ -static inline long long timeval_to_ns(const struct timeval *tv) +int os_timer_create(void) { - return ((long long) tv->tv_sec * UM_NSEC_PER_SEC) + - tv->tv_usec * UM_NSEC_PER_USEC; + int cpu = uml_curr_cpu(); + timer_t *t = &event_high_res_timer[cpu]; + struct sigevent sev = { + .sigev_notify = SIGEV_THREAD_ID, + .sigev_signo = SIGALRM, + .sigev_value.sival_ptr = t, + .sigev_notify_thread_id = gettid(), + }; + + if (timer_create(CLOCK_MONOTONIC, &sev, t) == -1) + return -1; + + return 0; } -long long disable_timer(void) +int os_timer_set_interval(int cpu, unsigned long long nsecs) { - struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } }); - long long remain, max = UM_NSEC_PER_SEC / UM_HZ; + struct itimerspec its; + + its.it_value.tv_sec = nsecs / UM_NSEC_PER_SEC; + its.it_value.tv_nsec = nsecs % UM_NSEC_PER_SEC; - if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0) - printk(UM_KERN_ERR "disable_timer - setitimer failed, " - "errno = %d\n", errno); + its.it_interval.tv_sec = nsecs / UM_NSEC_PER_SEC; + its.it_interval.tv_nsec = nsecs % UM_NSEC_PER_SEC; - remain = timeval_to_ns(&time.it_value); - if (remain > max) - remain = max; + if (timer_settime(event_high_res_timer[cpu], 0, &its, NULL) == -1) + return -errno; - return remain; + return 0; } -long long os_nsecs(void) +int os_timer_one_shot(int cpu, unsigned long long nsecs) { - struct timeval tv; + struct itimerspec its = { + .it_value.tv_sec = nsecs / UM_NSEC_PER_SEC, + .it_value.tv_nsec = nsecs % UM_NSEC_PER_SEC, - gettimeofday(&tv, NULL); - return timeval_to_ns(&tv); -} + .it_interval.tv_sec = 0, + .it_interval.tv_nsec = 0, // we cheat here + }; -#ifdef UML_CONFIG_NO_HZ_COMMON -static int after_sleep_interval(struct timespec *ts) -{ + timer_settime(event_high_res_timer[cpu], 0, &its, NULL); return 0; } -static void deliver_alarm(void) +/** + * os_timer_disable() - disable the posix (interval) timer + * @cpu: the CPU for which the timer is to be disabled + */ +void os_timer_disable(int cpu) { - alarm_handler(SIGVTALRM, NULL, NULL); -} + struct itimerspec its; -static unsigned long long sleep_time(unsigned long long nsecs) -{ - return nsecs; + memset(&its, 0, sizeof(struct itimerspec)); + timer_settime(event_high_res_timer[cpu], 0, &its, NULL); } -#else -unsigned long long last_tick; -unsigned long long skew; - -static void deliver_alarm(void) +long long os_nsecs(void) { - unsigned long long this_tick = os_nsecs(); - int one_tick = UM_NSEC_PER_SEC / UM_HZ; - - /* Protection against the host's time going backwards */ - if ((last_tick != 0) && (this_tick < last_tick)) - this_tick = last_tick; - - if (last_tick == 0) - last_tick = this_tick - one_tick; - - skew += this_tick - last_tick; - - while (skew >= one_tick) { - alarm_handler(SIGVTALRM, NULL, NULL); - skew -= one_tick; - } + struct timespec ts; - last_tick = this_tick; + clock_gettime(CLOCK_MONOTONIC,&ts); + return timespec_to_ns(&ts); } -static unsigned long long sleep_time(unsigned long long nsecs) -{ - return nsecs > skew ? nsecs - skew : 0; -} +static __thread int wake_signals; -static inline long long timespec_to_us(const struct timespec *ts) +void os_idle_prepare(void) { - return ((long long) ts->tv_sec * UM_USEC_PER_SEC) + - ts->tv_nsec / UM_NSEC_PER_USEC; -} + sigset_t set; -static int after_sleep_interval(struct timespec *ts) -{ - int usec = UM_USEC_PER_SEC / UM_HZ; - long long start_usecs = timespec_to_us(ts); - struct timeval tv; - struct itimerval interval; + sigemptyset(&set); + sigaddset(&set, SIGALRM); + sigaddset(&set, IPI_SIGNAL); /* - * It seems that rounding can increase the value returned from - * setitimer to larger than the one passed in. Over time, - * this will cause the remaining time to be greater than the - * tick interval. If this happens, then just reduce the first - * tick to the interval value. + * We need to use signalfd rather than sigsuspend in idle sleep + * because the IPI signal is a real-time signal that carries data, + * and unlike handling SIGALRM, we cannot simply flag it in + * signals_pending. */ - if (start_usecs > usec) - start_usecs = usec; - - start_usecs -= skew / UM_NSEC_PER_USEC; - if (start_usecs < 0) - start_usecs = 0; - - tv = ((struct timeval) { .tv_sec = start_usecs / UM_USEC_PER_SEC, - .tv_usec = start_usecs % UM_USEC_PER_SEC }); - interval = ((struct itimerval) { { 0, usec }, tv }); - - if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1) - return -errno; - - return 0; + wake_signals = signalfd(-1, &set, SFD_CLOEXEC); + if (wake_signals < 0) + panic("Failed to create signal FD, errno = %d", errno); } -#endif -void idle_sleep(unsigned long long nsecs) +/** + * os_idle_sleep() - sleep until interrupted + */ +void os_idle_sleep(void) { - struct timespec ts; + sigset_t set; /* - * nsecs can come in as zero, in which case, this starts a - * busy loop. To prevent this, reset nsecs to the tick - * interval if it is zero. + * Block SIGALRM while performing the need_resched check. + * Note that, because IRQs are disabled, the IPI signal is + * already blocked. */ - if (nsecs == 0) - nsecs = UM_NSEC_PER_SEC / UM_HZ; + sigemptyset(&set); + sigaddset(&set, SIGALRM); + sigprocmask(SIG_BLOCK, &set, NULL); - nsecs = sleep_time(nsecs); - ts = ((struct timespec) { .tv_sec = nsecs / UM_NSEC_PER_SEC, - .tv_nsec = nsecs % UM_NSEC_PER_SEC }); + /* + * Because disabling IRQs does not block SIGALRM, it is also + * necessary to check for any pending timer alarms. + */ + if (!uml_need_resched() && !timer_alarm_pending()) + os_poll(1, &wake_signals); - if (nanosleep(&ts, &ts) == 0) - deliver_alarm(); - after_sleep_interval(&ts); + /* Restore the signal mask. */ + sigprocmask(SIG_UNBLOCK, &set, NULL); } diff --git a/arch/um/os-Linux/tty.c b/arch/um/os-Linux/tty.c index 721d8afa329b..f784db83e026 100644 --- a/arch/um/os-Linux/tty.c +++ b/arch/um/os-Linux/tty.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdlib.h> diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index c1dc89261f67..eb523ab1e218 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ #include <stdio.h> @@ -35,11 +35,12 @@ static int __init make_uml_dir(void) err = -ENOENT; if (home == NULL) { - printk(UM_KERN_ERR "make_uml_dir : no value in " - "environment for $HOME\n"); + printk(UM_KERN_ERR + "%s: no value in environment for $HOME\n", + __func__); goto err; } - strlcpy(dir, home, sizeof(dir)); + strscpy(dir, home); uml_dir++; } strlcat(dir, uml_dir, sizeof(dir)); @@ -50,13 +51,15 @@ static int __init make_uml_dir(void) err = -ENOMEM; uml_dir = malloc(strlen(dir) + 1); if (uml_dir == NULL) { - printf("make_uml_dir : malloc failed, errno = %d\n", errno); + printk(UM_KERN_ERR "%s : malloc failed, errno = %d\n", + __func__, errno); goto err; } strcpy(uml_dir, dir); if ((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)) { - printf("Failed to mkdir '%s': %s\n", uml_dir, strerror(errno)); + printk(UM_KERN_ERR "Failed to mkdir '%s': %s\n", + uml_dir, strerror(errno)); err = -errno; goto err_free; } @@ -94,7 +97,7 @@ static int remove_files_and_dir(char *dir) while ((ent = readdir(directory)) != NULL) { if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) continue; - len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1; + len = strlen(dir) + strlen("/") + strlen(ent->d_name) + 1; if (len > sizeof(file)) { ret = -E2BIG; goto out; @@ -132,18 +135,16 @@ out: */ static inline int is_umdir_used(char *dir) { - char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; - char pid[sizeof("nnnnn\0")], *end; - int dead, fd, p, n, err; - - n = snprintf(file, sizeof(file), "%s/pid", dir); - if (n >= sizeof(file)) { - printk(UM_KERN_ERR "is_umdir_used - pid filename too long\n"); - err = -E2BIG; - goto out; - } + char pid[sizeof("nnnnnnnnn")], *end, *file; + int fd, p, n, err; + size_t filelen = strlen(dir) + sizeof("/pid") + 1; + + file = malloc(filelen); + if (!file) + return -ENOMEM; + + snprintf(file, filelen, "%s/pid", dir); - dead = 0; fd = open(file, O_RDONLY); if (fd < 0) { fd = -errno; @@ -182,6 +183,7 @@ static inline int is_umdir_used(char *dir) out_close: close(fd); out: + free(file); return 0; } @@ -207,18 +209,22 @@ static int umdir_take_if_dead(char *dir) static void __init create_pid_file(void) { - char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; - char pid[sizeof("nnnnn\0")]; + char pid[sizeof("nnnnnnnnn")], *file; int fd, n; - if (umid_file_name("pid", file, sizeof(file))) + n = strlen(uml_dir) + UMID_LEN + sizeof("/pid"); + file = malloc(n); + if (!file) return; + if (umid_file_name("pid", file, n)) + goto out; + fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644); if (fd < 0) { printk(UM_KERN_ERR "Open of machine pid file \"%s\" failed: " "%s\n", file, strerror(errno)); - return; + goto out; } snprintf(pid, sizeof(pid), "%d\n", getpid()); @@ -228,6 +234,8 @@ static void __init create_pid_file(void) errno); close(fd); +out: + free(file); } int __init set_umid(char *name) @@ -235,7 +243,7 @@ int __init set_umid(char *name) if (strlen(name) > UMID_LEN - 1) return -E2BIG; - strlcpy(umid, name, sizeof(umid)); + strscpy(umid, name); return 0; } @@ -254,7 +262,7 @@ static int __init make_umid(void) make_uml_dir(); if (*umid == '\0') { - strlcpy(tmp, uml_dir, sizeof(tmp)); + strscpy(tmp, uml_dir); strlcat(tmp, "XXXXXX", sizeof(tmp)); fd = mkstemp(tmp); if (fd < 0) { @@ -350,8 +358,10 @@ char *get_umid(void) static int __init set_uml_dir(char *name, int *add) { + *add = 0; + if (*name == '\0') { - printf("uml_dir can't be an empty string\n"); + os_warn("uml_dir can't be an empty string\n"); return 0; } @@ -362,7 +372,7 @@ static int __init set_uml_dir(char *name, int *add) uml_dir = malloc(strlen(name) + 2); if (uml_dir == NULL) { - printf("Failed to malloc uml_dir - error = %d\n", errno); + os_warn("Failed to malloc uml_dir - error = %d\n", errno); /* * Return 0 here because do_initcalls doesn't look at @@ -382,13 +392,19 @@ __uml_setup("uml_dir=", set_uml_dir, static void remove_umid_dir(void) { - char dir[strlen(uml_dir) + UMID_LEN + 1], err; + char *dir, err; + + dir = malloc(strlen(uml_dir) + UMID_LEN + 1); + if (!dir) + return; sprintf(dir, "%s%s", uml_dir, umid); err = remove_files_and_dir(dir); if (err) - printf("remove_umid_dir - remove_files_and_dir failed with " - "err = %d\n", err); + os_warn("%s - remove_files_and_dir failed with err = %d\n", + __func__, err); + + free(dir); } __uml_exitcall(remove_umid_dir); diff --git a/arch/um/os-Linux/user_syms.c b/arch/um/os-Linux/user_syms.c index db4a034aeee1..67f6112318b6 100644 --- a/arch/um/os-Linux/user_syms.c +++ b/arch/um/os-Linux/user_syms.c @@ -1,120 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +#define __NO_FORTIFY #include <linux/types.h> #include <linux/module.h> -/* Some of this are builtin function (some are not but could in the future), - * so I *must* declare good prototypes for them and then EXPORT them. - * The kernel code uses the macro defined by include/linux/string.h, - * so I undef macros; the userspace code does not include that and I - * add an EXPORT for the glibc one. +/* + * This file exports some critical string functions and compiler + * built-in functions (where calls are emitted by the compiler + * itself that we cannot avoid even in kernel code) to modules. + * + * "_user.c" code that previously used exports here such as hostfs + * really should be considered part of the 'hypervisor' and define + * its own API boundary like hostfs does now; don't add exports to + * this file for such cases. */ -#undef strlen -#undef strstr -#undef memcpy -#undef memset - -extern size_t strlen(const char *); -extern void *memmove(void *, const void *, size_t); -extern void *memset(void *, int, size_t); -extern int printf(const char *, ...); - /* If it's not defined, the export is included in lib/string.c.*/ #ifdef __HAVE_ARCH_STRSTR +#undef strstr EXPORT_SYMBOL(strstr); #endif #ifndef __x86_64__ +#undef memcpy extern void *memcpy(void *, const void *, size_t); EXPORT_SYMBOL(memcpy); -#endif - +extern void *memmove(void *, const void *, size_t); EXPORT_SYMBOL(memmove); +#undef memset +extern void *memset(void *, int, size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(printf); - -/* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms. - * However, the modules will use the CRC defined *here*, no matter if it is - * good; so the versions of these symbols will always match - */ -#define EXPORT_SYMBOL_PROTO(sym) \ - int sym(void); \ - EXPORT_SYMBOL(sym); - -extern void readdir64(void) __attribute__((weak)); -EXPORT_SYMBOL(readdir64); -extern void truncate64(void) __attribute__((weak)); -EXPORT_SYMBOL(truncate64); - -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA -EXPORT_SYMBOL(vsyscall_ehdr); -EXPORT_SYMBOL(vsyscall_end); #endif -EXPORT_SYMBOL_PROTO(__errno_location); - -EXPORT_SYMBOL_PROTO(access); -EXPORT_SYMBOL_PROTO(open); -EXPORT_SYMBOL_PROTO(open64); -EXPORT_SYMBOL_PROTO(close); -EXPORT_SYMBOL_PROTO(read); -EXPORT_SYMBOL_PROTO(write); -EXPORT_SYMBOL_PROTO(dup2); -EXPORT_SYMBOL_PROTO(__xstat); -EXPORT_SYMBOL_PROTO(__lxstat); -EXPORT_SYMBOL_PROTO(__lxstat64); -EXPORT_SYMBOL_PROTO(__fxstat64); -EXPORT_SYMBOL_PROTO(lseek); -EXPORT_SYMBOL_PROTO(lseek64); -EXPORT_SYMBOL_PROTO(chown); -EXPORT_SYMBOL_PROTO(fchown); -EXPORT_SYMBOL_PROTO(truncate); -EXPORT_SYMBOL_PROTO(ftruncate64); -EXPORT_SYMBOL_PROTO(utime); -EXPORT_SYMBOL_PROTO(utimes); -EXPORT_SYMBOL_PROTO(futimes); -EXPORT_SYMBOL_PROTO(chmod); -EXPORT_SYMBOL_PROTO(fchmod); -EXPORT_SYMBOL_PROTO(rename); -EXPORT_SYMBOL_PROTO(__xmknod); - -EXPORT_SYMBOL_PROTO(symlink); -EXPORT_SYMBOL_PROTO(link); -EXPORT_SYMBOL_PROTO(unlink); -EXPORT_SYMBOL_PROTO(readlink); - -EXPORT_SYMBOL_PROTO(mkdir); -EXPORT_SYMBOL_PROTO(rmdir); -EXPORT_SYMBOL_PROTO(opendir); -EXPORT_SYMBOL_PROTO(readdir); -EXPORT_SYMBOL_PROTO(closedir); -EXPORT_SYMBOL_PROTO(seekdir); -EXPORT_SYMBOL_PROTO(telldir); - -EXPORT_SYMBOL_PROTO(ioctl); - -EXPORT_SYMBOL_PROTO(pread64); -EXPORT_SYMBOL_PROTO(pwrite64); - -EXPORT_SYMBOL_PROTO(statfs); -EXPORT_SYMBOL_PROTO(statfs64); - -EXPORT_SYMBOL_PROTO(getuid); - -EXPORT_SYMBOL_PROTO(fsync); -EXPORT_SYMBOL_PROTO(fdatasync); - -EXPORT_SYMBOL_PROTO(lstat64); -EXPORT_SYMBOL_PROTO(fstat64); -EXPORT_SYMBOL_PROTO(mknod); - -/* Export symbols used by GCC for the stack protector. */ -extern void __stack_smash_handler(void *) __attribute__((weak)); -EXPORT_SYMBOL(__stack_smash_handler); - -extern long __guard __attribute__((weak)); -EXPORT_SYMBOL(__guard); - #ifdef _FORTIFY_SOURCE -extern int __sprintf_chk(char *str, int flag, size_t strlen, const char *format); +extern int __sprintf_chk(char *str, int flag, size_t len, const char *format); EXPORT_SYMBOL(__sprintf_chk); #endif diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 492ef5e6e166..e3ad71a0d13c 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -1,8 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL */ +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -10,15 +11,16 @@ #include <signal.h> #include <string.h> #include <termios.h> -#include <wait.h> +#include <sys/wait.h> #include <sys/mman.h> #include <sys/utsname.h> +#include <sys/random.h> +#include <init.h> #include <os.h> void stack_protections(unsigned long address) { - if (mprotect((void *) address, UM_THREAD_SIZE, - PROT_READ | PROT_WRITE | PROT_EXEC) < 0) + if (mprotect((void *) address, UM_THREAD_SIZE, PROT_READ | PROT_WRITE) < 0) panic("protecting stack failed, errno = %d", errno); } @@ -49,8 +51,8 @@ void setup_machinename(char *machine_out) struct utsname host; uname(&host); -#ifdef UML_CONFIG_UML_X86 -# ifndef UML_CONFIG_64BIT +#if IS_ENABLED(CONFIG_UML_X86) +# if !IS_ENABLED(CONFIG_64BIT) if (!strcmp(host.machine, "x86_64")) { strcpy(machine_out, "i686"); return; @@ -94,6 +96,21 @@ static inline void __attribute__ ((noreturn)) uml_abort(void) exit(127); } +ssize_t os_getrandom(void *buf, size_t len, unsigned int flags) +{ + return getrandom(buf, len, flags); +} + +/* + * UML helper threads must not handle SIGWINCH/INT/TERM + */ +void os_fix_helper_signals(void) +{ + signal(SIGWINCH, SIG_IGN); + signal(SIGINT, SIG_DFL); + signal(SIGTERM, SIG_DFL); +} + void os_dump_core(void) { int pid; @@ -142,3 +159,51 @@ void um_early_printk(const char *s, unsigned int n) { printf("%.*s", n, s); } + +static int quiet_info; + +static int __init quiet_cmd_param(char *str, int *add) +{ + quiet_info = 1; + return 0; +} + +__uml_setup("quiet", quiet_cmd_param, +"quiet\n" +" Turns off information messages during boot.\n\n"); + +/* + * The os_info/os_warn functions will be called by helper threads. These + * have a very limited stack size and using the libc formatting functions + * may overflow the stack. + * So pull in the kernel vscnprintf and use that instead with a fixed + * on-stack buffer. + */ +int vscnprintf(char *buf, size_t size, const char *fmt, va_list args); + +void os_info(const char *fmt, ...) +{ + char buf[256]; + va_list list; + int len; + + if (quiet_info) + return; + + va_start(list, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, list); + fwrite(buf, len, 1, stderr); + va_end(list); +} + +void os_warn(const char *fmt, ...) +{ + char buf[256]; + va_list list; + int len; + + va_start(list, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, list); + fwrite(buf, len, 1, stderr); + va_end(list); +} diff --git a/arch/um/scripts/Makefile.rules b/arch/um/scripts/Makefile.rules index 15889df9b466..a8b7d9dab0a6 100644 --- a/arch/um/scripts/Makefile.rules +++ b/arch/um/scripts/Makefile.rules @@ -1,10 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 # =========================================================================== # arch/um: Generic definitions # =========================================================================== USER_SINGLE_OBJS := \ - $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs)) -USER_OBJS += $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) + $(foreach f,$(patsubst %.o,%,$(obj-y)),$($(f)-objs)) +USER_OBJS += $(filter %_user.o,$(obj-y) $(USER_SINGLE_OBJS)) USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) $(USER_OBJS:.o=.%): \ diff --git a/arch/um/sys-ia64/Makefile b/arch/um/sys-ia64/Makefile deleted file mode 100644 index d02f4c265232..000000000000 --- a/arch/um/sys-ia64/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -OBJ = built-in.o - -OBJS = - -all: $(OBJ) - -$(OBJ): $(OBJS) - rm -f $@ - $(LD) $(LINKFLAGS) --start-group $^ --end-group -o $@ - -clean-files := $(OBJS) link.ld diff --git a/arch/um/sys-ia64/sysdep/ptrace.h b/arch/um/sys-ia64/sysdep/ptrace.h deleted file mode 100644 index 0f0f4e6fd334..000000000000 --- a/arch/um/sys-ia64/sysdep/ptrace.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSDEP_IA64_PTRACE_H -#define __SYSDEP_IA64_PTRACE_H - -struct sys_pt_regs { - int foo; -}; - -#define EMPTY_REGS { 0 } - -#endif - diff --git a/arch/um/sys-ia64/sysdep/sigcontext.h b/arch/um/sys-ia64/sysdep/sigcontext.h deleted file mode 100644 index 76b43161e779..000000000000 --- a/arch/um/sys-ia64/sysdep/sigcontext.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSDEP_IA64_SIGCONTEXT_H -#define __SYSDEP_IA64_SIGCONTEXT_H - -#endif - diff --git a/arch/um/sys-ia64/sysdep/skas_ptrace.h b/arch/um/sys-ia64/sysdep/skas_ptrace.h deleted file mode 100644 index 25a38e715702..000000000000 --- a/arch/um/sys-ia64/sysdep/skas_ptrace.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSDEP_IA64_SKAS_PTRACE_H -#define __SYSDEP_IA64_SKAS_PTRACE_H - -struct ptrace_faultinfo { - int is_write; - unsigned long addr; -}; - -struct ptrace_ldt { - int func; - void *ptr; - unsigned long bytecount; -}; - -#define PTRACE_LDT 54 - -#endif diff --git a/arch/um/sys-ia64/sysdep/syscalls.h b/arch/um/sys-ia64/sysdep/syscalls.h deleted file mode 100644 index 5f6700c41558..000000000000 --- a/arch/um/sys-ia64/sysdep/syscalls.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSDEP_IA64_SYSCALLS_H -#define __SYSDEP_IA64_SYSCALLS_H - -#endif - diff --git a/arch/um/sys-ppc/Makefile b/arch/um/sys-ppc/Makefile deleted file mode 100644 index 20d363bd7004..000000000000 --- a/arch/um/sys-ppc/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -OBJ = built-in.o - -.S.o: - $(CC) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o - -OBJS = ptrace.o sigcontext.o checksum.o miscthings.o misc.o \ - ptrace_user.o sysrq.o - -asflags-y := -DCONFIG_PPC32 -I. -I$(srctree)/arch/ppc/kernel - -all: $(OBJ) - -$(OBJ): $(OBJS) - rm -f $@ - $(LD) $(LINKFLAGS) --start-group $^ --end-group -o $@ - -ptrace_user.o: ptrace_user.c - $(CC) -D__KERNEL__ $(USER_CFLAGS) $(ccflags-y) -c -o $@ $< - -sigcontext.o: sigcontext.c - $(CC) $(USER_CFLAGS) $(ccflags-y) -c -o $@ $< - -checksum.S: - rm -f $@ - ln -s $(srctree)/arch/ppc/lib/$@ $@ - -mk_defs.c: - rm -f $@ - ln -s $(srctree)/arch/ppc/kernel/$@ $@ - -ppc_defs.head: - rm -f $@ - ln -s $(srctree)/arch/ppc/kernel/$@ $@ - -ppc_defs.h: mk_defs.c ppc_defs.head \ - $(srctree)/include/asm-ppc/mmu.h \ - $(srctree)/include/asm-ppc/processor.h \ - $(srctree)/include/asm-ppc/pgtable.h \ - $(srctree)/include/asm-ppc/ptrace.h -# $(CC) $(CFLAGS) -S mk_defs.c - cp ppc_defs.head ppc_defs.h -# for bk, this way we can write to the file even if it's not checked out - echo '#define THREAD 608' >> ppc_defs.h - echo '#define PT_REGS 8' >> ppc_defs.h - echo '#define CLONE_VM 256' >> ppc_defs.h -# chmod u+w ppc_defs.h -# grep '^#define' mk_defs.s >> ppc_defs.h -# rm mk_defs.s - -# the asm link is horrible, and breaks the other targets. This is also -# not going to work with parallel makes. - -checksum.o: checksum.S - rm -f asm - ln -s $(srctree)/include/asm-ppc asm - $(CC) $(asflags-y) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o - rm -f asm - -misc.o: misc.S ppc_defs.h - rm -f asm - ln -s $(srctree)/include/asm-ppc asm - $(CC) $(asflags-y) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o - rm -f asm - -clean-files := $(OBJS) ppc_defs.h checksum.S mk_defs.c diff --git a/arch/um/sys-ppc/asm/archparam.h b/arch/um/sys-ppc/asm/archparam.h deleted file mode 100644 index 4269d8a37b4f..000000000000 --- a/arch/um/sys-ppc/asm/archparam.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __UM_ARCHPARAM_PPC_H -#define __UM_ARCHPARAM_PPC_H - -/********* Bits for asm-um/string.h **********/ - -#define __HAVE_ARCH_STRRCHR - -#endif diff --git a/arch/um/sys-ppc/asm/elf.h b/arch/um/sys-ppc/asm/elf.h deleted file mode 100644 index 8aacaf56508d..000000000000 --- a/arch/um/sys-ppc/asm/elf.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __UM_ELF_PPC_H -#define __UM_ELF_PPC_H - - -extern long elf_aux_hwcap; -#define ELF_HWCAP (elf_aux_hwcap) - -#define SET_PERSONALITY(ex) do ; while(0) - -#define ELF_EXEC_PAGESIZE 4096 - -#define elf_check_arch(x) (1) - -#ifdef CONFIG_64BIT -#define ELF_CLASS ELFCLASS64 -#else -#define ELF_CLASS ELFCLASS32 -#endif - -#define R_386_NONE 0 -#define R_386_32 1 -#define R_386_PC32 2 -#define R_386_GOT32 3 -#define R_386_PLT32 4 -#define R_386_COPY 5 -#define R_386_GLOB_DAT 6 -#define R_386_JMP_SLOT 7 -#define R_386_RELATIVE 8 -#define R_386_GOTOFF 9 -#define R_386_GOTPC 10 -#define R_386_NUM 11 - -#define ELF_PLATFORM (0) - -#define ELF_ET_DYN_BASE (0x08000000) - -/* the following stolen from asm-ppc/elf.h */ -#define ELF_NGREG 48 /* includes nip, msr, lr, etc. */ -#define ELF_NFPREG 33 /* includes fpscr */ -/* General registers */ -typedef unsigned long elf_greg_t; -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -/* Floating point registers */ -typedef double elf_fpreg_t; -typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; - -#define ELF_DATA ELFDATA2MSB -#define ELF_ARCH EM_PPC - -#endif diff --git a/arch/um/sys-ppc/asm/processor.h b/arch/um/sys-ppc/asm/processor.h deleted file mode 100644 index 959323151229..000000000000 --- a/arch/um/sys-ppc/asm/processor.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __UM_PROCESSOR_PPC_H -#define __UM_PROCESSOR_PPC_H - -#if defined(__ASSEMBLY__) - -#define CONFIG_PPC_MULTIPLATFORM -#include "arch/processor.h" - -#else - -#include "asm/processor-generic.h" - -#endif - -#endif diff --git a/arch/um/sys-ppc/misc.S b/arch/um/sys-ppc/misc.S deleted file mode 100644 index 1364b7da578c..000000000000 --- a/arch/um/sys-ppc/misc.S +++ /dev/null @@ -1,111 +0,0 @@ -/* - * This file contains miscellaneous low-level functions. - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) - * and Paul Mackerras. - * - * A couple of functions stolen from arch/ppc/kernel/misc.S for UML - * by Chris Emerson. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <asm/processor.h> -#include "ppc_asm.h" - -#if defined(CONFIG_4xx) || defined(CONFIG_8xx) -#define CACHE_LINE_SIZE 16 -#define LG_CACHE_LINE_SIZE 4 -#define MAX_COPY_PREFETCH 1 -#else -#define CACHE_LINE_SIZE 32 -#define LG_CACHE_LINE_SIZE 5 -#define MAX_COPY_PREFETCH 4 -#endif /* CONFIG_4xx || CONFIG_8xx */ - - .text - -/* - * Clear a page using the dcbz instruction, which doesn't cause any - * memory traffic (except to write out any cache lines which get - * displaced). This only works on cacheable memory. - */ -_GLOBAL(clear_page) - li r0,4096/CACHE_LINE_SIZE - mtctr r0 -#ifdef CONFIG_8xx - li r4, 0 -1: stw r4, 0(r3) - stw r4, 4(r3) - stw r4, 8(r3) - stw r4, 12(r3) -#else -1: dcbz 0,r3 -#endif - addi r3,r3,CACHE_LINE_SIZE - bdnz 1b - blr - -/* - * Copy a whole page. We use the dcbz instruction on the destination - * to reduce memory traffic (it eliminates the unnecessary reads of - * the destination into cache). This requires that the destination - * is cacheable. - */ -#define COPY_16_BYTES \ - lwz r6,4(r4); \ - lwz r7,8(r4); \ - lwz r8,12(r4); \ - lwzu r9,16(r4); \ - stw r6,4(r3); \ - stw r7,8(r3); \ - stw r8,12(r3); \ - stwu r9,16(r3) - -_GLOBAL(copy_page) - addi r3,r3,-4 - addi r4,r4,-4 - li r5,4 - -#ifndef CONFIG_8xx -#if MAX_COPY_PREFETCH > 1 - li r0,MAX_COPY_PREFETCH - li r11,4 - mtctr r0 -11: dcbt r11,r4 - addi r11,r11,CACHE_LINE_SIZE - bdnz 11b -#else /* MAX_COPY_PREFETCH == 1 */ - dcbt r5,r4 - li r11,CACHE_LINE_SIZE+4 -#endif /* MAX_COPY_PREFETCH */ -#endif /* CONFIG_8xx */ - - li r0,4096/CACHE_LINE_SIZE - mtctr r0 -1: -#ifndef CONFIG_8xx - dcbt r11,r4 - dcbz r5,r3 -#endif - COPY_16_BYTES -#if CACHE_LINE_SIZE >= 32 - COPY_16_BYTES -#if CACHE_LINE_SIZE >= 64 - COPY_16_BYTES - COPY_16_BYTES -#if CACHE_LINE_SIZE >= 128 - COPY_16_BYTES - COPY_16_BYTES - COPY_16_BYTES - COPY_16_BYTES -#endif -#endif -#endif - bdnz 1b - blr diff --git a/arch/um/sys-ppc/miscthings.c b/arch/um/sys-ppc/miscthings.c deleted file mode 100644 index 25908d26ce07..000000000000 --- a/arch/um/sys-ppc/miscthings.c +++ /dev/null @@ -1,42 +0,0 @@ -#include <linux/threads.h> -#include <linux/stddef.h> // for NULL -#include <linux/elf.h> // for AT_NULL - -/* The following function nicked from arch/ppc/kernel/process.c and - * adapted slightly */ -/* - * XXX ld.so expects the auxiliary table to start on - * a 16-byte boundary, so we have to find it and - * move it up. :-( - */ -void shove_aux_table(unsigned long sp) -{ - int argc; - char *p; - unsigned long e; - unsigned long aux_start, offset; - - argc = *(int *)sp; - sp += sizeof(int) + (argc + 1) * sizeof(char *); - /* skip over the environment pointers */ - do { - p = *(char **)sp; - sp += sizeof(char *); - } while (p != NULL); - aux_start = sp; - /* skip to the end of the auxiliary table */ - do { - e = *(unsigned long *)sp; - sp += 2 * sizeof(unsigned long); - } while (e != AT_NULL); - offset = ((aux_start + 15) & ~15) - aux_start; - if (offset != 0) { - do { - sp -= sizeof(unsigned long); - e = *(unsigned long *)sp; - *(unsigned long *)(sp + offset) = e; - } while (sp > aux_start); - } -} -/* END stuff taken from arch/ppc/kernel/process.c */ - diff --git a/arch/um/sys-ppc/ptrace.c b/arch/um/sys-ppc/ptrace.c deleted file mode 100644 index 8245df41b201..000000000000 --- a/arch/um/sys-ppc/ptrace.c +++ /dev/null @@ -1,58 +0,0 @@ -#include <linux/sched.h> -#include "asm/ptrace.h" - -int putreg(struct task_struct *child, unsigned long regno, - unsigned long value) -{ - child->thread.process_regs.regs[regno >> 2] = value; - return 0; -} - -int poke_user(struct task_struct *child, long addr, long data) -{ - if ((addr & 3) || addr < 0) - return -EIO; - - if (addr < MAX_REG_OFFSET) - return putreg(child, addr, data); - - else if((addr >= offsetof(struct user, u_debugreg[0])) && - (addr <= offsetof(struct user, u_debugreg[7]))){ - addr -= offsetof(struct user, u_debugreg[0]); - addr = addr >> 2; - if((addr == 4) || (addr == 5)) return -EIO; - child->thread.arch.debugregs[addr] = data; - return 0; - } - return -EIO; -} - -unsigned long getreg(struct task_struct *child, unsigned long regno) -{ - unsigned long retval = ~0UL; - - retval &= child->thread.process_regs.regs[regno >> 2]; - return retval; -} - -int peek_user(struct task_struct *child, long addr, long data) -{ - /* read the word at location addr in the USER area. */ - unsigned long tmp; - - if ((addr & 3) || addr < 0) - return -EIO; - - tmp = 0; /* Default return condition */ - if(addr < MAX_REG_OFFSET){ - tmp = getreg(child, addr); - } - else if((addr >= offsetof(struct user, u_debugreg[0])) && - (addr <= offsetof(struct user, u_debugreg[7]))){ - addr -= offsetof(struct user, u_debugreg[0]); - addr = addr >> 2; - tmp = child->thread.arch.debugregs[addr]; - } - return put_user(tmp, (unsigned long *) data); -} - diff --git a/arch/um/sys-ppc/ptrace_user.c b/arch/um/sys-ppc/ptrace_user.c deleted file mode 100644 index 4601b9296aa7..000000000000 --- a/arch/um/sys-ppc/ptrace_user.c +++ /dev/null @@ -1,29 +0,0 @@ -#include <errno.h> -#include <asm/ptrace.h> -#include <sysdep/ptrace.h> - -int ptrace_getregs(long pid, unsigned long *regs_out) -{ - int i; - for (i=0; i < sizeof(struct sys_pt_regs)/sizeof(PPC_REG); ++i) { - errno = 0; - regs_out->regs[i] = ptrace(PTRACE_PEEKUSR, pid, i*4, 0); - if (errno) { - return -errno; - } - } - return 0; -} - -int ptrace_setregs(long pid, unsigned long *regs_in) -{ - int i; - for (i=0; i < sizeof(struct sys_pt_regs)/sizeof(PPC_REG); ++i) { - if (i != 34 /* FIXME: PT_ORIG_R3 */ && i <= PT_MQ) { - if (ptrace(PTRACE_POKEUSR, pid, i*4, regs_in->regs[i]) < 0) { - return -errno; - } - } - } - return 0; -} diff --git a/arch/um/sys-ppc/shared/sysdep/ptrace.h b/arch/um/sys-ppc/shared/sysdep/ptrace.h deleted file mode 100644 index efe0c1a3ea9c..000000000000 --- a/arch/um/sys-ppc/shared/sysdep/ptrace.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed under the GPL - */ - -#ifndef __SYS_PTRACE_PPC_H -#define __SYS_PTRACE_PPC_H - -#include <linux/types.h> - -/* the following taken from <asm-ppc/ptrace.h> */ - -#ifdef CONFIG_PPC64 -#define PPC_REG unsigned long /*long*/ -#else -#define PPC_REG unsigned long -#endif -struct sys_pt_regs_s { - PPC_REG gpr[32]; - PPC_REG nip; - PPC_REG msr; - PPC_REG orig_gpr3; /* Used for restarting system calls */ - PPC_REG ctr; - PPC_REG link; - PPC_REG xer; - PPC_REG ccr; - PPC_REG mq; /* 601 only (not used at present) */ - /* Used on APUS to hold IPL value. */ - PPC_REG trap; /* Reason for being here */ - PPC_REG dar; /* Fault registers */ - PPC_REG dsisr; - PPC_REG result; /* Result of a system call */ -}; - -#define NUM_REGS (sizeof(struct sys_pt_regs_s) / sizeof(PPC_REG)) - -struct sys_pt_regs { - PPC_REG regs[sizeof(struct sys_pt_regs_s) / sizeof(PPC_REG)]; -}; - -#define UM_MAX_REG (PT_FPR0) -#define UM_MAX_REG_OFFSET (UM_MAX_REG * sizeof(PPC_REG)) - -#define EMPTY_REGS { { [ 0 ... NUM_REGS - 1] = 0 } } - -#define UM_REG(r, n) ((r)->regs[n]) - -#define UM_SYSCALL_RET(r) UM_REG(r, PT_R3) -#define UM_SP(r) UM_REG(r, PT_R1) -#define UM_IP(r) UM_REG(r, PT_NIP) -#define UM_ELF_ZERO(r) UM_REG(r, PT_FPSCR) -#define UM_SYSCALL_NR(r) UM_REG(r, PT_R0) -#define UM_SYSCALL_ARG1(r) UM_REG(r, PT_ORIG_R3) -#define UM_SYSCALL_ARG2(r) UM_REG(r, PT_R4) -#define UM_SYSCALL_ARG3(r) UM_REG(r, PT_R5) -#define UM_SYSCALL_ARG4(r) UM_REG(r, PT_R6) -#define UM_SYSCALL_ARG5(r) UM_REG(r, PT_R7) -#define UM_SYSCALL_ARG6(r) UM_REG(r, PT_R8) - -#define UM_SYSCALL_NR_OFFSET (PT_R0 * sizeof(PPC_REG)) -#define UM_SYSCALL_RET_OFFSET (PT_R3 * sizeof(PPC_REG)) -#define UM_SYSCALL_ARG1_OFFSET (PT_R3 * sizeof(PPC_REG)) -#define UM_SYSCALL_ARG2_OFFSET (PT_R4 * sizeof(PPC_REG)) -#define UM_SYSCALL_ARG3_OFFSET (PT_R5 * sizeof(PPC_REG)) -#define UM_SYSCALL_ARG4_OFFSET (PT_R6 * sizeof(PPC_REG)) -#define UM_SYSCALL_ARG5_OFFSET (PT_R7 * sizeof(PPC_REG)) -#define UM_SYSCALL_ARG6_OFFSET (PT_R8 * sizeof(PPC_REG)) -#define UM_SP_OFFSET (PT_R1 * sizeof(PPC_REG)) -#define UM_IP_OFFSET (PT_NIP * sizeof(PPC_REG)) -#define UM_ELF_ZERO_OFFSET (PT_R3 * sizeof(PPC_REG)) - -#define UM_SET_SYSCALL_RETURN(_regs, result) \ -do { \ - if (result < 0) { \ - (_regs)->regs[PT_CCR] |= 0x10000000; \ - UM_SYSCALL_RET((_regs)) = -result; \ - } else { \ - UM_SYSCALL_RET((_regs)) = result; \ - } \ -} while(0) - -extern void shove_aux_table(unsigned long sp); -#define UM_FIX_EXEC_STACK(sp) shove_aux_table(sp); - -/* These aren't actually defined. The undefs are just to make sure - * everyone's clear on the concept. - */ -#undef UML_HAVE_GETREGS -#undef UML_HAVE_GETFPREGS -#undef UML_HAVE_SETREGS -#undef UML_HAVE_SETFPREGS - -#endif - diff --git a/arch/um/sys-ppc/shared/sysdep/sigcontext.h b/arch/um/sys-ppc/shared/sysdep/sigcontext.h deleted file mode 100644 index b7286f0a1e00..000000000000 --- a/arch/um/sys-ppc/shared/sysdep/sigcontext.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYS_SIGCONTEXT_PPC_H -#define __SYS_SIGCONTEXT_PPC_H - -#define DSISR_WRITE 0x02000000 - -#define SC_FAULT_ADDR(sc) ({ \ - struct sigcontext *_sc = (sc); \ - long retval = -1; \ - switch (_sc->regs->trap) { \ - case 0x300: \ - /* data exception */ \ - retval = _sc->regs->dar; \ - break; \ - case 0x400: \ - /* instruction exception */ \ - retval = _sc->regs->nip; \ - break; \ - default: \ - panic("SC_FAULT_ADDR: unhandled trap type\n"); \ - } \ - retval; \ - }) - -#define SC_FAULT_WRITE(sc) ({ \ - struct sigcontext *_sc = (sc); \ - long retval = -1; \ - switch (_sc->regs->trap) { \ - case 0x300: \ - /* data exception */ \ - retval = !!(_sc->regs->dsisr & DSISR_WRITE); \ - break; \ - case 0x400: \ - /* instruction exception: not a write */ \ - retval = 0; \ - break; \ - default: \ - panic("SC_FAULT_ADDR: unhandled trap type\n"); \ - } \ - retval; \ - }) - -#define SC_IP(sc) ((sc)->regs->nip) -#define SC_SP(sc) ((sc)->regs->gpr[1]) -#define SEGV_IS_FIXABLE(sc) (1) - -#endif - diff --git a/arch/um/sys-ppc/shared/sysdep/skas_ptrace.h b/arch/um/sys-ppc/shared/sysdep/skas_ptrace.h deleted file mode 100644 index d9fbbac10de0..000000000000 --- a/arch/um/sys-ppc/shared/sysdep/skas_ptrace.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSDEP_PPC_SKAS_PTRACE_H -#define __SYSDEP_PPC_SKAS_PTRACE_H - -struct ptrace_faultinfo { - int is_write; - unsigned long addr; -}; - -struct ptrace_ldt { - int func; - void *ptr; - unsigned long bytecount; -}; - -#define PTRACE_LDT 54 - -#endif diff --git a/arch/um/sys-ppc/shared/sysdep/syscalls.h b/arch/um/sys-ppc/shared/sysdep/syscalls.h deleted file mode 100644 index 1ff81552251c..000000000000 --- a/arch/um/sys-ppc/shared/sysdep/syscalls.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -typedef long syscall_handler_t(unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5, unsigned long arg6); - -#define EXECUTE_SYSCALL(syscall, regs) \ - (*sys_call_table[syscall])(UM_SYSCALL_ARG1(®s), \ - UM_SYSCALL_ARG2(®s), \ - UM_SYSCALL_ARG3(®s), \ - UM_SYSCALL_ARG4(®s), \ - UM_SYSCALL_ARG5(®s), \ - UM_SYSCALL_ARG6(®s)) - -extern syscall_handler_t sys_mincore; -extern syscall_handler_t sys_madvise; - -/* old_mmap needs the correct prototype since syscall_kern.c includes - * this file. - */ -int old_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset); - -#define ARCH_SYSCALLS \ - [ __NR_modify_ldt ] = sys_ni_syscall, \ - [ __NR_pciconfig_read ] = sys_ni_syscall, \ - [ __NR_pciconfig_write ] = sys_ni_syscall, \ - [ __NR_pciconfig_iobase ] = sys_ni_syscall, \ - [ __NR_pivot_root ] = sys_ni_syscall, \ - [ __NR_multiplexer ] = sys_ni_syscall, \ - [ __NR_mmap ] = old_mmap, \ - [ __NR_madvise ] = sys_madvise, \ - [ __NR_mincore ] = sys_mincore, \ - [ __NR_iopl ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_utimes ] = (syscall_handler_t *) sys_utimes, \ - [ __NR_fadvise64 ] = (syscall_handler_t *) sys_fadvise64, - -#define LAST_ARCH_SYSCALL __NR_fadvise64 - diff --git a/arch/um/sys-ppc/sigcontext.c b/arch/um/sys-ppc/sigcontext.c deleted file mode 100644 index aac6c83fe44e..000000000000 --- a/arch/um/sys-ppc/sigcontext.c +++ /dev/null @@ -1,4 +0,0 @@ -#include "asm/ptrace.h" -#include "asm/sigcontext.h" -#include <sysdep/ptrace.h> - diff --git a/arch/um/sys-ppc/sysrq.c b/arch/um/sys-ppc/sysrq.c deleted file mode 100644 index 1ff1ad7f27da..000000000000 --- a/arch/um/sys-ppc/sysrq.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk) - * Licensed under the GPL - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include "asm/ptrace.h" -#include "sysrq.h" - -void show_regs(struct pt_regs_subarch *regs) -{ - printk("\n"); - show_regs_print_info(KERN_DEFAULT); - - printk("show_regs(): insert regs here.\n"); -#if 0 - printk("\n"); - printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs, regs->eip, - smp_processor_id()); - if (regs->xcs & 3) - printk(" ESP: %04x:%08lx",0xffff & regs->xss, regs->esp); - printk(" EFLAGS: %08lx\n", regs->eflags); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx", - regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds, 0xffff & regs->xes); -#endif - - show_trace(current, ®s->gpr[1]); -} |
