Diffstat (limited to 'arch/s390')
448 files changed, 36368 insertions, 22315 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index e63940bb57cd..a5d3503b353c 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -3,9 +3,11 @@ obj-y += kernel/
 obj-y += mm/
 obj-$(CONFIG_KVM) += kvm/
 obj-y += crypto/
-obj-$(CONFIG_S390_HYPFS_FS) += hypfs/
+obj-$(CONFIG_S390_HYPFS) += hypfs/
 obj-$(CONFIG_APPLDATA_BASE) += appldata/
 obj-y += net/
 obj-$(CONFIG_PCI) += pci/
-obj-$(CONFIG_NUMA) += numa/
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
+obj-$(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) += purgatory/
+
+# for cleaning
+subdir- += boot tools
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d4051e88e625..fe565f3a3a91 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -2,9 +2,6 @@
 config MMU
 	def_bool y
 
-config ZONE_DMA
-	def_bool y
-
 config CPU_BIG_ENDIAN
 	def_bool y
 
@@ -30,14 +27,11 @@ config GENERIC_BUG_RELATIVE_POINTERS
 	def_bool y
 
 config GENERIC_LOCKBREAK
-	def_bool y if PREEMPT
+	def_bool y if PREEMPTION
 
 config PGSTE
 	def_bool y if KVM
 
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
-	def_bool y
-
 config AUDIT_ARCH
 	def_bool y
 
@@ -53,25 +47,43 @@ config ARCH_SUPPORTS_UPROBES
 config KASAN_SHADOW_OFFSET
 	hex
 	depends on KASAN
-	default 0x18000000000000 if KASAN_S390_4_LEVEL_PAGING
-	default 0x30000000000
+	default 0x1C000000000000
 
 config S390
 	def_bool y
+	#
+	# Note: keep this list sorted alphabetically
+	#
+	imply IMA_SECURE_AND_OR_TRUSTED_BOOT
+	select ALTERNATE_USER_ADDRESS_SPACE
+	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_BINFMT_ELF_STATE
+	select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
+	select ARCH_ENABLE_MEMORY_HOTREMOVE
+	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
+	select ARCH_HAS_CURRENT_STACK_POINTER
+	select ARCH_HAS_DEBUG_VM_PGTABLE
+	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_HAS_KCOV
+	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_MEM_ENCRYPT
+	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_SCALED_CPUTIME
+	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
+	select ARCH_HAS_VDSO_DATA
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_INLINE_READ_LOCK
 	select ARCH_INLINE_READ_LOCK_BH
@@ -101,52 +113,75 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_BH
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
-	select ARCH_KEEP_MEMBLOCK
-	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	select ARCH_SUPPORTS_HUGETLBFS
+	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
-	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	select ARCH_USE_SYM_ANNOTATIONS
+	select ARCH_WANTS_NO_INSTR
+	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_IPC_PARSE_VERSION
-	select BUILDTIME_EXTABLE_SORT
+	select ARCH_WANT_KERNEL_PMD_MKWRITE
+	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
+	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS2
+	select DCACHE_WORD_ACCESS if !KMSAN
+	select DMA_OPS if PCI
 	select DYNAMIC_FTRACE if FUNCTION_TRACER
-	select GENERIC_CLOCKEVENTS
+	select FUNCTION_ALIGNMENT_8B if CC_IS_GCC
+	select FUNCTION_ALIGNMENT_16B if !CC_IS_GCC
+	select GENERIC_ALLOCATOR
 	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_CPU_VULNERABILITIES
-	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_ENTRY
+	select GENERIC_GETTIMEOFDAY
+	select GENERIC_PTDUMP
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_TIME_VSYSCALL
-	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+	select GENERIC_VDSO_TIME_NS
+	select GENERIC_IOREMAP if PCI
+	select HAVE_ALIGNED_STRUCT_PAGE
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN
-	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
+	select HAVE_ARCH_KASAN_VMALLOC
+	select HAVE_ARCH_KCSAN
+	select HAVE_ARCH_KFENCE
+	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SOFT_DIRTY
+	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_VMAP_STACK
 	select HAVE_ASM_MODVERSIONS
-	select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
+	select HAVE_DYNAMIC_FTRACE_WITH_ARGS
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_FAST_GUP
+	select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select HAVE_FAST_GUP
 	select HAVE_FENTRY
 	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_ARG_ACCESS_API
+	select HAVE_FUNCTION_ERROR_INJECTION
+	select HAVE_FUNCTION_GRAPH_RETVAL
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
-	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select HAVE_GCC_PLUGINS
+	select HAVE_GENERIC_VDSO
+	select HAVE_IOREMAP_PROT if PCI
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZ4
@@ -154,46 +189,55 @@ config S390
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_UNCOMPRESSED
 	select HAVE_KERNEL_XZ
+	select HAVE_KERNEL_ZSTD
 	select HAVE_KPROBES
+	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH
-	select HAVE_PERF_REGS
-	select HAVE_PERF_USER_STACK_DUMP
-	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MEMBLOCK_PHYS_MAP
-	select HAVE_MMU_GATHER_NO_GATHER
 	select HAVE_MOD_ARCH_SPECIFIC
+	select HAVE_NMI
 	select HAVE_NOP_MCOUNT
-	select HAVE_OPROFILE
 	select HAVE_PCI
 	select HAVE_PERF_EVENTS
-	select HAVE_RCU_TABLE_FREE
+	select HAVE_PERF_REGS
+	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE
+	select HAVE_RETHOOK
 	select HAVE_RSEQ
+	select HAVE_SAMPLE_FTRACE_DIRECT
+	select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
+	select HAVE_SETUP_PER_CPU_AREA
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
+	select HAVE_VIRT_CPU_ACCOUNTING_IDLE
 	select IOMMU_HELPER if PCI
 	select IOMMU_SUPPORT if PCI
+	select MMU_GATHER_MERGE_VMAS
+	select MMU_GATHER_NO_GATHER
+	select MMU_GATHER_RCU_TABLE_FREE
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE if PCI
+	select NEED_PER_CPU_EMBED_FIRST_CHUNK
 	select NEED_SG_DMA_LENGTH if PCI
 	select OLD_SIGACTION
 	select OLD_SIGSUSPEND3
 	select PCI_DOMAINS if PCI
 	select PCI_MSI if PCI
+	select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
 	select SPARSE_IRQ
+	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
+	select TRACE_IRQFLAGS_SUPPORT
 	select TTY
+	select USER_STACKTRACE_SUPPORT
 	select VIRT_CPU_ACCOUNTING
-	select ARCH_HAS_SCALED_CPUTIME
-	select HAVE_NMI
-	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
-	select SWIOTLB
-	select GENERIC_ALLOCATOR
-
+	select ZONE_DMA
+	# Note: keep the above list sorted alphabetically
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
@@ -204,22 +248,29 @@ config PGTABLE_LEVELS
 
 source "kernel/livepatch/Kconfig"
 
-menu "Processor type and features"
+config ARCH_SUPPORTS_KEXEC
+	def_bool y
 
-config HAVE_MARCH_Z900_FEATURES
-	def_bool n
+config ARCH_SUPPORTS_KEXEC_FILE
+	def_bool y
 
-config HAVE_MARCH_Z990_FEATURES
-	def_bool n
-	select HAVE_MARCH_Z900_FEATURES
+config ARCH_SUPPORTS_KEXEC_SIG
+	def_bool MODULE_SIG_FORMAT
 
-config HAVE_MARCH_Z9_109_FEATURES
-	def_bool n
-	select HAVE_MARCH_Z990_FEATURES
+config ARCH_SUPPORTS_KEXEC_PURGATORY
+	def_bool y
+
+config ARCH_SUPPORTS_CRASH_DUMP
+	def_bool y
+	help
+	  Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this.
+	  This option also enables s390 zfcpdump.
+	  See also <file:Documentation/arch/s390/zfcpdump.rst>
+
+menu "Processor type and features"
 
 config HAVE_MARCH_Z10_FEATURES
 	def_bool n
-	select HAVE_MARCH_Z9_109_FEATURES
 
 config HAVE_MARCH_Z196_FEATURES
 	def_bool n
@@ -241,45 +292,21 @@ config HAVE_MARCH_Z15_FEATURES
 	def_bool n
 	select HAVE_MARCH_Z14_FEATURES
 
+config HAVE_MARCH_Z16_FEATURES
+	def_bool n
+	select HAVE_MARCH_Z15_FEATURES
+
 choice
 	prompt "Processor type"
 	default MARCH_Z196
 
-config MARCH_Z900
-	bool "IBM zSeries model z800 and z900"
-	select HAVE_MARCH_Z900_FEATURES
-	depends on $(cc-option,-march=z900)
-	help
-	  Select this to enable optimizations for model z800/z900 (2064 and
-	  2066 series). This will enable some optimizations that are not
-	  available on older ESA/390 (31 Bit) only CPUs.
-
-config MARCH_Z990
-	bool "IBM zSeries model z890 and z990"
-	select HAVE_MARCH_Z990_FEATURES
-	depends on $(cc-option,-march=z990)
-	help
-	  Select this to enable optimizations for model z890/z990 (2084 and
-	  2086 series). The kernel will be slightly faster but will not work
-	  on older machines.
-
-config MARCH_Z9_109
-	bool "IBM System z9"
-	select HAVE_MARCH_Z9_109_FEATURES
-	depends on $(cc-option,-march=z9-109)
-	help
-	  Select this to enable optimizations for IBM System z9 (2094 and
-	  2096 series). The kernel will be slightly faster but will not work
-	  on older machines.
-
 config MARCH_Z10
 	bool "IBM System z10"
 	select HAVE_MARCH_Z10_FEATURES
 	depends on $(cc-option,-march=z10)
 	help
-	  Select this to enable optimizations for IBM System z10 (2097 and
-	  2098 series). The kernel will be slightly faster but will not work
-	  on older machines.
+	  Select this to enable optimizations for IBM System z10 (2097 and 2098
+	  series). This is the oldest machine generation currently supported.
 
 config MARCH_Z196
 	bool "IBM zEnterprise 114 and 196"
@@ -326,16 +353,15 @@ config MARCH_Z15
 	  and 8561 series). The kernel will be slightly faster but will not
 	  work on older machines.
 
-endchoice
-
-config MARCH_Z900_TUNE
-	def_bool TUNE_Z900 || MARCH_Z900 && TUNE_DEFAULT
-
-config MARCH_Z990_TUNE
-	def_bool TUNE_Z990 || MARCH_Z990 && TUNE_DEFAULT
+config MARCH_Z16
+	bool "IBM z16"
+	select HAVE_MARCH_Z16_FEATURES
+	depends on $(cc-option,-march=z16)
+	help
+	  Select this to enable optimizations for IBM z16 (3931 and
+	  3932 series).
 
-config MARCH_Z9_109_TUNE
-	def_bool TUNE_Z9_109 || MARCH_Z9_109 && TUNE_DEFAULT
+endchoice
 
 config MARCH_Z10_TUNE
 	def_bool TUNE_Z10 || MARCH_Z10 && TUNE_DEFAULT
@@ -355,6 +381,9 @@ config MARCH_Z14_TUNE
 config MARCH_Z15_TUNE
 	def_bool TUNE_Z15 || MARCH_Z15 && TUNE_DEFAULT
 
+config MARCH_Z16_TUNE
+	def_bool TUNE_Z16 || MARCH_Z16 && TUNE_DEFAULT
+
 choice
 	prompt "Tune code generation"
 	default TUNE_DEFAULT
@@ -372,21 +401,8 @@ config TUNE_DEFAULT
 	  Tune the generated code for the target processor for which the
 	  kernel will be compiled.
 
-config TUNE_Z900
-	bool "IBM zSeries model z800 and z900"
-	depends on $(cc-option,-mtune=z900)
-
-config TUNE_Z990
-	bool "IBM zSeries model z890 and z990"
-	depends on $(cc-option,-mtune=z990)
-
-config TUNE_Z9_109
-	bool "IBM System z9"
-	depends on $(cc-option,-mtune=z9-109)
-
 config TUNE_Z10
 	bool "IBM System z10"
-	depends on $(cc-option,-mtune=z10)
 
 config TUNE_Z196
 	bool "IBM zEnterprise 114 and 196"
@@ -408,27 +424,38 @@ config TUNE_Z15
 	bool "IBM z15"
 	depends on $(cc-option,-mtune=z15)
 
+config TUNE_Z16
+	bool "IBM z16"
+	depends on $(cc-option,-mtune=z16)
+
 endchoice
 
 config 64BIT
 	def_bool y
 
+config COMMAND_LINE_SIZE
+	int "Maximum size of kernel command line"
+	default 4096
+	range 896 1048576
+	help
+	  This allows you to specify the maximum length of the kernel command
+	  line.
+
 config COMPAT
-	def_bool y
+	def_bool n
 	prompt "Kernel support for 31 bit emulation"
-	select COMPAT_BINFMT_ELF if BINFMT_ELF
 	select ARCH_WANT_OLD_COMPAT_IPC
 	select COMPAT_OLD_SIGACTION
 	select HAVE_UID16
 	depends on MULTIUSER
+	depends on !CC_IS_CLANG
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
 	  (and some other stuff like libraries and such) is needed for
-	  executing 31 bit applications.  It is safe to say "Y".
+	  executing 31 bit applications.
 
-config SYSVIPC_COMPAT
-	def_bool y if COMPAT && SYSVIPC
+	  If unsure say N.
 
 config SMP
 	def_bool y
@@ -448,14 +475,6 @@ config NR_CPUS
 config HOTPLUG_CPU
 	def_bool y
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details. <- They meant memory holes!
-config NODES_SPAN_OTHER_NODES
-	def_bool NUMA
-
 config NUMA
 	bool "NUMA support"
 	depends on SCHED_TOPOLOGY
@@ -465,58 +484,10 @@ config NUMA
 
 	  This option adds NUMA support to the kernel.
 
-	  An operation mode can be selected by appending
-	  numa=<method> to the kernel command line.
-
-	  The default behaviour is identical to appending numa=plain to
-	  the command line. This will create just one node with all
-	  available memory and all CPUs in it.
-
 config NODES_SHIFT
-	int "Maximum NUMA nodes (as a power of 2)"
-	range 1 10
-	depends on NUMA
-	default "4"
-	help
-	  Specify the maximum number of NUMA nodes available on the target
-	  system. Increases memory reserved to accommodate various tables.
-
-menu "Select NUMA modes"
+	int
 	depends on NUMA
-
-config NUMA_EMU
-	bool "NUMA emulation"
-	default y
-	help
-	  Numa emulation mode will split the available system memory into
-	  equal chunks which then are distributed over the configured number
-	  of nodes in a round-robin manner.
-
-	  The number of fake nodes is limited by the number of available memory
-	  chunks (i.e. memory size / fake size) and the number of supported
-	  nodes in the kernel.
-
-	  The CPUs are assigned to the nodes in a way that partially respects
-	  the original machine topology (if supported by the machine).
-	  Fair distribution of the CPUs is not guaranteed.
-
-config EMU_SIZE
-	hex "NUMA emulation memory chunk size"
-	default 0x10000000
-	range 0x400000 0x100000000
-	depends on NUMA_EMU
-	help
-	  Select the default size by which the memory is chopped and then
-	  assigned to emulated NUMA nodes.
-
-	  This can be overridden by specifying
-
-	  emu_size=<n>
-
-	  on the kernel command line where also suffixes K, M, G, and T are
-	  supported.
-
-endmenu
+	default "1"
 
 config SCHED_SMT
 	def_bool n
@@ -524,19 +495,11 @@ config SCHED_SMT
 config SCHED_MC
 	def_bool n
 
-config SCHED_BOOK
-	def_bool n
-
-config SCHED_DRAWER
-	def_bool n
-
 config SCHED_TOPOLOGY
 	def_bool y
 	prompt "Topology scheduler support"
 	select SCHED_SMT
 	select SCHED_MC
-	select SCHED_BOOK
-	select SCHED_DRAWER
 	help
 	  Topology scheduler support improves the CPU scheduler's decision
 	  making when dealing with machines that have multi-threading,
@@ -544,51 +507,16 @@ config SCHED_TOPOLOGY
 
 source "kernel/Kconfig.hz"
 
-config KEXEC
-	def_bool y
-	select KEXEC_CORE
-
-config KEXEC_FILE
-	bool "kexec file based system call"
-	select KEXEC_CORE
-	select BUILD_BIN2C
-	depends on CRYPTO
-	depends on CRYPTO_SHA256
-	depends on CRYPTO_SHA256_S390
-	help
-	  Enable the kexec file based system call. In contrast to the normal
-	  kexec system call this system call takes file descriptors for the
-	  kernel and initramfs as arguments.
-
-config ARCH_HAS_KEXEC_PURGATORY
-	def_bool y
-	depends on KEXEC_FILE
-
-config KEXEC_SIG
-	bool "Verify kernel signature during kexec_file_load() syscall"
-	depends on KEXEC_FILE && MODULE_SIG_FORMAT
+config CERT_STORE
+	bool "Get user certificates via DIAG320"
+	depends on KEYS
+	select CRYPTO_LIB_SHA256
 	help
-	  This option makes kernel signature verification mandatory for
-	  the kexec_file_load() syscall.
-
-	  In addition to that option, you need to enable signature
-	  verification for the corresponding kernel image type being
-	  loaded in order for this to work.
+	  Enable this option if you want to access user-provided secure boot
	  certificates via DIAG 0x320.
 
-config ARCH_RANDOM
-	def_bool y
-	prompt "s390 architectural random number generation API"
-	help
-	  Enable the s390 architectural random number generation API
-	  to provide random data for all consumers within the Linux
-	  kernel.
-
-	  When enabled the arch_random_* functions declared in linux/random.h
-	  are implemented. The implementation is based on the s390 CPACF
-	  instruction subfunction TRNG which provides a real true random
-	  number generator.
-
-	  If unsure, say Y.
+	  These certificates will be made available via the keyring named
+	  'cert_store'.
 
 config KERNEL_NOBP
 	def_bool n
@@ -609,6 +537,7 @@ config KERNEL_NOBP
 
 config EXPOLINE
 	def_bool n
+	depends on $(cc-option,-mindirect-branch=thunk)
 	prompt "Avoid speculative indirect branches in the kernel"
 	help
 	  Compile the kernel with the expoline compiler options to guard
@@ -619,6 +548,19 @@ config EXPOLINE
 
 	  If unsure, say N.
 
+config EXPOLINE_EXTERN
+	def_bool n
+	depends on EXPOLINE
+	depends on CC_IS_GCC && GCC_VERSION >= 110200
+	depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC))
+	prompt "Generate expolines as extern functions."
+	help
+	  This option is required for some tooling like kpatch. The kernel is
+	  compiled with -mindirect-branch=thunk-extern and requires a newer
+	  compiler.
+
+	  If unsure, say N.
+
 choice
 	prompt "Expoline default"
 	depends on EXPOLINE
@@ -636,9 +578,7 @@ config EXPOLINE_FULL
 endchoice
 
 config RELOCATABLE
-	bool "Build a relocatable kernel"
-	select MODULE_REL_CRCS if MODVERSIONS
-	default y
+	def_bool y
 	help
 	  This builds a kernel image that retains relocation information
 	  so it can be loaded at an arbitrary address.
@@ -647,10 +587,11 @@ config RELOCATABLE
 	  bootup process.
 	  The relocations make the kernel image about 15% larger (compressed
 	  10%), but are discarded at runtime.
+	  Note: this option exists only for documentation purposes, please do
+	  not remove it.
 
 config RANDOMIZE_BASE
 	bool "Randomize the address of the kernel image (KASLR)"
-	depends on RELOCATABLE
 	default y
 	help
 	  In support of Kernel Address Space Layout Randomization (KASLR),
@@ -670,19 +611,6 @@ config ARCH_SPARSEMEM_ENABLE
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-	def_bool y if SPARSEMEM
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-	def_bool y
-
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-	def_bool y
-
-config FORCE_MAX_ZONEORDER
-	int
-	default "9"
-
 config MAX_PHYSMEM_BITS
 	int "Maximum size of supported physical memory in bits (42-53)"
 	range 42 53
@@ -693,20 +621,6 @@ config MAX_PHYSMEM_BITS
 	  Increasing the number of bits also increases the kernel image size.
 	  By default 46 bits (64TB) are supported.
 
-config PACK_STACK
-	def_bool y
-	prompt "Pack kernel stack"
-	help
-	  This option enables the compiler option -mkernel-backchain if it
-	  is available. If the option is available the compiler supports
-	  the new stack layout which dramatically reduces the minimum stack
-	  frame size. With an old compiler a non-leaf function needs a
-	  minimum of 96 bytes on 31 bit and 160 bytes on 64 bit. With
-	  -mkernel-backchain the minimum size drops to 16 byte on 31 bit
-	  and 24 byte on 64 bit.
-
-	  Say Y if you are unsure.
-
 config CHECK_STACK
 	def_bool y
 	depends on !VMAP_STACK
@@ -733,16 +647,6 @@ config STACK_GUARD
 	  The minimum size for the stack guard should be 256 for 31 bit and
 	  512 for 64 bit.
 
-config WARN_DYNAMIC_STACK
-	def_bool n
-	prompt "Emit compiler warnings for function with dynamic stack usage"
-	help
-	  This option enables the compiler option -mwarn-dynamicstack. If the
-	  compiler supports this options generates warnings for functions
-	  that dynamically allocate stack space using alloca.
-
-	  Say N if you are unsure.
-
 endmenu
 
 menu "I/O subsystem"
@@ -750,7 +654,7 @@ menu "I/O subsystem"
 config QDIO
 	def_tristate y
 	prompt "QDIO support"
-	---help---
+	help
 	  This driver provides the Queued Direct I/O base support for
 	  IBM System z.
 
@@ -764,7 +668,7 @@ if PCI
 config PCI_NR_FUNCTIONS
 	int "Maximum number of PCI functions (1-4096)"
 	range 1 4096
-	default "128"
+	default "512"
 	help
 	  This allows you to specify the maximum number of PCI functions which
 	  this kernel will support.
@@ -811,7 +715,8 @@ config EADM_SCH
 config VFIO_CCW
 	def_tristate n
 	prompt "Support for VFIO-CCW subchannels"
-	depends on S390_CCW_IOMMU && VFIO_MDEV
+	depends on VFIO
+	select VFIO_MDEV
 	help
 	  This driver allows usage of I/O subchannels via VFIO-CCW.
 
@@ -821,55 +726,16 @@ config VFIO_CCW
 config VFIO_AP
 	def_tristate n
 	prompt "VFIO support for AP devices"
-	depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM
-	help
-	  This driver grants access to Adjunct Processor (AP) devices
-	  via the VFIO mediated device interface.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called vfio_ap.
-
-endmenu
-
-menu "Dump support"
-
-config CRASH_DUMP
-	bool "kernel crash dumps"
-	select KEXEC
-	help
-	  Generate crash dump after being started by kexec.
-	  Crash dump kernels are loaded in the main kernel with kexec-tools
-	  into a specially reserved region and then later executed after
-	  a crash by kdump/kexec.
-	  Refer to <file:Documentation/s390/zfcpdump.rst> for more details on this.
-	  This option also enables s390 zfcpdump.
-	  See also <file:Documentation/s390/zfcpdump.rst>
-
-endmenu
-
-config SECCOMP
-	def_bool y
-	prompt "Enable seccomp to safely compute untrusted bytecode"
-	depends on PROC_FS
+	depends on KVM
+	depends on VFIO
+	depends on ZCRYPT
+	select VFIO_MDEV
 	help
-	  This kernel feature is useful for number crunching applications
-	  that may need to compute untrusted bytecode during their
-	  execution. By using pipes or other transports made available to
-	  the process as file descriptors supporting the read/write
-	  syscalls, it's possible to isolate those applications in
-	  their own address space using seccomp. Once seccomp is
-	  enabled via /proc/<pid>/seccomp, it cannot be disabled
-	  and the task is only allowed to execute a few safe syscalls
-	  defined by each seccomp mode.
-
-	  If unsure, say Y.
-
-menu "Power Management"
-
-config ARCH_HIBERNATION_POSSIBLE
-	def_bool y
+	  This driver grants access to Adjunct Processor (AP) devices
+	  via the VFIO mediated device interface.
 
-source "kernel/power/Kconfig"
+	  To compile this driver as a module, choose M here: the module
+	  will be called vfio_ap.
 
 endmenu
 
@@ -930,7 +796,7 @@ config CMM_IUCV
 config APPLDATA_BASE
 	def_bool n
 	prompt "Linux - VM Monitor Stream, base infrastructure"
-	depends on PROC_FS
+	depends on PROC_SYSCTL
 	help
 	  This provides a kernel interface for creating and updating z/VM
 	  APPLDATA monitor records. The monitor records are updated at certain time
@@ -991,13 +857,24 @@ config APPLDATA_NET_SUM
 	  This can also be compiled as a module, which will be called
 	  appldata_net_sum.o.
 
-config S390_HYPFS_FS
+config S390_HYPFS
 	def_bool y
+	prompt "s390 hypervisor information"
+	help
+	  This provides several binary files at (debugfs)/s390_hypfs/ to
+	  provide accounting information in an s390 hypervisor environment.
+
+config S390_HYPFS_FS
+	def_bool n
 	prompt "s390 hypervisor file system support"
 	select SYS_HYPERVISOR
+	depends on S390_HYPFS
 	help
 	  This is a virtual file system intended to provide accounting
-	  information in an s390 hypervisor environment.
+	  information in an s390 hypervisor environment. This file system
+	  is deprecated and should not be used.
+
+	  Say N if you are unsure.
 
 source "arch/s390/kvm/Kconfig"
 
@@ -1007,7 +884,6 @@ config S390_GUEST
 	select TTY
 	select VIRTUALIZATION
 	select VIRTIO
-	select VIRTIO_CONSOLE
 	help
 	  Enabling this option adds support for virtio based paravirtual device
 	  drivers on s390.
@@ -1017,10 +893,15 @@ config S390_GUEST
 
 endmenu
 
+config S390_MODULES_SANITY_TEST_HELPERS
+	def_bool n
+
 menu "Selftests"
 
 config S390_UNWIND_SELFTEST
 	def_tristate n
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
 	prompt "Test unwind functions"
 	help
 	  This option enables s390 specific stack unwinder testing kernel
@@ -1029,4 +910,28 @@ config S390_UNWIND_SELFTEST
 
 	  Say N if you are unsure.
 
+config S390_KPROBES_SANITY_TEST
+	def_tristate n
+	prompt "Enable s390 specific kprobes tests"
+	depends on KPROBES
+	depends on KUNIT
+	help
+	  This option enables an s390 specific kprobes test module. This option
+	  is not useful for distributions or general kernels, but only for kernel
+	  developers working on architecture code.
+
+	  Say N if you are unsure.
+
+config S390_MODULES_SANITY_TEST
+	def_tristate n
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	prompt "Enable s390 specific modules tests"
+	select S390_MODULES_SANITY_TEST_HELPERS
+	help
+	  This option enables an s390 specific modules test. This option is
+	  not useful for distributions or general kernels, but only for
+	  kernel developers working on architecture code.
+
+	  Say N if you are unsure.
 endmenu
diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index 190527560b2c..c4300ea4abf8 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -1,19 +1,22 @@
 # SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
+config EARLY_PRINTK
 	def_bool y
 
-config S390_PTDUMP
-	bool "Export kernel pagetable layout to userspace via debugfs"
+config DEBUG_ENTRY
+	bool "Debug low-level entry code"
 	depends on DEBUG_KERNEL
-	select DEBUG_FS
-	---help---
-	  Say Y here if you want to show the kernel pagetable layout in a
-	  debugfs file. This information is only useful for kernel developers
-	  who are working in architecture specific areas of the kernel.
-	  It is probably not a good idea to enable this feature in a production
-	  kernel.
-	  If in doubt, say "N"
+	help
+	  This option enables sanity checks in s390 low-level entry code.
+	  Some of these sanity checks may slow down kernel entries and
+	  exits or otherwise impact performance.
 
-config EARLY_PRINTK
-	def_bool y
+	  If unsure, say N.
+
+config CIO_INJECT
+	bool "CIO Inject interfaces"
+	depends on DEBUG_KERNEL && DEBUG_FS
+	help
+	  This option provides a debugging facility to inject certain artificial events
+	  and instruction responses to the CIO layer of Linux kernel. The newly created
+	  debugfs user-interfaces will be at /sys/kernel/debug/s390/cio/*
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index ba8556bb0fb1..73873e451686 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -3,9 +3,7 @@
 # s390/Makefile
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 #
 # Copyright (C) 1994 by Linus Torvalds
 #
@@ -16,51 +14,51 @@ KBUILD_AFLAGS_MODULE += -fPIC
 KBUILD_CFLAGS_MODULE += -fPIC
 KBUILD_AFLAGS	+= -m64
 KBUILD_CFLAGS	+= -m64
-ifeq ($(CONFIG_RELOCATABLE),y)
 KBUILD_CFLAGS	+= -fPIE
 LDFLAGS_vmlinux	:= -pie
-endif
 aflags_dwarf	:= -Wa,-gdwarf-2
 KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
+ifndef CONFIG_AS_IS_LLVM
 KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf))
-KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2
+endif
+KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack
 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY
-KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain
 KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables
-KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-option,-ffreestanding)
+KBUILD_CFLAGS_DECOMPRESSOR += -ffreestanding
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector
+KBUILD_CFLAGS_DECOMPRESSOR += -fPIE
 KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member)
 KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
 KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,))
+KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
+
 UTS_MACHINE	:= s390x
 STACK_SIZE	:= $(if $(CONFIG_KASAN),65536,16384)
 CHECKFLAGS	+= -D__s390__ -D__s390x__
 
 export LD_BFD
 
-mflags-$(CONFIG_MARCH_Z900)   := -march=z900
-mflags-$(CONFIG_MARCH_Z990)   := -march=z990
-mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109
 mflags-$(CONFIG_MARCH_Z10)    := -march=z10
 mflags-$(CONFIG_MARCH_Z196)   := -march=z196
 mflags-$(CONFIG_MARCH_ZEC12)  := -march=zEC12
 mflags-$(CONFIG_MARCH_Z13)    := -march=z13
 mflags-$(CONFIG_MARCH_Z14)    := -march=z14
 mflags-$(CONFIG_MARCH_Z15)    := -march=z15
+mflags-$(CONFIG_MARCH_Z16)    := -march=z16
 
 export CC_FLAGS_MARCH := $(mflags-y)
 
 aflags-y += $(mflags-y)
 cflags-y += $(mflags-y)
 
-cflags-$(CONFIG_MARCH_Z900_TUNE)   += -mtune=z900
-cflags-$(CONFIG_MARCH_Z990_TUNE)   += -mtune=z990
-cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109
 cflags-$(CONFIG_MARCH_Z10_TUNE)    += -mtune=z10
 cflags-$(CONFIG_MARCH_Z196_TUNE)   += -mtune=z196
 cflags-$(CONFIG_MARCH_ZEC12_TUNE)  += -mtune=zEC12
 cflags-$(CONFIG_MARCH_Z13_TUNE)    += -mtune=z13
 cflags-$(CONFIG_MARCH_Z14_TUNE)    += -mtune=z14
 cflags-$(CONFIG_MARCH_Z15_TUNE)    += -mtune=z15
+cflags-$(CONFIG_MARCH_Z16_TUNE)    += -mtune=z16
 
 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
 
@@ -69,44 +67,38 @@ cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
 #
 cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls
 
-ifeq ($(call cc-option-yn,-mpacked-stack),y)
-cflags-$(CONFIG_PACK_STACK)  += -mpacked-stack -D__PACK_STACK
-aflags-$(CONFIG_PACK_STACK)  += -D__PACK_STACK
-endif
-
 KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y)
 KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y)
 
-ifeq ($(call cc-option-yn,-mstack-size=8192 -mstack-guard=128),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-size=$(STACK_SIZE)
-ifneq ($(call cc-option-yn,-mstack-size=8192),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-guard=$(CONFIG_STACK_GUARD)
-endif
-endif
-
-ifdef CONFIG_WARN_DYNAMIC_STACK
-  ifeq ($(call cc-option-yn,-mwarn-dynamicstack),y)
-    KBUILD_CFLAGS += -mwarn-dynamicstack
-    KBUILD_CFLAGS_DECOMPRESSOR += -mwarn-dynamicstack
+ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),)
+  CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE)
+  ifeq ($(call cc-option,-mstack-size=8192),)
+    CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD)
   endif
+  export CC_FLAGS_CHECK_STACK
+  cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK)
 endif
 
 ifdef CONFIG_EXPOLINE
-  ifeq ($(call cc-option-yn,$(CC_FLAGS_MARCH) -mindirect-branch=thunk),y)
+  ifdef CONFIG_EXPOLINE_EXTERN
+    KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o
+    CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern
+    CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern
+  else
     CC_FLAGS_EXPOLINE := -mindirect-branch=thunk
     CC_FLAGS_EXPOLINE += -mfunction-return=thunk
-    CC_FLAGS_EXPOLINE += -mindirect-branch-table
-    export CC_FLAGS_EXPOLINE
-    cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
-    aflags-y += -DCC_USING_EXPOLINE
   endif
+  CC_FLAGS_EXPOLINE += -mindirect-branch-table
+  export CC_FLAGS_EXPOLINE
+  cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
+  aflags-y += -DCC_USING_EXPOLINE
 endif
 
 ifdef CONFIG_FUNCTION_TRACER
-  ifeq ($(call cc-option-yn,-mfentry -mnop-mcount),n)
+  ifeq ($(call cc-option,-mfentry -mnop-mcount),)
    # make use of hotpatch feature if the compiler supports it
    cc_hotpatch	:= -mhotpatch=0,3
-    ifeq ($(call cc-option-yn,$(cc_hotpatch)),y)
+    ifneq ($(call cc-option,$(cc_hotpatch)),)
      CC_FLAGS_FTRACE := $(cc_hotpatch)
      KBUILD_AFLAGS	+= -DCC_USING_HOTPATCH
      KBUILD_CFLAGS	+= -DCC_USING_HOTPATCH
@@ -117,7 +109,7 @@ endif
 # Test CFI features of binutils
 cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1)
 
-KBUILD_CFLAGS	+= -mbackchain -msoft-float $(cflags-y)
+KBUILD_CFLAGS	+= -mpacked-stack -mbackchain -msoft-float $(cflags-y)
 KBUILD_CFLAGS	+= -pipe -Wno-sign-compare
 KBUILD_CFLAGS	+= -fno-asynchronous-unwind-tables $(cfi)
 KBUILD_AFLAGS	+= $(aflags-y) $(cfi)
@@ -126,16 +118,7 @@ export KBUILD_CFLAGS_DECOMPRESSOR
 
 OBJCOPYFLAGS	:= -O binary
 
-head-y		:= arch/s390/kernel/head64.o
-
-# See arch/s390/Kbuild for content of core part of the kernel
-core-y		+= arch/s390/
-
 libs-y		+= arch/s390/lib/
-drivers-y	+= drivers/s390/
-
-# must be linked after kernel
-drivers-$(CONFIG_OPROFILE)	+= arch/s390/oprofile/
 
 boot		:= arch/s390/boot
 syscalls	:= arch/s390/kernel/syscalls
@@ -146,8 +129,8 @@ all: bzImage
 #KBUILD_IMAGE is necessary for packaging targets like rpm-pkg, deb-pkg...
 KBUILD_IMAGE	:= $(boot)/bzImage
 
-install: vmlinux
-	$(Q)$(MAKE) $(build)=$(boot) $@
+install:
+	$(call cmd,install)
 
 bzImage: vmlinux
	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
@@ -155,19 +138,34 @@ bzImage: vmlinux
 zfcpdump:
	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
-vdso_install:
-	$(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@
-
-archclean:
-	$(Q)$(MAKE) $(clean)=$(boot)
-	$(Q)$(MAKE) $(clean)=$(tools)
-
 archheaders:
	$(Q)$(MAKE) $(build)=$(syscalls) uapi
 
 archprepare:
	$(Q)$(MAKE) $(build)=$(syscalls) kapi
	$(Q)$(MAKE) $(build)=$(tools) kapi
 
+ifeq ($(KBUILD_EXTMOD),)
+# We need to generate vdso-offsets.h before compiling certain files in kernel/.
+# In order to do that, we should use the archprepare target, but we can't since
+# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
+# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
+# Therefore we need to generate the header after prepare0 has been made, hence
+# this hack.
+prepare: vdso_prepare
+vdso_prepare: prepare0
+	$(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h
+	$(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+		$(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h)
+
+vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg
+vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg
+
+ifdef CONFIG_EXPOLINE_EXTERN
+modules_prepare: expoline_prepare
+expoline_prepare: scripts
+	$(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
+endif
+endif
 
 # Don't use tabs in echo arguments
 define archhelp
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index aa738cad1338..c2978cb03b36 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -26,12 +26,10 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/workqueue.h>
-#include <linux/suspend.h>
-#include <linux/platform_device.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
 #include <asm/appldata.h>
 #include <asm/vtimer.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
 #include <asm/smp.h>
 
 #include "appldata.h"
@@ -44,17 +42,14 @@
 
 #define TOD_MICRO	0x01000		/* nr. of TOD clock units for 1 microsecond */
 
-static struct platform_device *appldata_pdev;
-
 /*
  * /proc entries (sysctl)
  */
 static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
 static int appldata_timer_handler(struct ctl_table *ctl, int write,
-				  void __user *buffer, size_t *lenp, loff_t *ppos);
+				  void *buffer, size_t *lenp, loff_t *ppos);
 static int appldata_interval_handler(struct ctl_table *ctl, int write,
-				     void __user *buffer,
-				     size_t *lenp, loff_t *ppos);
+				     void *buffer, size_t *lenp, loff_t *ppos);
 
 static struct ctl_table_header *appldata_sysctl_header;
 static struct ctl_table appldata_table[] = {
@@ -68,17 +63,6 @@ static struct ctl_table appldata_table[] = {
 		.mode		= S_IRUGO | S_IWUSR,
 		.proc_handler	= appldata_interval_handler,
 	},
-	{ },
-};
-
-static struct ctl_table appldata_dir_table[] = {
-	{
-		.procname	= appldata_proc_name,
-		.maxlen		= 0,
-		.mode		= S_IRUGO | S_IXUGO,
-		.child		= appldata_table,
-	},
-	{ },
 };
 
 /*
@@ -89,7 +73,6 @@ static struct vtimer_list appldata_timer;
 static DEFINE_SPINLOCK(appldata_timer_lock);
 static int appldata_interval = APPLDATA_CPU_INTERVAL;
 static int appldata_timer_active;
-static int appldata_timer_suspended = 0;
 
 /*
  * Work queue
@@ -217,7 +200,7 @@ static void __appldata_vtimer_setup(int cmd)
  */
 static int
 appldata_timer_handler(struct ctl_table *ctl, int write,
-		       void __user *buffer, size_t *lenp, loff_t *ppos)
+		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int timer_active = appldata_timer_active;
 	int rc;
@@ -250,7 +233,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write,
  */
 static int
 appldata_interval_handler(struct ctl_table *ctl, int write,
-			  void __user *buffer, size_t *lenp, loff_t *ppos)
+			  void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int interval = appldata_interval;
 	int rc;
@@ -280,7 +263,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write,
  */
 static int
 appldata_generic_handler(struct ctl_table *ctl, int write,
-			 void __user *buffer, size_t *lenp, loff_t *ppos)
+			 void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct appldata_ops *ops = NULL, *tmp_ops;
 	struct list_head *lh;
@@ -297,7 +280,7 @@ appldata_generic_handler(struct ctl_table *ctl, int write,
 	mutex_lock(&appldata_ops_mutex);
 	list_for_each(lh, &appldata_ops_list) {
 		tmp_ops = list_entry(lh, struct appldata_ops, list);
-		if (&tmp_ops->ctl_table[2] == ctl) {
+		if (&tmp_ops->ctl_table[0] == ctl) {
 			found = 1;
 		}
 	}
@@ -367,7 +350,7 @@ int appldata_register_ops(struct appldata_ops *ops)
 	if (ops->size > APPLDATA_MAX_REC_SIZE)
 		return -EINVAL;
 
-	ops->ctl_table = kcalloc(4, sizeof(struct ctl_table), GFP_KERNEL);
+	ops->ctl_table = kcalloc(1, sizeof(struct ctl_table), GFP_KERNEL);
 	if (!ops->ctl_table)
 		return -ENOMEM;
 
@@ -375,17 +358,12 @@ int appldata_register_ops(struct appldata_ops *ops)
 	list_add(&ops->list, &appldata_ops_list);
 	mutex_unlock(&appldata_ops_mutex);
 
-	ops->ctl_table[0].procname = appldata_proc_name;
-	ops->ctl_table[0].maxlen   = 0;
-	ops->ctl_table[0].mode     = S_IRUGO | S_IXUGO;
-	ops->ctl_table[0].child    = &ops->ctl_table[2];
-
-	ops->ctl_table[2].procname = ops->name;
-	ops->ctl_table[2].mode     = S_IRUGO | S_IWUSR;
-	ops->ctl_table[2].proc_handler = appldata_generic_handler;
-	ops->ctl_table[2].data = ops;
+	ops->ctl_table[0].procname = ops->name;
+	ops->ctl_table[0].mode     = S_IRUGO | S_IWUSR;
+	ops->ctl_table[0].proc_handler = appldata_generic_handler;
+	ops->ctl_table[0].data = ops;
 
-	ops->sysctl_header = register_sysctl_table(ops->ctl_table);
+	ops->sysctl_header = register_sysctl_sz(appldata_proc_name, ops->ctl_table, 1);
 	if (!ops->sysctl_header)
 		goto out;
 	return 0;
@@ -413,88 +391,6 @@ void appldata_unregister_ops(struct appldata_ops *ops)
 /********************** module-ops management <END> **************************/
 
 
-/**************************** suspend / resume *******************************/
-static int appldata_freeze(struct device *dev)
-{
-	struct appldata_ops *ops;
-	int rc;
-	struct list_head *lh;
-
-	spin_lock(&appldata_timer_lock);
-	if (appldata_timer_active) {
-		__appldata_vtimer_setup(APPLDATA_DEL_TIMER);
-		appldata_timer_suspended = 1;
-	}
-	spin_unlock(&appldata_timer_lock);
-
-	mutex_lock(&appldata_ops_mutex);
-	list_for_each(lh, &appldata_ops_list) {
-		ops = list_entry(lh, struct appldata_ops, list);
-		if (ops->active == 1) {
-			rc = appldata_diag(ops->record_nr, APPLDATA_STOP_REC,
-					(unsigned long) ops->data, ops->size,
-					ops->mod_lvl);
-			if (rc != 0)
-				pr_err("Stopping the data collection for %s "
-				       "failed with rc=%d\n", ops->name, rc);
-		}
-	}
-	mutex_unlock(&appldata_ops_mutex);
-	return 0;
-}
-
-static int appldata_restore(struct device *dev)
-{
-	struct appldata_ops *ops;
-	int rc;
-	struct list_head *lh;
-
-	spin_lock(&appldata_timer_lock);
-	if (appldata_timer_suspended) {
-		__appldata_vtimer_setup(APPLDATA_ADD_TIMER);
-		appldata_timer_suspended = 0;
-	}
-	spin_unlock(&appldata_timer_lock);
-
-	mutex_lock(&appldata_ops_mutex);
-	list_for_each(lh, &appldata_ops_list) {
-		ops = list_entry(lh, struct appldata_ops, list);
-		if (ops->active == 1) {
-			ops->callback(ops->data);	// init record
-			rc = appldata_diag(ops->record_nr,
-					APPLDATA_START_INTERVAL_REC,
-					(unsigned long) ops->data, ops->size,
-					ops->mod_lvl);
-			if (rc != 0) {
-				pr_err("Starting the data collection for %s "
-				       "failed with rc=%d\n", ops->name, rc);
-			}
-		}
-	}
-	mutex_unlock(&appldata_ops_mutex);
-	return 0;
-}
-
-static int appldata_thaw(struct device *dev)
-{
-	return appldata_restore(dev);
-}
-
-static const struct dev_pm_ops appldata_pm_ops = {
-	.freeze		= appldata_freeze,
-	.thaw		= appldata_thaw,
-	.restore	= appldata_restore,
-};
-
-static struct platform_driver appldata_pdrv = {
-	.driver = {
-		.name	= "appldata",
-		.pm	= &appldata_pm_ops,
-	},
-};
-/************************* suspend / resume <END> ****************************/
-
-
 /******************************* init / exit *********************************/
 
 /*
@@ -504,36 +400,14 @@ static struct platform_driver appldata_pdrv = {
  */
 static int __init appldata_init(void)
 {
-	int rc;
-
 	init_virt_timer(&appldata_timer);
 	appldata_timer.function = appldata_timer_function;
 	appldata_timer.data = (unsigned long) &appldata_work;
-
-	rc = platform_driver_register(&appldata_pdrv);
-	if (rc)
-		return rc;
-
-	appldata_pdev = platform_device_register_simple("appldata", -1, NULL,
-							0);
-	if (IS_ERR(appldata_pdev)) {
-		rc = PTR_ERR(appldata_pdev);
-		goto out_driver;
-	}
 	appldata_wq = alloc_ordered_workqueue("appldata", 0);
-	if (!appldata_wq) {
-		rc = -ENOMEM;
-		goto out_device;
-	}
-
-	appldata_sysctl_header = register_sysctl_table(appldata_dir_table);
+	if (!appldata_wq)
+		return -ENOMEM;
+	appldata_sysctl_header = register_sysctl(appldata_proc_name, appldata_table);
 	return 0;
-
-out_device:
-	platform_device_unregister(appldata_pdev);
-out_driver:
-	platform_driver_unregister(&appldata_pdrv);
-	return rc;
 }
 __initcall(appldata_init);
 
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index e68136c3c23a..fc608f9b79ab 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -15,7 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include "appldata.h"
 
@@ -29,10 +29,6 @@
 * the structure version (product ID, see appldata_base.c) needs to be changed
 * as well and all documentation and z/VM applications using it must be
 * updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
 */
 struct appldata_mem_data {
	u64 timestamp;
diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c
index 8bc14b0d1def..59c282ca002f 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -25,10 +25,6 @@
 * This is accessed as binary data by z/VM. If changes to it can't be avoided,
 * the structure version (product ID, see appldata_base.c) needs to be changed
 * as well and all documentation and z/VM applications using it must be updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
 */
 struct appldata_net_sum_data {
	u64 timestamp;
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 54f375627532..a363d30ce739 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -32,10 +32,6 @@
 * the structure version (product ID, see appldata_base.c) needs to be changed
 * as well and all documentation and z/VM applications using it must be
 * updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
 */
 struct appldata_os_per_cpu {
	u32 per_cpu_user;	/* timer ticks spent in user mode */
@@ -75,7 +71,7 @@ struct appldata_os_data {
					     (waiting for I/O)       */
 
	/* per cpu data */
-	struct appldata_os_per_cpu os_cpu[0];
+	struct appldata_os_per_cpu os_cpu[];
 } __attribute__((packed));
 
 static struct appldata_os_data *appldata_os_data;
@@ -133,8 +129,7 @@ static void appldata_get_os_data(void *data)
 
	os_data->nr_cpus = j;
 
-	new_size = sizeof(struct appldata_os_data) +
-		   (os_data->nr_cpus * sizeof(struct appldata_os_per_cpu));
+	new_size = struct_size(os_data, os_cpu, os_data->nr_cpus);
	if (ops.size != new_size) {
		if (ops.active) {
			rc = appldata_diag(APPLDATA_RECORD_OS_ID,
@@ -169,8 +164,7 @@ static int __init appldata_os_init(void)
 {
	int rc, max_size;
 
-	max_size = sizeof(struct appldata_os_data) +
-		   (num_possible_cpus() * sizeof(struct appldata_os_per_cpu));
+	max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus());
	if (max_size > APPLDATA_MAX_REC_SIZE) {
		pr_err("Maximum OS record size %i exceeds the maximum "
		       "record size %i\n", max_size, APPLDATA_MAX_REC_SIZE);
diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore
index 16ff906e4610..f56591bc0897 100644
--- a/arch/s390/boot/.gitignore
+++ b/arch/s390/boot/.gitignore
@@ -1,3 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
 image
 bzImage
 section_cmp.*
+vmlinux
+vmlinux.lds
+vmlinux.syms
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index e2c47d3a1c89..c7c81e5f9218 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -7,6 +7,7 @@ KCOV_INSTRUMENT := n
 GCOV_PROFILE := n
 UBSAN_SANITIZE := n
 KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
 
 KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
 KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
@@ -34,16 +35,24 @@ endif
 
 CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
 
-obj-y	:= head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
+obj-y	:= head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
 obj-y	+= string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y	+= version.o pgm_check_info.o ctype.o text_dma.o
-obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST)	+= uv.o
-obj-$(CONFIG_RELOCATABLE)	+= machine_kexec_reloc.o
+obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
 obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr.o
-targets	:= bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
-subdir-	:= compressed
+obj-y	+= $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
+obj-$(CONFIG_KERNEL_ZSTD)	+= clz_ctz.o
+obj-all	:= $(obj-y) piggy.o syms.o
+
+targets	:= bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
+targets	+= vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
+targets	+= vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
+targets	+= vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
 
 OBJECTS := $(addprefix $(obj)/,$(obj-y))
+OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
+
+clean-files += vmlinux.map
 
 quiet_cmd_section_cmp = SECTCMP $*
 define cmd_section_cmp
@@ -58,22 +67,67 @@ define cmd_section_cmp
	touch $@
 endef
 
-$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
+$(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
	$(call if_changed,objcopy)
 
-$(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE
+$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE
	$(call if_changed,section_cmp)
 
-$(obj)/compressed/vmlinux: $(obj)/startup.a FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/compressed $@
+LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE
+	$(call if_changed,ld)
+
+LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T
+$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE
+	$(call if_changed,ld)
+
+quiet_cmd_dumpsyms = DUMPSYMS $<
+define cmd_dumpsyms
+	$(NM) -n -S --format=bsd "$<" | sed -nE 's/^0*([0-9a-fA-F]+) 0*([0-9a-fA-F]+) [tT] ([^ ]*)$$/\1 \2 \3/p' | tr '\n' '\0' > "$@"
+endef
+
+$(obj)/syms.bin: $(obj)/vmlinux.syms FORCE
+	$(call if_changed,dumpsyms)
 
-$(obj)/startup.a: $(OBJECTS) FORCE
-	$(call if_changed,ar)
+OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.decompressor.syms
+$(obj)/syms.o: $(obj)/syms.bin FORCE
+	$(call if_changed,objcopy)
 
-install: $(CONFIGURE) $(obj)/bzImage
-	sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
-	      System.map "$(INSTALL_PATH)"
+OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
+$(obj)/info.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
 
-chkbss := $(obj-y)
-chkbss-target := startup.a
-include $(srctree)/arch/s390/scripts/Makefile.chkbss
+OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
+$(obj)/info.o: $(obj)/info.bin FORCE
+	$(call if_changed,objcopy)
+
+OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
+$(obj)/vmlinux.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
+
+suffix-$(CONFIG_KERNEL_GZIP)  := .gz
+suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
+suffix-$(CONFIG_KERNEL_LZ4)   := .lz4
+suffix-$(CONFIG_KERNEL_LZMA)  := .lzma
+suffix-$(CONFIG_KERNEL_LZO)   := .lzo
+suffix-$(CONFIG_KERNEL_XZ)    := .xz
+suffix-$(CONFIG_KERNEL_ZSTD)  := .zst
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,bzip2_with_size)
+$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lz4_with_size)
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lzma_with_size)
+$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lzo_with_size)
+$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,xzkern_with_size)
+$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,zstd22_with_size)
+
+OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
+$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE
+	$(call if_changed,objcopy)
diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c
index ff6801d401c4..47c48fbfb563 100644
--- a/arch/s390/boot/als.c
+++ b/arch/s390/boot/als.c
@@ -68,7 +68,7 @@ void print_missing_facilities(void)
 
	first = 1;
	for (i = 0; i < ARRAY_SIZE(als); i++) {
-		val = ~S390_lowcore.stfle_fac_list[i] & als[i];
+		val = ~stfle_fac_list[i] & als[i];
		for (j = 0; j < BITS_PER_LONG; j++) {
			if (!(val & (1UL << (BITS_PER_LONG - 1 - j))))
				continue;
@@ -106,9 +106,9 @@ void verify_facilities(void)
 {
	int i;
 
-	__stfle(S390_lowcore.stfle_fac_list, ARRAY_SIZE(S390_lowcore.stfle_fac_list));
+	__stfle(stfle_fac_list, ARRAY_SIZE(stfle_fac_list));
	for (i = 0; i < ARRAY_SIZE(als); i++) {
-		if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i])
+		if ((stfle_fac_list[i] & als[i]) != als[i])
			facility_mismatch();
	}
 }
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 2ea603f70c3b..222c6886acf6 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -2,20 +2,101 @@
 #ifndef BOOT_BOOT_H
 #define BOOT_BOOT_H
 
+#include <linux/types.h>
+
+#define IPL_START	0x200
+
+#ifndef __ASSEMBLY__
+
+#include <asm/physmem_info.h>
+
+struct machine_info {
+	unsigned char has_edat1 : 1;
+	unsigned char has_edat2 : 1;
+	unsigned char has_nx : 1;
+};
+
+struct vmlinux_info {
+	unsigned long default_lma;
+	unsigned long entry;
+	unsigned long image_size;	/* does not include .bss */
+	unsigned long bss_size;		/* uncompressed image .bss size */
+	unsigned long bootdata_off;
+	unsigned long bootdata_size;
+	unsigned long bootdata_preserved_off;
+	unsigned long bootdata_preserved_size;
+	unsigned long dynsym_start;
+	unsigned long rela_dyn_start;
+	unsigned long rela_dyn_end;
+	unsigned long amode31_size;
+	unsigned long init_mm_off;
+	unsigned long swapper_pg_dir_off;
+	unsigned long invalid_pg_dir_off;
+#ifdef CONFIG_KASAN
+	unsigned long kasan_early_shadow_page_off;
+	unsigned long kasan_early_shadow_pte_off;
+	unsigned long kasan_early_shadow_pmd_off;
+	unsigned long kasan_early_shadow_pud_off;
+	unsigned long kasan_early_shadow_p4d_off;
+#endif
+};
+
 void startup_kernel(void);
-void detect_memory(void);
+unsigned long detect_max_physmem_end(void);
+void detect_physmem_online_ranges(unsigned long max_physmem_end);
+void physmem_set_usable_limit(unsigned long limit);
+void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size);
+void physmem_free(enum reserved_range_type type);
+/* for continuous/multiple allocations per type */
+unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size,
+				     unsigned long align);
+/* for single allocations, 1 per type */
+unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size,
+				  unsigned long align, unsigned long min, unsigned long max,
+				  bool die_on_oom);
+unsigned long get_physmem_alloc_pos(void);
+bool ipl_report_certs_intersects(unsigned long addr, unsigned long size,
+				 unsigned long *intersection_start);
+bool is_ipl_block_dump(void);
 void store_ipl_parmblock(void);
+int read_ipl_report(void);
+void save_ipl_cert_comp_list(void);
 void setup_boot_command_line(void);
 void parse_boot_command_line(void);
-void setup_memory_end(void);
 void verify_facilities(void);
 void print_missing_facilities(void);
+void sclp_early_setup_buffer(void);
 void print_pgm_check_info(void);
-unsigned long get_random_base(unsigned long safe_addr);
+unsigned long randomize_within_range(unsigned long size, unsigned long align,
+				     unsigned long min, unsigned long max);
+void setup_vmem(unsigned long asce_limit);
+void __printf(1, 2) decompressor_printk(const char *fmt, ...);
+void print_stacktrace(unsigned long sp);
+void error(char *m);
+
+extern struct machine_info machine;
 
-extern int kaslr_enabled;
+/* Symbols defined by linker scripts */
 extern const char kernel_version[];
+extern unsigned long memory_limit;
+extern unsigned long vmalloc_size;
+extern int vmalloc_size_set;
+extern char __boot_data_start[], __boot_data_end[];
+extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
+extern char _decompressor_syms_start[], _decompressor_syms_end[];
+extern char _stack_start[], _stack_end[];
+extern char _end[], _decompressor_end[];
+extern unsigned char _compressed_start[];
+extern unsigned char _compressed_end[];
+extern struct vmlinux_info _vmlinux_info;
+#define vmlinux _vmlinux_info
 
-unsigned long read_ipl_report(unsigned long safe_offset);
+#define __abs_lowcore_pa(x)	(((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))
+
+static inline bool intersects(unsigned long addr0, unsigned long size0,
+			      unsigned long addr1, unsigned long size1)
+{
+	return addr0 + size0 > addr1 && addr1 + size1 > addr0;
+}
+#endif /* __ASSEMBLY__ */
 #endif /* BOOT_BOOT_H */
diff --git a/arch/s390/boot/clz_ctz.c b/arch/s390/boot/clz_ctz.c
new file mode 100644
index 000000000000..c3ebf248596b
--- /dev/null
+++ b/arch/s390/boot/clz_ctz.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/clz_ctz.c"
diff --git a/arch/s390/boot/compressed/.gitignore b/arch/s390/boot/compressed/.gitignore
deleted file mode 100644
index e72fcd7ecebb..000000000000
--- a/arch/s390/boot/compressed/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-vmlinux
-vmlinux.lds
diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile
deleted file mode 100644
index fa529c5b4486..000000000000
--- a/arch/s390/boot/compressed/Makefile
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# linux/arch/s390/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-KCOV_INSTRUMENT := n
-GCOV_PROFILE := n
-UBSAN_SANITIZE := n
-KASAN_SANITIZE := n
-
-obj-y := $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) piggy.o info.o
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
-targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
-targets += info.bin $(obj-y)
-
-KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
-KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
-OBJCOPYFLAGS :=
-
-OBJECTS := $(addprefix $(obj)/,$(obj-y))
-
-LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(objtree)/arch/s390/boot/startup.a $(OBJECTS) FORCE
-	$(call if_changed,ld)
-
-OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
-$(obj)/info.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
-$(obj)/info.o: $(obj)/info.bin FORCE
-	$(call if_changed,objcopy)
-
-OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
-$(obj)/vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-vmlinux.bin.all-y := $(obj)/vmlinux.bin
-
-suffix-$(CONFIG_KERNEL_GZIP)  := .gz
-suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
-suffix-$(CONFIG_KERNEL_LZ4)   := .lz4
-suffix-$(CONFIG_KERNEL_LZMA)  := .lzma
-suffix-$(CONFIG_KERNEL_LZO)   := .lzo
-suffix-$(CONFIG_KERNEL_XZ)    := .xz
-
-$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,lz4)
-$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,lzma)
-$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,lzo)
-$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,xzkern)
-
-OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
-$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE - $(call if_changed,objcopy) - -chkbss := $(filter-out piggy.o info.o, $(obj-y)) -chkbss-target := vmlinux.bin -include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h deleted file mode 100644 index c15eb7114d83..000000000000 --- a/arch/s390/boot/compressed/decompressor.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H -#define BOOT_COMPRESSED_DECOMPRESSOR_H - -#ifdef CONFIG_KERNEL_UNCOMPRESSED -static inline void *decompress_kernel(void) {} -#else -void *decompress_kernel(void); -#endif -unsigned long mem_safe_offset(void); -void error(char *m); - -struct vmlinux_info { - unsigned long default_lma; - void (*entry)(void); - unsigned long image_size; /* does not include .bss */ - unsigned long bss_size; /* uncompressed image .bss size */ - unsigned long bootdata_off; - unsigned long bootdata_size; - unsigned long bootdata_preserved_off; - unsigned long bootdata_preserved_size; - unsigned long dynsym_start; - unsigned long rela_dyn_start; - unsigned long rela_dyn_end; -}; - -extern char _vmlinux_info[]; -#define vmlinux (*(struct vmlinux_info *)_vmlinux_info) - -#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/compressed/decompressor.c b/arch/s390/boot/decompressor.c index 45046630c56a..d762733a0753 100644 --- a/arch/s390/boot/compressed/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -11,12 +11,12 @@ #include <linux/string.h> #include <asm/page.h> #include "decompressor.h" +#include "boot.h" /* * gzip declarations */ #define STATIC static -#define STATIC_RW_DATA static __section(.data) #undef memset #undef memcpy @@ -24,19 +24,16 @@ #define memmove memmove #define memzero(s, n) memset((s), 0, (n)) -/* Symbols defined by linker scripts */ -extern char _end[]; -extern unsigned char _compressed_start[]; -extern unsigned char _compressed_end[]; - -#ifdef CONFIG_HAVE_KERNEL_BZIP2 -#define HEAP_SIZE 0x400000 +#if defined(CONFIG_KERNEL_BZIP2) +#define BOOT_HEAP_SIZE 0x400000 +#elif defined(CONFIG_KERNEL_ZSTD) +#define BOOT_HEAP_SIZE 0x30000 #else -#define HEAP_SIZE 0x10000 +#define BOOT_HEAP_SIZE 0x10000 #endif static unsigned long free_mem_ptr = (unsigned long) _end; -static unsigned long free_mem_end_ptr = (unsigned long) _end + HEAP_SIZE; +static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE; #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" @@ -62,7 +59,11 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + HEAP_SIZE; #include "../../../../lib/decompress_unxz.c" #endif -#define decompress_offset ALIGN((unsigned long)_end + HEAP_SIZE, PAGE_SIZE) +#ifdef CONFIG_KERNEL_ZSTD +#include "../../../../lib/decompress_unzstd.c" +#endif + +#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE) unsigned long mem_safe_offset(void) { @@ -80,6 +81,6 @@ void *decompress_kernel(void) void *output = (void *)decompress_offset; __decompress(_compressed_start, _compressed_end - _compressed_start, - NULL, NULL, output, 0, NULL, error); + NULL, NULL, output, vmlinux.image_size, NULL, error); return output; } diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h new file mode 100644 index 000000000000..92b81d2ea35d --- /dev/null +++ b/arch/s390/boot/decompressor.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H +#define 
BOOT_COMPRESSED_DECOMPRESSOR_H + +#ifdef CONFIG_KERNEL_UNCOMPRESSED +static inline void *decompress_kernel(void) { return NULL; } +#else +void *decompress_kernel(void); +#endif +unsigned long mem_safe_offset(void); + +#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 4b86a8d3c121..637c29c3f6e3 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -5,7 +5,6 @@ * Author(s): Hartmut Penner <hp@de.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * Rob van der Heij <rvdhei@iae.nl> - * Heiko Carstens <heiko.carstens@de.ibm.com> * * There are 5 different IPL methods * 1) load the image directly into ram at address 0 and do an PSW restart @@ -25,259 +24,201 @@ #include <linux/init.h> #include <linux/linkage.h> #include <asm/asm-offsets.h> -#include <asm/thread_info.h> #include <asm/page.h> #include <asm/ptrace.h> +#include <asm/sclp.h> +#include "boot.h" -#define ARCH_OFFSET 4 +#define EP_OFFSET 0x10008 +#define EP_STRING "S390EP" +#define IPL_BS 0x730 __HEAD - -#define IPL_BS 0x730 - .org 0 - .long 0x00080000,0x80000000+iplstart # The first 24 bytes are loaded - .long 0x02000018,0x60000050 # by ipl to addresses 0-23. - .long 0x02000068,0x60000050 # (a PSW and two CCWs). - .fill 80-24,1,0x40 # bytes 24-79 are discarded !! - .long 0x020000f0,0x60000050 # The next 160 byte are loaded - .long 0x02000140,0x60000050 # to addresses 0x18-0xb7 - .long 0x02000190,0x60000050 # They form the continuation - .long 0x020001e0,0x60000050 # of the CCW program started - .long 0x02000230,0x60000050 # by ipl and load the range - .long 0x02000280,0x60000050 # 0x0f0-0x730 from the image - .long 0x020002d0,0x60000050 # to the range 0x0f0-0x730 - .long 0x02000320,0x60000050 # in memory. At the end of - .long 0x02000370,0x60000050 # the channel program the PSW - .long 0x020003c0,0x60000050 # at location 0 is loaded. - .long 0x02000410,0x60000050 # Initial processing starts - .long 0x02000460,0x60000050 # at 0x200 = iplstart. - .long 0x020004b0,0x60000050 - .long 0x02000500,0x60000050 - .long 0x02000550,0x60000050 - .long 0x020005a0,0x60000050 - .long 0x020005f0,0x60000050 - .long 0x02000640,0x60000050 - .long 0x02000690,0x60000050 - .long 0x020006e0,0x20000050 - - .org __LC_RST_NEW_PSW # 0x1a0 - .quad 0,iplstart - .org __LC_PGM_NEW_PSW # 0x1d0 - .quad 0x0000000180000000,startup_pgm_check_handler - - .org 0x200 - +ipl_start: + mvi __LC_AR_MODE_ID,1 # set esame flag + slr %r0,%r0 # set cpuid to zero + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # switch to esame mode + sam64 # switch to 64 bit addressing mode + lgh %r1,__LC_SUBCHANNEL_ID # test if subchannel number + brctg %r1,.Lnoload # is valid + llgf %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number + lghi %r2,IPL_BS # load start address + bras %r14,.Lloader # load rest of ipl image + larl %r12,parmarea # pointer to parameter area + stg %r1,IPL_DEVICE-PARMAREA(%r12) # save ipl device number +# +# load parameter file from ipl device +# +.Lagain1: + larl %r2,_end # ramdisk loc. is temp + bras %r14,.Lloader # load parameter file + ltgr %r2,%r2 # got anything ? 
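
The parameter-file path continued just below clamps the loaded length against the max_command_line_size field of the parameter area (minus one, so a terminating zero always fits) before copying the text into the command-line buffer. A hedged C rendering of that clamp; the function and variable names here are illustrative, not from the patch:

#include <stdio.h>
#include <string.h>

/* Illustrative C equivalent of the truncation done in ipl_start:
 * never copy more than max_command_line_size - 1 bytes, so the
 * command line always has room for a trailing NUL byte. */
static void copy_boot_command_line(char *dst, unsigned long max_command_line_size,
				   const char *parm_file, unsigned long len)
{
	if (len > max_command_line_size - 1)
		len = max_command_line_size - 1;
	memcpy(dst, parm_file, len);
	dst[len] = '\0';
}

int main(void)
{
	const char *parm = "root=/dev/ram0 ro quiet";
	char cmdline[16];

	copy_boot_command_line(cmdline, sizeof(cmdline), parm, strlen(parm));
	printf("%s\n", cmdline);	/* truncated to 15 characters plus NUL */
	return 0;
}
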
+ jz .Lnopf + lg %r3,MAX_COMMAND_LINE_SIZE-PARMAREA(%r12) + aghi %r3,-1 + clgr %r2,%r3 + jl .Lnotrunc + lgr %r2,%r3 +.Lnotrunc: + larl %r4,_end + larl %r13,.L_hdr + clc 0(3,%r4),0(%r13) # if it is HDRx + jz .Lagain1 # skip dataset header + larl %r13,.L_eof + clc 0(3,%r4),0(%r13) # if it is EOFx + jz .Lagain1 # skip data set trailer + lgr %r5,%r2 + la %r6,COMMAND_LINE-PARMAREA(%r12) + lgr %r7,%r2 + aghi %r7,1 + mvcl %r6,%r4 +.Lnopf: +# +# load ramdisk from ipl device +# +.Lagain2: + larl %r2,_end # addr of ramdisk + stg %r2,INITRD_START-PARMAREA(%r12) + bras %r14,.Lloader # load ramdisk + stg %r2,INITRD_SIZE-PARMAREA(%r12) # store size of rd + ltgr %r2,%r2 + jnz .Lrdcont + stg %r2,INITRD_START-PARMAREA(%r12) # no ramdisk found +.Lrdcont: + larl %r2,_end + larl %r13,.L_hdr # skip HDRx and EOFx + clc 0(3,%r2),0(%r13) + jz .Lagain2 + larl %r13,.L_eof + clc 0(3,%r2),0(%r13) + jz .Lagain2 +# +# reset files in VM reader +# + larl %r13,.Lcpuid + stidp 0(%r13) # store cpuid + tm 0(%r13),0xff # running VM ? + jno .Lnoreset + larl %r2,.Lreset + lghi %r3,26 + diag %r2,%r3,8 + larl %r5,.Lirb + stsch 0(%r5) # check if irq is pending + tm 30(%r5),0x0f # by verifying if any of the + jnz .Lwaitforirq # activity or status control + tm 31(%r5),0xff # bits is set in the schib + jz .Lnoreset +.Lwaitforirq: + bras %r14,.Lirqwait # wait for IO interrupt + c %r1,__LC_SUBCHANNEL_ID # compare subchannel number + jne .Lwaitforirq + larl %r5,.Lirb + tsch 0(%r5) +.Lnoreset: + j .Lnoload +# +# everything loaded, go for it +# +.Lnoload: + jg startup # # subroutine to wait for end I/O # .Lirqwait: - mvc __LC_IO_NEW_PSW(16),.Lnewpsw # set up IO interrupt psw - lpsw .Lwaitpsw + larl %r13,.Lnewpswmask # set up IO interrupt psw + mvc __LC_IO_NEW_PSW(8),0(%r13) + stg %r14,__LC_IO_NEW_PSW+8 + larl %r13,.Lwaitpsw + lpswe 0(%r13) .Lioint: - br %r14 - .align 8 -.Lnewpsw: - .quad 0x0000000080000000,.Lioint -.Lwaitpsw: - .long 0x020a0000,0x80000000+.Lioint - # # subroutine for loading cards from the reader # .Lloader: - la %r4,0(%r14) - la %r3,.Lorb # r2 = address of orb into r2 - la %r5,.Lirb # r4 = address of irb - la %r6,.Lccws - la %r7,20 + lgr %r4,%r14 + larl %r3,.Lorb # r2 = address of orb into r2 + larl %r5,.Lirb # r4 = address of irb + larl %r6,.Lccws + lghi %r7,20 .Linit: st %r2,4(%r6) # initialize CCW data addresses la %r2,0x50(%r2) la %r6,8(%r6) - bct 7,.Linit - - lctl %c6,%c6,.Lcr6 # set IO subclass mask - slr %r2,%r2 + brctg %r7,.Linit + larl %r13,.Lcr6 + lctlg %c6,%c6,0(%r13) + xgr %r2,%r2 .Lldlp: ssch 0(%r3) # load chunk of 1600 bytes - bnz .Llderr + jnz .Llderr .Lwait4irq: - bas %r14,.Lirqwait + bras %r14,.Lirqwait c %r1,__LC_SUBCHANNEL_ID # compare subchannel number - bne .Lwait4irq + jne .Lwait4irq tsch 0(%r5) - - slr %r0,%r0 + xgr %r0,%r0 ic %r0,8(%r5) # get device status - chi %r0,8 # channel end ? - be .Lcont - chi %r0,12 # channel end + device end ? - be .Lcont - - l %r0,4(%r5) - s %r0,8(%r3) # r0/8 = number of ccws executed - mhi %r0,10 # *10 = number of bytes in ccws - lh %r3,10(%r5) # get residual count - sr %r0,%r3 # #ccws*80-residual=#bytes read - ar %r2,%r0 - + cghi %r0,8 # channel end ? + je .Lcont + cghi %r0,12 # channel end + device end ? 
+ je .Lcont + llgf %r0,4(%r5) + sgf %r0,8(%r3) # r0/8 = number of ccws executed + mghi %r0,10 # *10 = number of bytes in ccws + llgh %r3,10(%r5) # get residual count + sgr %r0,%r3 # #ccws*80-residual=#bytes read + agr %r2,%r0 br %r4 # r2 contains the total size - .Lcont: - ahi %r2,0x640 # add 0x640 to total size - la %r6,.Lccws - la %r7,20 + aghi %r2,0x640 # add 0x640 to total size + larl %r6,.Lccws + lghi %r7,20 .Lincr: l %r0,4(%r6) # update CCW data addresses - ahi %r0,0x640 + aghi %r0,0x640 st %r0,4(%r6) - ahi %r6,8 - bct 7,.Lincr - - b .Lldlp + aghi %r6,8 + brctg %r7,.Lincr + j .Lldlp .Llderr: - lpsw .Lcrash + larl %r13,.Lcrash + lpsw 0(%r13) - .align 8 + .balign 8 +.Lwaitpsw: + .quad 0x0202000180000000,.Lioint +.Lnewpswmask: + .quad 0x0000000180000000 + .balign 8 .Lorb: .long 0x00000000,0x0080ff00,.Lccws .Lirb: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.Lcr6: .long 0xff000000 -.Lloadp:.long 0,0 - .align 8 + .balign 8 +.Lcr6: .quad 0x00000000ff000000 + .balign 8 .Lcrash:.long 0x000a0000,0x00000000 - - .align 8 + .balign 8 .Lccws: .rept 19 .long 0x02600050,0x00000000 .endr .long 0x02200050,0x00000000 - -iplstart: - mvi __LC_AR_MODE_ID,1 # set esame flag - slr %r0,%r0 # set cpuid to zero - lhi %r1,2 # mode 2 = esame (dump) - sigp %r1,%r0,0x12 # switch to esame mode - bras %r13,0f - .fill 16,4,0x0 -0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs - sam31 # switch to 31 bit addressing mode - lh %r1,__LC_SUBCHANNEL_ID # test if subchannel number - bct %r1,.Lnoload # is valid - l %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number - la %r2,IPL_BS # load start address - bas %r14,.Lloader # load rest of ipl image - l %r12,.Lparm # pointer to parameter area - st %r1,IPL_DEVICE+ARCH_OFFSET-PARMAREA(%r12) # save ipl device number - -# -# load parameter file from ipl device -# -.Lagain1: - l %r2,.Linitrd # ramdisk loc. is temp - bas %r14,.Lloader # load parameter file - ltr %r2,%r2 # got anything ? - bz .Lnopf - chi %r2,895 - bnh .Lnotrunc - la %r2,895 -.Lnotrunc: - l %r4,.Linitrd - clc 0(3,%r4),.L_hdr # if it is HDRx - bz .Lagain1 # skip dataset header - clc 0(3,%r4),.L_eof # if it is EOFx - bz .Lagain1 # skip dateset trailer - la %r5,0(%r4,%r2) - lr %r3,%r2 - la %r3,COMMAND_LINE-PARMAREA(%r12) # load adr. of command line - mvc 0(256,%r3),0(%r4) - mvc 256(256,%r3),256(%r4) - mvc 512(256,%r3),512(%r4) - mvc 768(122,%r3),768(%r4) - slr %r0,%r0 - b .Lcntlp -.Ldelspc: - ic %r0,0(%r2,%r3) - chi %r0,0x20 # is it a space ? - be .Lcntlp - ahi %r2,1 - b .Leolp -.Lcntlp: - brct %r2,.Ldelspc -.Leolp: - slr %r0,%r0 - stc %r0,0(%r2,%r3) # terminate buffer -.Lnopf: - -# -# load ramdisk from ipl device -# -.Lagain2: - l %r2,.Linitrd # addr of ramdisk - st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) - bas %r14,.Lloader # load ramdisk - st %r2,INITRD_SIZE+ARCH_OFFSET-PARMAREA(%r12) # store size of rd - ltr %r2,%r2 - bnz .Lrdcont - st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) # no ramdisk found -.Lrdcont: - l %r2,.Linitrd - - clc 0(3,%r2),.L_hdr # skip HDRx and EOFx - bz .Lagain2 - clc 0(3,%r2),.L_eof - bz .Lagain2 - -# -# reset files in VM reader -# - stidp .Lcpuid # store cpuid - tm .Lcpuid,0xff # running VM ? 
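
Both the rewritten reader-reset path above and the old code removed below gate on the same check: STIDP stores the 8-byte CPU ID, and the "tm ...,0xff / jno" pair takes the z/VM branch only when every bit of the first byte (the CPU-ID version code) is set, i.e. when it reads 0xff, which is the case for a VM guest. A hedged C sketch of the check; the struct layout here is a plain stand-in, not the kernel's definition:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative layout of the 8 bytes stored by STIDP; only the
 * version code matters for this test. Under z/VM it reads 0xff. */
struct cpuid {
	unsigned char version;	/* 0xff when running as a VM guest */
	unsigned char ident[3];	/* CPU identification number */
	unsigned short machine;	/* machine-type number */
	unsigned short unused;
};

static bool running_under_vm(const struct cpuid *id)
{
	/* mirrors "tm .Lcpuid,0xff; jno .Lnoreset": take the VM reset
	 * path only when all bits of the version code are set */
	return id->version == 0xff;
}

int main(void)
{
	struct cpuid lpar = { .version = 0x00 };
	struct cpuid vm   = { .version = 0xff };

	printf("LPAR: %d, VM: %d\n", running_under_vm(&lpar), running_under_vm(&vm));
	return 0;
}
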
- bno .Lnoreset - la %r2,.Lreset - lhi %r3,26 - diag %r2,%r3,8 - la %r5,.Lirb - stsch 0(%r5) # check if irq is pending - tm 30(%r5),0x0f # by verifying if any of the - bnz .Lwaitforirq # activity or status control - tm 31(%r5),0xff # bits is set in the schib - bz .Lnoreset -.Lwaitforirq: - bas %r14,.Lirqwait # wait for IO interrupt - c %r1,__LC_SUBCHANNEL_ID # compare subchannel number - bne .Lwaitforirq - la %r5,.Lirb - tsch 0(%r5) -.Lnoreset: - b .Lnoload - -# -# everything loaded, go for it -# -.Lnoload: - l %r1,.Lstartup - br %r1 - -.Linitrd:.long _end # default address of initrd -.Lparm: .long PARMAREA -.Lstartup: .long startup .Lreset:.byte 0xc3,0xc8,0xc1,0xd5,0xc7,0xc5,0x40,0xd9,0xc4,0xd9,0x40 .byte 0xc1,0xd3,0xd3,0x40,0xd2,0xc5,0xc5,0xd7,0x40,0xd5,0xd6 .byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold" .L_eof: .long 0xc5d6c600 /* C'EOF' */ .L_hdr: .long 0xc8c4d900 /* C'HDR' */ - .align 8 + .balign 8 .Lcpuid:.fill 8,1,0 # -# startup-code at 0x10000, running in absolute addressing mode +# normal startup-code, running in absolute addressing mode # this is called either by the ipl loader or directly by PSW restart # or linload or SALIPL # - .org 0x10000 -ENTRY(startup) - j .Lep_startup_normal - .org EP_OFFSET + .org STARTUP_NORMAL_OFFSET - IPL_START +SYM_CODE_START(startup) + j startup_normal + .org EP_OFFSET - IPL_START # # This is a list of s390 kernel entry points. At address 0x1000f the number of # valid entry points is stored. @@ -287,12 +228,12 @@ ENTRY(startup) .ascii EP_STRING .byte 0x00,0x01 # -# kdump startup-code at 0x10010, running in 64 bit absolute addressing mode +# kdump startup-code, running in 64 bit absolute addressing mode # - .org 0x10010 -ENTRY(startup_kdump) - j .Lep_startup_kdump -.Lep_startup_normal: + .org STARTUP_KDUMP_OFFSET - IPL_START + j startup_kdump +SYM_CODE_END(startup) +SYM_CODE_START_LOCAL(startup_normal) mvi __LC_AR_MODE_ID,1 # set esame flag slr %r0,%r0 # set cpuid to zero lhi %r1,2 # mode 2 = esame (dump) @@ -301,55 +242,53 @@ ENTRY(startup_kdump) .fill 16,4,0x0 0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs sam64 # switch to 64 bit addressing mode - basr %r13,0 # get base -.LPG0: + larl %r13,.Lext_new_psw + mvc __LC_EXT_NEW_PSW(16),0(%r13) + larl %r13,.Lpgm_new_psw + mvc __LC_PGM_NEW_PSW(16),0(%r13) + larl %r13,.Lio_new_psw + mvc __LC_IO_NEW_PSW(16),0(%r13) xc 0x200(256),0x200 # partially clear lowcore xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 - lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers + larl %r13,.Lctl + lctlg %c0,%c15,0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 - spt 6f-.LPG0(%r13) - mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) - l %r15,.Lstack-.LPG0(%r13) + larl %r13,6f + spt 0(%r13) + mvc __LC_LAST_UPDATE_TIMER(8),0(%r13) + larl %r15,_stack_end-STACK_FRAME_OVERHEAD + brasl %r14,sclp_early_setup_buffer brasl %r14,verify_facilities brasl %r14,startup_kernel +SYM_CODE_END(startup_normal) -.Lstack: - .long 0x8000 + (1<<(PAGE_SHIFT+BOOT_STACK_ORDER)) - STACK_FRAME_OVERHEAD - .align 8 + .balign 8 6: .long 0x7fffffff,0xffffffff - +.Lext_new_psw: + .quad 0x0002000180000000,0x1b0 # disabled wait +.Lpgm_new_psw: + .quad 0x0000000180000000,startup_pgm_check_handler +.Lio_new_psw: + .quad 0x0002000180000000,0x1f0 # disabled wait .Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space .quad 0 # cr1: primary space segment table - .quad .Lduct # cr2: dispatchable unit control table + .quad 0 # cr2: dispatchable unit control table .quad 0 # 
cr3: instruction authorization .quad 0xffff # cr4: instruction authorization - .quad .Lduct # cr5: primary-aste origin + .quad 0 # cr5: primary-aste origin .quad 0 # cr6: I/O interrupts .quad 0 # cr7: secondary space segment table - .quad 0 # cr8: access registers translation + .quad 0x0000000000008000 # cr8: access registers translation .quad 0 # cr9: tracing off .quad 0 # cr10: tracing off .quad 0 # cr11: tracing off .quad 0 # cr12: tracing off .quad 0 # cr13: home space segment table .quad 0xc0000000 # cr14: machine check handling off - .quad .Llinkage_stack # cr15: linkage stack operations - - .section .dma.data,"aw",@progbits -.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 - .long 0,0,0,0,0,0,0,0 -.Llinkage_stack: - .long 0,0,0x89000000,0,0,0,0x8a000000,0 - .align 64 -.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 - .align 128 -.Lduald:.rept 8 - .long 0x80000000,0,0,0 # invalid access-list entries - .endr - .previous + .quad 0 # cr15: linkage stack operations #include "head_kdump.S" @@ -359,45 +298,23 @@ ENTRY(startup_kdump) # It simply saves general/control registers and psw in # the save area and does disabled wait with a faulty address. # -ENTRY(startup_pgm_check_handler) - stmg %r0,%r15,__LC_SAVE_AREA_SYNC - la %r1,4095 - stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r1) - mvc __LC_GPREGS_SAVE_AREA-4095(128,%r1),__LC_SAVE_AREA_SYNC - mvc __LC_PSW_SAVE_AREA-4095(16,%r1),__LC_PGM_OLD_PSW +SYM_CODE_START_LOCAL(startup_pgm_check_handler) + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + la %r8,4095 + stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r8) + stmg %r0,%r7,__LC_GPREGS_SAVE_AREA-4095(%r8) + mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA_SYNC + mvc __LC_PSW_SAVE_AREA-4095(16,%r8),__LC_PGM_OLD_PSW mvc __LC_RETURN_PSW(16),__LC_PGM_OLD_PSW ni __LC_RETURN_PSW,0xfc # remove IO and EX bits ni __LC_RETURN_PSW+1,0xfb # remove MCHK bit oi __LC_RETURN_PSW+1,0x2 # set wait state bit - larl %r2,.Lold_psw_disabled_wait - stg %r2,__LC_PGM_NEW_PSW+8 - l %r15,.Ldump_info_stack-.Lold_psw_disabled_wait(%r2) + larl %r9,.Lold_psw_disabled_wait + stg %r9,__LC_PGM_NEW_PSW+8 + larl %r15,_dump_info_stack_end-STACK_FRAME_OVERHEAD brasl %r14,print_pgm_check_info .Lold_psw_disabled_wait: - la %r1,4095 - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) + la %r8,4095 + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8) lpswe __LC_RETURN_PSW # disabled wait -.Ldump_info_stack: - .long 0x5000 + PAGE_SIZE - STACK_FRAME_OVERHEAD -ENDPROC(startup_pgm_check_handler) - -# -# params at 10400 (setup.h) -# Must be keept in sync with struct parmarea in setup.h -# - .org PARMAREA - .quad 0 # IPL_DEVICE - .quad 0 # INITRD_START - .quad 0 # INITRD_SIZE - .quad 0 # OLDMEM_BASE - .quad 0 # OLDMEM_SIZE - .quad kernel_version # points to kernel version string - - .org COMMAND_LINE - .byte "root=/dev/ram0 ro" - .byte 0 - - .org EARLY_SCCB_OFFSET - .fill 4096 - - .org HEAD_END +SYM_CODE_END(startup_pgm_check_handler) diff --git a/arch/s390/boot/head_kdump.S b/arch/s390/boot/head_kdump.S index 174d6959bf5b..f7107c76258c 100644 --- a/arch/s390/boot/head_kdump.S +++ b/arch/s390/boot/head_kdump.S @@ -19,8 +19,7 @@ # Note: This code has to be position independent # -.align 2 -.Lep_startup_kdump: +SYM_CODE_START_LOCAL(startup_kdump) lhi %r1,2 # mode 2 = esame (dump) sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to esame mode sam64 # Switch to 64 bit addressing @@ -83,19 +82,20 @@ # # Startup of kdump (relocated new kernel) # -.align 2 + .balign 2 startup_kdump_relocated: basr %r13,0 0: lpswe .Lrestart_psw-0b(%r13) # Start new kernel... 
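
startup_pgm_check_handler above turns the old program PSW into a disabled-wait PSW by clearing the I/O and external mask bits in the first PSW byte, clearing the machine-check bit in the second byte, and setting the wait bit, exactly as the ni/oi comments in the hunk describe. A z/Architecture PSW is 16 bytes, an 8-byte mask followed by an 8-byte address; the sketch below redoes the same bit surgery on a 64-bit mask word (illustrative only, bit positions derived from the byte offsets in the hunk):

#include <stdint.h>
#include <stdio.h>

struct psw {
	uint64_t mask;
	uint64_t addr;
};

static struct psw make_disabled_wait(struct psw old)
{
	old.mask &= ~0x0300000000000000ULL;	/* ni ...,0xfc: clear IO and EXT bits */
	old.mask &= ~0x0004000000000000ULL;	/* ni ...+1,0xfb: clear MCHK bit */
	old.mask |=  0x0002000000000000ULL;	/* oi ...+1,0x02: set the wait-state bit */
	return old;
}

int main(void)
{
	struct psw pgm_old = { .mask = 0x0704000180000000ULL, .addr = 0x10332 };
	struct psw wait = make_disabled_wait(pgm_old);

	printf("disabled wait PSW: %016llx %016llx\n",
	       (unsigned long long)wait.mask, (unsigned long long)wait.addr);
	return 0;
}
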
-.align 8 +SYM_CODE_END(startup_kdump) + .balign 8 .Lrestart_psw: .quad 0x0000000080000000,0x0000000000000000 + startup #else -.align 2 -.Lep_startup_kdump: +SYM_CODE_START_LOCAL(startup_kdump) larl %r13,startup_kdump_crash lpswe 0(%r13) -.align 8 +SYM_CODE_END(startup_kdump) + .balign 8 startup_kdump_crash: .quad 0x0002000080000000,0x0000000000000000 + startup_kdump_crash #endif /* CONFIG_CRASH_DUMP */ diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh index bed227f267ae..a13dd2f2aa1c 100644..100755 --- a/arch/s390/boot/install.sh +++ b/arch/s390/boot/install.sh @@ -14,22 +14,11 @@ # $2 - kernel image file # $3 - kernel map file # $4 - default install path (blank if root directory) -# - -# User may have a custom install script - -if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi -if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi - -# Default install - same as make zlilo - -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old -fi -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old -fi +echo "Warning: '${INSTALLKERNEL}' command not available - additional " \ + "bootloader config required" >&2 +if [ -f "$4/vmlinuz-$1" ]; then mv -- "$4/vmlinuz-$1" "$4/vmlinuz-$1.old"; fi +if [ -f "$4/System.map-$1" ]; then mv -- "$4/System.map-$1" "$4/System.map-$1.old"; fi -cat $2 > $4/vmlinuz -cp $3 $4/System.map +cat -- "$2" > "$4/vmlinuz-$1" +cp -- "$3" "$4/System.map-$1" diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c new file mode 100644 index 000000000000..0846e2b249c6 --- /dev/null +++ b/arch/s390/boot/ipl_data.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/compat.h> +#include <linux/ptrace.h> +#include <asm/cio.h> +#include <asm/asm-offsets.h> +#include "boot.h" + +#define CCW0(cmd, addr, cnt, flg) \ + { .cmd_code = cmd, .cda = addr, .count = cnt, .flags = flg, } + +#define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA) + +struct ipl_lowcore { + psw_t32 ipl_psw; /* 0x0000 */ + struct ccw0 ccwpgm[2]; /* 0x0008 */ + u8 fill[56]; /* 0x0018 */ + struct ccw0 ccwpgmcc[20]; /* 0x0050 */ + u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */ + psw_t restart_psw; /* 0x01a0 */ + psw_t external_new_psw; /* 0x01b0 */ + psw_t svc_new_psw; /* 0x01c0 */ + psw_t program_new_psw; /* 0x01d0 */ + psw_t mcck_new_psw; /* 0x01e0 */ + psw_t io_new_psw; /* 0x01f0 */ +}; + +/* + * Initial lowcore for IPL: the first 24 bytes are loaded by IPL to + * addresses 0-23 (a PSW and two CCWs). Bytes 24-79 are discarded. + * The next 160 bytes are loaded to addresses 0x18-0xb7. They form + * the continuation of the CCW program started by IPL and load the + * range 0x0f0-0x730 from the image to the range 0x0f0-0x730 in + * memory. At the end of the channel program the PSW at location 0 is + * loaded. + * Initial processing starts at 0x200 = iplstart. + * + * The restart psw points to iplstart which allows to load a kernel + * image into memory and starting it by a psw restart on any cpu. All + * other default psw new locations contain a disabled wait psw where + * the address indicates which psw was loaded. + * + * Note that the 'file' utility can detect s390 kernel images. For + * that to succeed the two initial CCWs, and the 0x40 fill bytes must + * be present. 
+ */ +static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = { + .ipl_psw = { .mask = PSW32_MASK_BASE, .addr = PSW32_ADDR_AMODE | IPL_START }, + .ccwpgm = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x018, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x068, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + }, + .fill = { + [ 0 ... 55] = 0x40, + }, + .ccwpgmcc = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x0f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x140, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 2] = CCW0(CCW_CMD_READ_IPL, 0x190, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 3] = CCW0(CCW_CMD_READ_IPL, 0x1e0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 4] = CCW0(CCW_CMD_READ_IPL, 0x230, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 5] = CCW0(CCW_CMD_READ_IPL, 0x280, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 6] = CCW0(CCW_CMD_READ_IPL, 0x2d0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 7] = CCW0(CCW_CMD_READ_IPL, 0x320, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 8] = CCW0(CCW_CMD_READ_IPL, 0x370, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 9] = CCW0(CCW_CMD_READ_IPL, 0x3c0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [10] = CCW0(CCW_CMD_READ_IPL, 0x410, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [11] = CCW0(CCW_CMD_READ_IPL, 0x460, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [12] = CCW0(CCW_CMD_READ_IPL, 0x4b0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [13] = CCW0(CCW_CMD_READ_IPL, 0x500, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [14] = CCW0(CCW_CMD_READ_IPL, 0x550, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [15] = CCW0(CCW_CMD_READ_IPL, 0x5a0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [16] = CCW0(CCW_CMD_READ_IPL, 0x5f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [17] = CCW0(CCW_CMD_READ_IPL, 0x640, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI), + }, + .restart_psw = { .mask = 0, .addr = IPL_START, }, + .external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, }, + .svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, }, + .program_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_PGM_NEW_PSW, }, + .mcck_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_MCK_NEW_PSW, }, + .io_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_IO_NEW_PSW, }, +}; diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 24ef67eb1cef..b24de9aabf7d 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -2,48 +2,62 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/ctype.h> +#include <linux/pgtable.h> +#include <asm/page-states.h> #include <asm/ebcdic.h> #include <asm/sclp.h> #include <asm/sections.h> #include <asm/boot_data.h> #include <asm/facility.h> -#include <asm/pgtable.h> +#include <asm/setup.h> #include <asm/uv.h> #include "boot.h" +struct parmarea parmarea __section(".parmarea") = { + .kernel_version = (unsigned long)kernel_version, + .max_command_line_size = COMMAND_LINE_SIZE, + .command_line = "root=/dev/ram0 ro", +}; + char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; + +unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL; struct ipl_parameter_block __bootdata_preserved(ipl_block); int __bootdata_preserved(ipl_block_valid); +int __bootdata_preserved(__kaslr_enabled); +int __bootdata_preserved(cmma_flag) = 1; -unsigned long __bootdata(vmalloc_size) = VMALLOC_DEFAULT_SIZE; -unsigned long __bootdata(memory_end); -int __bootdata(memory_end_set); -int __bootdata(noexec_disabled); - -int kaslr_enabled __section(.data); 
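
A note on the ipl_lowcore initializer above: the ccwpgmcc[] table is fully regular. It is twenty 0x50-byte READ IPL CCWs whose data addresses step from 0x0f0 to 0x6e0 in 0x50 increments, all command-chained (CCW_FLAG_CC) except the last entry, which ends the channel program. The sketch below generates the same chain; the struct is a simplified stand-in for struct ccw0 from <asm/cio.h>, with command and flag values copied from the hunk:

#include <stdint.h>
#include <stdio.h>

#define CCW_CMD_READ_IPL	0x02
#define CCW_FLAG_CC		0x40
#define CCW_FLAG_SLI		0x20

struct ccw {		/* simplified stand-in for struct ccw0 */
	uint8_t	 cmd_code;
	uint8_t	 flags;
	uint16_t count;
	uint32_t cda;	/* data address */
};

int main(void)
{
	struct ccw chain[20];
	int i;

	/* same pattern as ccwpgmcc[]: 20 reads of 0x50 bytes each,
	 * starting at 0x0f0 and 0x50 apart, chained except the last */
	for (i = 0; i < 20; i++) {
		chain[i].cmd_code = CCW_CMD_READ_IPL;
		chain[i].cda = 0x0f0 + i * 0x50;
		chain[i].count = 0x50;
		chain[i].flags = CCW_FLAG_SLI | (i < 19 ? CCW_FLAG_CC : 0);
	}
	for (i = 0; i < 20; i++)
		printf("[%2d] read 0x%02x bytes to 0x%03x flags 0x%02x\n",
		       i, (unsigned)chain[i].count, (unsigned)chain[i].cda,
		       (unsigned)chain[i].flags);
	return 0;
}
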
+unsigned long vmalloc_size = VMALLOC_DEFAULT_SIZE; +unsigned long memory_limit; +int vmalloc_size_set; static inline int __diag308(unsigned long subcode, void *addr) { - register unsigned long _addr asm("0") = (unsigned long)addr; - register unsigned long _rc asm("1") = 0; unsigned long reg1, reg2; - psw_t old = S390_lowcore.program_new_psw; + union register_pair r1; + psw_t old; + r1.even = (unsigned long) addr; + r1.odd = 0; asm volatile( - " epsw %0,%1\n" - " st %0,%[psw_pgm]\n" - " st %1,%[psw_pgm]+4\n" - " larl %0,1f\n" - " stg %0,%[psw_pgm]+8\n" - " diag %[addr],%[subcode],0x308\n" - "1: nopr %%r7\n" - : "=&d" (reg1), "=&a" (reg2), - [psw_pgm] "=Q" (S390_lowcore.program_new_psw), - [addr] "+d" (_addr), "+d" (_rc) - : [subcode] "d" (subcode) + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " diag %[r1],%[subcode],0x308\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [r1] "+&d" (r1.pair), + [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [subcode] "d" (subcode), + [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw) : "cc", "memory"); - S390_lowcore.program_new_psw = old; - return _rc; + return r1.odd; } void store_ipl_parmblock(void) @@ -56,6 +70,20 @@ void store_ipl_parmblock(void) ipl_block_valid = 1; } +bool is_ipl_block_dump(void) +{ + if (ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) + return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_NVME && + ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_ECKD && + ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return true; + return false; +} + static size_t scpdata_length(const u8 *buf, size_t count) { while (count) { @@ -69,30 +97,49 @@ static size_t scpdata_length(const u8 *buf, size_t count) static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, const struct ipl_parameter_block *ipb) { - size_t count; - size_t i; + const __u8 *scp_data; + __u32 scp_data_len; int has_lowercase; + size_t count = 0; + size_t i; + + switch (ipb->pb0_hdr.pbt) { + case IPL_PBT_FCP: + scp_data_len = ipb->fcp.scp_data_len; + scp_data = ipb->fcp.scp_data; + break; + case IPL_PBT_NVME: + scp_data_len = ipb->nvme.scp_data_len; + scp_data = ipb->nvme.scp_data; + break; + case IPL_PBT_ECKD: + scp_data_len = ipb->eckd.scp_data_len; + scp_data = ipb->eckd.scp_data; + break; + + default: + goto out; + } - count = min(size - 1, scpdata_length(ipb->fcp.scp_data, - ipb->fcp.scp_data_len)); + count = min(size - 1, scpdata_length(scp_data, scp_data_len)); if (!count) goto out; has_lowercase = 0; for (i = 0; i < count; i++) { - if (!isascii(ipb->fcp.scp_data[i])) { + if (!isascii(scp_data[i])) { count = 0; goto out; } - if (!has_lowercase && islower(ipb->fcp.scp_data[i])) + if (!has_lowercase && islower(scp_data[i])) has_lowercase = 1; } if (has_lowercase) - memcpy(dest, ipb->fcp.scp_data, count); + memcpy(dest, scp_data, count); else for (i = 0; i < count; i++) - dest[i] = tolower(ipb->fcp.scp_data[i]); + dest[i] = tolower(scp_data[i]); out: dest[count] = '\0'; return count; @@ -114,6 +161,8 @@ static void append_ipl_block_parm(void) parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; case IPL_PBT_FCP: + case IPL_PBT_NVME: + case IPL_PBT_ECKD: rc = ipl_block_get_ascii_scpdata( parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; @@ -138,12 +187,12 @@ static 
inline int has_ebcdic_char(const char *str) void setup_boot_command_line(void) { - COMMAND_LINE[ARCH_COMMAND_LINE_SIZE - 1] = 0; + parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0; /* convert arch command line to ascii if necessary */ - if (has_ebcdic_char(COMMAND_LINE)) - EBCASC(COMMAND_LINE, ARCH_COMMAND_LINE_SIZE); + if (has_ebcdic_char(parmarea.command_line)) + EBCASC(parmarea.command_line, COMMAND_LINE_SIZE); /* copy arch command line */ - strcpy(early_command_line, strim(COMMAND_LINE)); + strcpy(early_command_line, strim(parmarea.command_line)); /* append IPL PARM data to the boot command line */ if (!is_prot_virt_guest() && ipl_block_valid) @@ -153,9 +202,9 @@ void setup_boot_command_line(void) static void modify_facility(unsigned long nr, bool clear) { if (clear) - __clear_facility(nr, S390_lowcore.stfle_fac_list); + __clear_facility(nr, stfle_fac_list); else - __set_facility(nr, S390_lowcore.stfle_fac_list); + __set_facility(nr, stfle_fac_list); } static void check_cleared_facilities(void) @@ -164,7 +213,7 @@ static void check_cleared_facilities(void) int i; for (i = 0; i < ARRAY_SIZE(als); i++) { - if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i]) { + if ((stfle_fac_list[i] & als[i]) != als[i]) { sclp_early_printk("Warning: The Linux kernel requires facilities cleared via command line option\n"); print_missing_facilities(); break; @@ -208,7 +257,7 @@ static void modify_fac_list(char *str) check_cleared_facilities(); } -static char command_line_buf[COMMAND_LINE_SIZE] __section(.data); +static char command_line_buf[COMMAND_LINE_SIZE]; void parse_boot_command_line(void) { char *param, *val; @@ -216,44 +265,50 @@ void parse_boot_command_line(void) char *args; int rc; - kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); + __kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); args = strcpy(command_line_buf, early_command_line); while (*args) { args = next_arg(args, ¶m, &val); - if (!strcmp(param, "mem") && val) { - memory_end = round_down(memparse(val, NULL), PAGE_SIZE); - memory_end_set = 1; - } + if (!strcmp(param, "mem") && val) + memory_limit = round_down(memparse(val, NULL), PAGE_SIZE); - if (!strcmp(param, "vmalloc") && val) - vmalloc_size = round_up(memparse(val, NULL), PAGE_SIZE); + if (!strcmp(param, "vmalloc") && val) { + vmalloc_size = round_up(memparse(val, NULL), _SEGMENT_SIZE); + vmalloc_size_set = 1; + } - if (!strcmp(param, "noexec")) { - rc = kstrtobool(val, &enabled); - if (!rc && !enabled) - noexec_disabled = 1; + if (!strcmp(param, "dfltcc") && val) { + if (!strcmp(val, "off")) + zlib_dfltcc_support = ZLIB_DFLTCC_DISABLED; + else if (!strcmp(val, "on")) + zlib_dfltcc_support = ZLIB_DFLTCC_FULL; + else if (!strcmp(val, "def_only")) + zlib_dfltcc_support = ZLIB_DFLTCC_DEFLATE_ONLY; + else if (!strcmp(val, "inf_only")) + zlib_dfltcc_support = ZLIB_DFLTCC_INFLATE_ONLY; + else if (!strcmp(val, "always")) + zlib_dfltcc_support = ZLIB_DFLTCC_FULL_DEBUG; } if (!strcmp(param, "facilities") && val) modify_fac_list(val); if (!strcmp(param, "nokaslr")) - kaslr_enabled = 0; - } -} + __kaslr_enabled = 0; -void setup_memory_end(void) -{ -#ifdef CONFIG_CRASH_DUMP - if (OLDMEM_BASE) { - kaslr_enabled = 0; - } else if (ipl_block_valid && - ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && - ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { - kaslr_enabled = 0; - if (!sclp_early_get_hsa_size(&memory_end) && memory_end) - memory_end_set = 1; - } + if (!strcmp(param, "cmma")) { + rc = kstrtobool(val, &enabled); + if (!rc && !enabled) + cmma_flag = 0; + } + +#if IS_ENABLED(CONFIG_KVM) + if 
(!strcmp(param, "prot_virt")) { + rc = kstrtobool(val, &enabled); + if (!rc && enabled) + prot_virt_host = 1; + } #endif + } } diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c index 0b4965573656..1803035e68d2 100644 --- a/arch/s390/boot/ipl_report.c +++ b/arch/s390/boot/ipl_report.c @@ -5,6 +5,7 @@ #include <asm/sclp.h> #include <asm/sections.h> #include <asm/boot_data.h> +#include <asm/physmem_info.h> #include <uapi/asm/ipl.h> #include "boot.h" @@ -16,20 +17,16 @@ unsigned long __bootdata_preserved(ipl_cert_list_size); unsigned long __bootdata(early_ipl_comp_list_addr); unsigned long __bootdata(early_ipl_comp_list_size); +static struct ipl_rb_certificates *certs; +static struct ipl_rb_components *comps; +static bool ipl_report_needs_saving; + #define for_each_rb_entry(entry, rb) \ for (entry = rb->entries; \ (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \ entry++) -static inline bool intersects(unsigned long addr0, unsigned long size0, - unsigned long addr1, unsigned long size1) -{ - return addr0 + size0 > addr1 && addr1 + size1 > addr0; -} - -static unsigned long find_bootdata_space(struct ipl_rb_components *comps, - struct ipl_rb_certificates *certs, - unsigned long safe_addr) +static unsigned long get_cert_comp_list_size(void) { struct ipl_rb_certificate_entry *cert; struct ipl_rb_component_entry *comp; @@ -44,36 +41,27 @@ static unsigned long find_bootdata_space(struct ipl_rb_components *comps, ipl_cert_list_size = 0; for_each_rb_entry(cert, certs) ipl_cert_list_size += sizeof(unsigned int) + cert->len; - size = ipl_cert_list_size + early_ipl_comp_list_size; + return ipl_cert_list_size + early_ipl_comp_list_size; +} - /* - * Start from safe_addr to find a free memory area large - * enough for the IPL report boot data. This area is used - * for ipl_cert_list_addr/ipl_cert_list_size and - * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must - * not overlap with any component or any certificate. 
- */ -repeat: - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && - intersects(INITRD_START, INITRD_SIZE, safe_addr, size)) - safe_addr = INITRD_START + INITRD_SIZE; - for_each_rb_entry(comp, comps) - if (intersects(safe_addr, size, comp->addr, comp->len)) { - safe_addr = comp->addr + comp->len; - goto repeat; - } - for_each_rb_entry(cert, certs) - if (intersects(safe_addr, size, cert->addr, cert->len)) { - safe_addr = cert->addr + cert->len; - goto repeat; - } - early_ipl_comp_list_addr = safe_addr; - ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size; +bool ipl_report_certs_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + struct ipl_rb_certificate_entry *cert; - return safe_addr + size; + if (!ipl_report_needs_saving) + return false; + + for_each_rb_entry(cert, certs) { + if (intersects(addr, size, cert->addr, cert->len)) { + *intersection_start = cert->addr; + return true; + } + } + return false; } -static void copy_components_bootdata(struct ipl_rb_components *comps) +static void copy_components_bootdata(void) { struct ipl_rb_component_entry *comp, *ptr; @@ -82,7 +70,7 @@ static void copy_components_bootdata(struct ipl_rb_components *comps) memcpy(ptr++, comp, sizeof(*ptr)); } -static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) +static void copy_certificates_bootdata(void) { struct ipl_rb_certificate_entry *cert; void *ptr; @@ -96,10 +84,8 @@ static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) } } -unsigned long read_ipl_report(unsigned long safe_addr) +int read_ipl_report(void) { - struct ipl_rb_certificates *certs; - struct ipl_rb_components *comps; struct ipl_pl_hdr *pl_hdr; struct ipl_rl_hdr *rl_hdr; struct ipl_rb_hdr *rb_hdr; @@ -112,7 +98,7 @@ unsigned long read_ipl_report(unsigned long safe_addr) */ if (!ipl_block_valid || !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)) - return safe_addr; + return -1; ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL); /* * There is an IPL report, to find it load the pointer to the @@ -150,16 +136,30 @@ unsigned long read_ipl_report(unsigned long safe_addr) * With either the component list or the certificate list * missing the kernel will stay ignorant of secure IPL. */ - if (!comps || !certs) - return safe_addr; + if (!comps || !certs) { + certs = NULL; + return -1; + } - /* - * Copy component and certificate list to a safe area - * where the decompressed kernel can find them. - */ - safe_addr = find_bootdata_space(comps, certs, safe_addr); - copy_components_bootdata(comps); - copy_certificates_bootdata(certs); + ipl_report_needs_saving = true; + physmem_reserve(RR_IPLREPORT, (unsigned long)pl_hdr, + (unsigned long)rl_end - (unsigned long)pl_hdr); + return 0; +} + +void save_ipl_cert_comp_list(void) +{ + unsigned long size; + + if (!ipl_report_needs_saving) + return; + + size = get_cert_comp_list_size(); + early_ipl_comp_list_addr = physmem_alloc_top_down(RR_CERT_COMP_LIST, size, sizeof(int)); + ipl_cert_list_addr = early_ipl_comp_list_addr + early_ipl_comp_list_size; - return safe_addr; + copy_components_bootdata(); + copy_certificates_bootdata(); + physmem_free(RR_IPLREPORT); + ipl_report_needs_saving = false; } diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index 5d12352545c5..90602101e2ae 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -2,12 +2,13 @@ /* * Copyright IBM Corp. 
2019 */ -#include <asm/mem_detect.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <asm/physmem_info.h> #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/sclp.h> -#include "compressed/decompressor.h" +#include <asm/kasan.h> +#include "decompressor.h" #include "boot.h" #define PRNG_MODE_TDES 1 @@ -42,7 +43,7 @@ static int check_prng(void) return PRNG_MODE_TDES; } -static unsigned long get_random(unsigned long limit) +static int get_random(unsigned long limit, unsigned long *value) { struct prng_parm prng = { /* initial parameter block for tdes mode, copied from libica */ @@ -75,7 +76,7 @@ static unsigned long get_random(unsigned long limit) *(unsigned long *) prng.parm_block ^= seed; for (i = 0; i < 16; i++) { cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, - (char *) entropy, (char *) entropy, + (u8 *) entropy, (u8 *) entropy, sizeof(entropy)); memcpy(prng.parm_block, entropy, sizeof(entropy)); } @@ -84,87 +85,114 @@ static unsigned long get_random(unsigned long limit) (u8 *) &random, sizeof(random)); break; default: - random = 0; + return -1; } - return random % limit; + *value = random % limit; + return 0; } -unsigned long get_random_base(unsigned long safe_addr) +static void sort_reserved_ranges(struct reserved_range *res, unsigned long size) { - unsigned long memory_limit = memory_end_set ? memory_end : 0; - unsigned long base, start, end, kernel_size; - unsigned long block_sum, offset; - unsigned long kasan_needs; - int i; + struct reserved_range tmp; + int i, j; - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) { - if (safe_addr < INITRD_START + INITRD_SIZE) - safe_addr = INITRD_START + INITRD_SIZE; + for (i = 1; i < size; i++) { + tmp = res[i]; + for (j = i - 1; j >= 0 && res[j].start > tmp.start; j--) + res[j + 1] = res[j]; + res[j + 1] = tmp; } - safe_addr = ALIGN(safe_addr, THREAD_SIZE); +} - if ((IS_ENABLED(CONFIG_KASAN))) { - /* - * Estimate kasan memory requirements, which it will reserve - * at the very end of available physical memory. To estimate - * that, we take into account that kasan would require - * 1/8 of available physical memory (for shadow memory) + - * creating page tables for the whole memory + shadow memory - * region (1 + 1/8). To keep page tables estimates simple take - * the double of combined ptes size. 
- */ - memory_limit = get_mem_detect_end(); - if (memory_end_set && memory_limit > memory_end) - memory_limit = memory_end; +static unsigned long iterate_valid_positions(unsigned long size, unsigned long align, + unsigned long _min, unsigned long _max, + struct reserved_range *res, size_t res_count, + bool pos_count, unsigned long find_pos) +{ + unsigned long start, end, tmp_end, range_pos, pos = 0; + struct reserved_range *res_end = res + res_count; + struct reserved_range *skip_res; + int i; - /* for shadow memory */ - kasan_needs = memory_limit / 8; - /* for paging structures */ - kasan_needs += (memory_limit + kasan_needs) / PAGE_SIZE / - _PAGE_ENTRIES * _PAGE_TABLE_SIZE * 2; - memory_limit -= kasan_needs; - } + align = max(align, 8UL); + _min = round_up(_min, align); + for_each_physmem_usable_range(i, &start, &end) { + if (_min >= end) + continue; + start = round_up(start, align); + if (start >= _max) + break; + start = max(_min, start); + end = min(_max, end); - kernel_size = vmlinux.image_size + vmlinux.bss_size; - block_sum = 0; - for_each_mem_detect_block(i, &start, &end) { - if (memory_limit) { - if (start >= memory_limit) + while (start + size <= end) { + /* skip reserved ranges below the start */ + while (res && res->end <= start) { + res++; + if (res >= res_end) + res = NULL; + } + skip_res = NULL; + tmp_end = end; + /* has intersecting reserved range */ + if (res && res->start < end) { + skip_res = res; + tmp_end = res->start; + } + if (start + size <= tmp_end) { + range_pos = (tmp_end - start - size) / align + 1; + if (pos_count) { + pos += range_pos; + } else { + if (range_pos >= find_pos) + return start + (find_pos - 1) * align; + find_pos -= range_pos; + } + } + if (!skip_res) break; - if (end > memory_limit) - end = memory_limit; + start = round_up(skip_res->end, align); } - if (end - start < kernel_size) - continue; - block_sum += end - start - kernel_size; - } - if (!block_sum) { - sclp_early_printk("KASLR disabled: not enough memory\n"); - return 0; } - base = get_random(block_sum); - if (base == 0) + return pos_count ? pos : 0; +} + +/* + * Two types of decompressor memory allocations/reserves are considered + * differently. + * + * "Static" or "single" allocations are done via physmem_alloc_range() and + * physmem_reserve(), and they are listed in physmem_info.reserved[]. Each + * type of "static" allocation can only have one allocation per type and + * cannot have chains. + * + * On the other hand, "dynamic" or "repetitive" allocations are done via + * physmem_alloc_top_down(). These allocations are tightly packed together + * top down from the end of online memory. physmem_alloc_pos represents + * current position where those allocations start. + * + * Functions randomize_within_range() and iterate_valid_positions() + * only consider "dynamic" allocations by never looking above + * physmem_alloc_pos. "Static" allocations, however, are explicitly + * considered by checking the "res" (reserves) array. The first + * reserved_range of a "dynamic" allocation may also be checked along the + * way, but it will always be above the maximum value anyway. 
+ */ +unsigned long randomize_within_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max) +{ + struct reserved_range res[RR_MAX]; + unsigned long max_pos, pos; + + memcpy(res, physmem_info.reserved, sizeof(res)); + sort_reserved_ranges(res, ARRAY_SIZE(res)); + max = min(max, get_physmem_alloc_pos()); + + max_pos = iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), true, 0); + if (!max_pos) return 0; - if (base < safe_addr) - base = safe_addr; - block_sum = offset = 0; - for_each_mem_detect_block(i, &start, &end) { - if (memory_limit) { - if (start >= memory_limit) - break; - if (end > memory_limit) - end = memory_limit; - } - if (end - start < kernel_size) - continue; - block_sum += end - start - kernel_size; - if (base <= block_sum) { - base = start + base - offset; - base = ALIGN_DOWN(base, THREAD_SIZE); - break; - } - offset = block_sum; - } - return base; + if (get_random(max_pos, &pos)) + return 0; + return iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), false, pos + 1); } diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c deleted file mode 100644 index 62e7c13ce85c..000000000000 --- a/arch/s390/boot/mem_detect.c +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/errno.h> -#include <linux/init.h> -#include <asm/sclp.h> -#include <asm/sections.h> -#include <asm/mem_detect.h> -#include <asm/sparsemem.h> -#include "compressed/decompressor.h" -#include "boot.h" - -unsigned long __bootdata(max_physmem_end); -struct mem_detect_info __bootdata(mem_detect); - -/* up to 256 storage elements, 1020 subincrements each */ -#define ENTRIES_EXTENDED_MAX \ - (256 * (1020 / 2) * sizeof(struct mem_detect_block)) - -/* - * To avoid corrupting old kernel memory during dump, find lowest memory - * chunk possible either right after the kernel end (decompressed kernel) or - * after initrd (if it is present and there is no hole between the kernel end - * and initrd) - */ -static void *mem_detect_alloc_extended(void) -{ - unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64)); - - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && - INITRD_START < offset + ENTRIES_EXTENDED_MAX) - offset = ALIGN(INITRD_START + INITRD_SIZE, sizeof(u64)); - - return (void *)offset; -} - -static struct mem_detect_block *__get_mem_detect_block_ptr(u32 n) -{ - if (n < MEM_INLINED_ENTRIES) - return &mem_detect.entries[n]; - if (unlikely(!mem_detect.entries_extended)) - mem_detect.entries_extended = mem_detect_alloc_extended(); - return &mem_detect.entries_extended[n - MEM_INLINED_ENTRIES]; -} - -/* - * sequential calls to add_mem_detect_block with adjacent memory areas - * are merged together into single memory block. 
- */ -void add_mem_detect_block(u64 start, u64 end) -{ - struct mem_detect_block *block; - - if (mem_detect.count) { - block = __get_mem_detect_block_ptr(mem_detect.count - 1); - if (block->end == start) { - block->end = end; - return; - } - } - - block = __get_mem_detect_block_ptr(mem_detect.count); - block->start = start; - block->end = end; - mem_detect.count++; -} - -static int __diag260(unsigned long rx1, unsigned long rx2) -{ - register unsigned long _rx1 asm("2") = rx1; - register unsigned long _rx2 asm("3") = rx2; - register unsigned long _ry asm("4") = 0x10; /* storage configuration */ - int rc = -1; /* fail */ - unsigned long reg1, reg2; - psw_t old = S390_lowcore.program_new_psw; - - asm volatile( - " epsw %0,%1\n" - " st %0,%[psw_pgm]\n" - " st %1,%[psw_pgm]+4\n" - " larl %0,1f\n" - " stg %0,%[psw_pgm]+8\n" - " diag %[rx],%[ry],0x260\n" - " ipm %[rc]\n" - " srl %[rc],28\n" - "1:\n" - : "=&d" (reg1), "=&a" (reg2), - [psw_pgm] "=Q" (S390_lowcore.program_new_psw), - [rc] "+&d" (rc), [ry] "+d" (_ry) - : [rx] "d" (_rx1), "d" (_rx2) - : "cc", "memory"); - S390_lowcore.program_new_psw = old; - return rc == 0 ? _ry : -1; -} - -static int diag260(void) -{ - int rc, i; - - struct { - unsigned long start; - unsigned long end; - } storage_extents[8] __aligned(16); /* VM supports up to 8 extends */ - - memset(storage_extents, 0, sizeof(storage_extents)); - rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents)); - if (rc == -1) - return -1; - - for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++) - add_mem_detect_block(storage_extents[i].start, storage_extents[i].end + 1); - return 0; -} - -static int tprot(unsigned long addr) -{ - unsigned long pgm_addr; - int rc = -EFAULT; - psw_t old = S390_lowcore.program_new_psw; - - S390_lowcore.program_new_psw.mask = __extract_psw(); - asm volatile( - " larl %[pgm_addr],1f\n" - " stg %[pgm_addr],%[psw_pgm_addr]\n" - " tprot 0(%[addr]),0\n" - " ipm %[rc]\n" - " srl %[rc],28\n" - "1:\n" - : [pgm_addr] "=&d"(pgm_addr), - [psw_pgm_addr] "=Q"(S390_lowcore.program_new_psw.addr), - [rc] "+&d"(rc) - : [addr] "a"(addr) - : "cc", "memory"); - S390_lowcore.program_new_psw = old; - return rc; -} - -static void search_mem_end(void) -{ - unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ - unsigned long offset = 0; - unsigned long pivot; - - while (range > 1) { - range >>= 1; - pivot = offset + range; - if (!tprot(pivot << 20)) - offset = pivot; - } - - add_mem_detect_block(0, (offset + 1) << 20); -} - -void detect_memory(void) -{ - sclp_early_get_memsize(&max_physmem_end); - - if (!sclp_early_read_storage_info()) { - mem_detect.info_source = MEM_DETECT_SCLP_STOR_INFO; - return; - } - - if (!diag260()) { - mem_detect.info_source = MEM_DETECT_DIAG260; - return; - } - - if (max_physmem_end) { - add_mem_detect_block(0, max_physmem_end); - mem_detect.info_source = MEM_DETECT_SCLP_READ_INFO; - return; - } - - search_mem_end(); - mem_detect.info_source = MEM_DETECT_BIN_SEARCH; - max_physmem_end = get_mem_detect_end(); -} diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 83b5b7915c32..97244cd7a206 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -1,90 +1,179 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> +#include <linux/stdarg.h> #include <linux/string.h> +#include <linux/ctype.h> +#include <asm/stacktrace.h> +#include <asm/boot_data.h> #include <asm/lowcore.h> +#include <asm/setup.h> #include <asm/sclp.h> +#include <asm/uv.h> 
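
The memory-size probe removed from mem_detect.c here (and re-added in physmem_info.c further down) is a plain binary search over 1 MB blocks: tprot faults above the installed memory, so the highest accessible block is found in MAX_PHYSMEM_BITS - 20 probes. A hedged userspace sketch of the same search; accessible() stands in for the tprot probe and the MAX_PHYSMEM_BITS value is only illustrative (assumes 64-bit unsigned long, as on s390):

#include <stdbool.h>
#include <stdio.h>

#define MAX_PHYSMEM_BITS 46	/* illustrative stand-in for the s390 limit */

static unsigned long mem_mb = 3500;	/* pretend 3500 MB are installed */

static bool accessible(unsigned long addr)
{
	return (addr >> 20) < mem_mb;	/* stand-in for the tprot probe */
}

/* Same binary search as search_mem_end(): keep the highest
 * accessible 1 MB block and return the address just past it. */
static unsigned long search_mem_end(void)
{
	unsigned long range = 1UL << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */
	unsigned long offset = 0;
	unsigned long pivot;

	while (range > 1) {
		range >>= 1;
		pivot = offset + range;
		if (accessible(pivot << 20))
			offset = pivot;
	}
	return (offset + 1) << 20;
}

int main(void)
{
	printf("detected memory end: %lu MB\n", search_mem_end() >> 20);
	return 0;
}
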
#include "boot.h" const char hex_asc[] = "0123456789abcdef"; -#define add_val_as_hex(dst, val) \ - __add_val_as_hex(dst, (const unsigned char *)&val, sizeof(val)) +static char *as_hex(char *dst, unsigned long val, int pad) +{ + char *p, *end = p = dst + max(pad, (int)__fls(val | 1) / 4 + 1); + + for (*p-- = 0; p >= dst; val >>= 4) + *p-- = hex_asc[val & 0x0f]; + return end; +} -static char *__add_val_as_hex(char *dst, const unsigned char *src, size_t count) +static char *symstart(char *p) { - while (count--) - dst = hex_byte_pack(dst, *src++); - return dst; + while (*p) + p--; + return p + 1; } -static char *add_str(char *dst, char *src) +static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned short *len) { - strcpy(dst, src); - return dst + strlen(dst); + /* symbol entries are in a form "10000 c4 startup\0" */ + char *a = _decompressor_syms_start; + char *b = _decompressor_syms_end; + unsigned long start; + unsigned long size; + char *pivot; + char *endp; + + while (a < b) { + pivot = symstart(a + (b - a) / 2); + start = simple_strtoull(pivot, &endp, 16); + size = simple_strtoull(endp + 1, &endp, 16); + if (ip < start) { + b = pivot; + continue; + } + if (ip > start + size) { + a = pivot + strlen(pivot) + 1; + continue; + } + *off = ip - start; + *len = size; + return endp + 1; + } + return NULL; } -void print_pgm_check_info(void) +static noinline char *strsym(void *ip) { - struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area); - unsigned short ilc = S390_lowcore.pgm_ilc >> 1; - char buf[256]; - int row, col; + static char buf[64]; + unsigned short off; + unsigned short len; char *p; - add_str(buf, "Linux version "); - strlcat(buf, kernel_version, sizeof(buf)); - sclp_early_printk(buf); + p = findsym((unsigned long)ip, &off, &len); + if (p) { + strncpy(buf, p, sizeof(buf)); + /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */ + p = buf + strnlen(buf, sizeof(buf) - 15); + strcpy(p, "+0x"); + p = as_hex(p + 3, off, 0); + strcpy(p, "/0x"); + as_hex(p + 3, len, 0); + } else { + as_hex(buf, (unsigned long)ip, 16); + } + return buf; +} - p = add_str(buf, "Kernel fault: interruption code "); - p = add_val_as_hex(buf + strlen(buf), S390_lowcore.pgm_code); - p = add_str(p, " ilc:"); - *p++ = hex_asc_lo(ilc); - add_str(p, "\n"); - sclp_early_printk(buf); +void decompressor_printk(const char *fmt, ...) +{ + char buf[1024] = { 0 }; + char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */ + unsigned long pad; + char *p = buf; + va_list args; - p = add_str(buf, "PSW : "); - p = add_val_as_hex(p, S390_lowcore.psw_save_area.mask); - p = add_str(p, " "); - p = add_val_as_hex(p, S390_lowcore.psw_save_area.addr); - add_str(p, "\n"); + va_start(args, fmt); + for (; p < end && *fmt; fmt++) { + if (*fmt != '%') { + *p++ = *fmt; + continue; + } + pad = isdigit(*++fmt) ? 
simple_strtol(fmt, (char **)&fmt, 10) : 0; + switch (*fmt) { + case 's': + p = buf + strlcat(buf, va_arg(args, char *), sizeof(buf)); + break; + case 'p': + if (*++fmt != 'S') + goto out; + p = buf + strlcat(buf, strsym(va_arg(args, void *)), sizeof(buf)); + break; + case 'l': + if (*++fmt != 'x' || end - p <= max(sizeof(long) * 2, pad)) + goto out; + p = as_hex(p, va_arg(args, unsigned long), pad); + break; + case 'x': + if (end - p <= max(sizeof(int) * 2, pad)) + goto out; + p = as_hex(p, va_arg(args, unsigned int), pad); + break; + default: + goto out; + } + } +out: + va_end(args); sclp_early_printk(buf); +} - p = add_str(buf, " R:"); - *p++ = hex_asc_lo(psw->per); - p = add_str(p, " T:"); - *p++ = hex_asc_lo(psw->dat); - p = add_str(p, " IO:"); - *p++ = hex_asc_lo(psw->io); - p = add_str(p, " EX:"); - *p++ = hex_asc_lo(psw->ext); - p = add_str(p, " Key:"); - *p++ = hex_asc_lo(psw->key); - p = add_str(p, " M:"); - *p++ = hex_asc_lo(psw->mcheck); - p = add_str(p, " W:"); - *p++ = hex_asc_lo(psw->wait); - p = add_str(p, " P:"); - *p++ = hex_asc_lo(psw->pstate); - p = add_str(p, " AS:"); - *p++ = hex_asc_lo(psw->as); - p = add_str(p, " CC:"); - *p++ = hex_asc_lo(psw->cc); - p = add_str(p, " PM:"); - *p++ = hex_asc_lo(psw->pm); - p = add_str(p, " RI:"); - *p++ = hex_asc_lo(psw->ri); - p = add_str(p, " EA:"); - *p++ = hex_asc_lo(psw->eaba); - add_str(p, "\n"); - sclp_early_printk(buf); +void print_stacktrace(unsigned long sp) +{ + struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start, + (unsigned long)_stack_end }; + bool first = true; - for (row = 0; row < 4; row++) { - p = add_str(buf, row == 0 ? "GPRS:" : " "); - for (col = 0; col < 4; col++) { - p = add_str(p, " "); - p = add_val_as_hex(p, S390_lowcore.gpregs_save_area[row * 4 + col]); - } - add_str(p, "\n"); - sclp_early_printk(buf); + decompressor_printk("Call Trace:\n"); + while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) { + struct stack_frame *sf = (struct stack_frame *)sp; + + decompressor_printk(first ? 
"(sp:%016lx [<%016lx>] %pS)\n" : + " sp:%016lx [<%016lx>] %pS\n", + sp, sf->gprs[8], (void *)sf->gprs[8]); + if (sf->back_chain <= sp) + break; + sp = sf->back_chain; + first = false; } } + +void print_pgm_check_info(void) +{ + unsigned long *gpregs = (unsigned long *)S390_lowcore.gpregs_save_area; + struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area); + + decompressor_printk("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + decompressor_printk("Kernel command line: %s\n", early_command_line); + decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n", + S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1); + if (kaslr_enabled()) + decompressor_printk("Kernel random base: %lx\n", __kaslr_offset); + decompressor_printk("PSW : %016lx %016lx (%pS)\n", + S390_lowcore.psw_save_area.mask, + S390_lowcore.psw_save_area.addr, + (void *)S390_lowcore.psw_save_area.addr); + decompressor_printk( + " R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", + psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, + psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, + psw->eaba); + decompressor_printk("GPRS: %016lx %016lx %016lx %016lx\n", + gpregs[0], gpregs[1], gpregs[2], gpregs[3]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[4], gpregs[5], gpregs[6], gpregs[7]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[8], gpregs[9], gpregs[10], gpregs[11]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[12], gpregs[13], gpregs[14], gpregs[15]); + print_stacktrace(S390_lowcore.gpregs_save_area[15]); + decompressor_printk("Last Breaking-Event-Address:\n"); + decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break, + (void *)S390_lowcore.pgm_last_break); +} diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c new file mode 100644 index 000000000000..0cf79826eef9 --- /dev/null +++ b/arch/s390/boot/physmem_info.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/processor.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <asm/physmem_info.h> +#include <asm/stacktrace.h> +#include <asm/boot_data.h> +#include <asm/sparsemem.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/sclp.h> +#include <asm/uv.h> +#include "decompressor.h" +#include "boot.h" + +struct physmem_info __bootdata(physmem_info); +static unsigned int physmem_alloc_ranges; +static unsigned long physmem_alloc_pos; + +/* up to 256 storage elements, 1020 subincrements each */ +#define ENTRIES_EXTENDED_MAX \ + (256 * (1020 / 2) * sizeof(struct physmem_range)) + +static struct physmem_range *__get_physmem_range_ptr(u32 n) +{ + if (n < MEM_INLINED_ENTRIES) + return &physmem_info.online[n]; + if (unlikely(!physmem_info.online_extended)) { + physmem_info.online_extended = (struct physmem_range *)physmem_alloc_range( + RR_MEM_DETECT_EXTENDED, ENTRIES_EXTENDED_MAX, sizeof(long), 0, + physmem_alloc_pos, true); + } + return &physmem_info.online_extended[n - MEM_INLINED_ENTRIES]; +} + +/* + * sequential calls to add_physmem_online_range with adjacent memory ranges + * are merged together into single memory range. 
+ */ +void add_physmem_online_range(u64 start, u64 end) +{ + struct physmem_range *range; + + if (physmem_info.range_count) { + range = __get_physmem_range_ptr(physmem_info.range_count - 1); + if (range->end == start) { + range->end = end; + return; + } + } + + range = __get_physmem_range_ptr(physmem_info.range_count); + range->start = start; + range->end = end; + physmem_info.range_count++; +} + +static int __diag260(unsigned long rx1, unsigned long rx2) +{ + unsigned long reg1, reg2, ry; + union register_pair rx; + psw_t old; + int rc; + + rx.even = rx1; + rx.odd = rx2; + ry = 0x10; /* storage configuration */ + rc = -1; /* fail */ + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " diag %[rx],%[ry],0x260\n" + " ipm %[rc]\n" + " srl %[rc],28\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + [ry] "+&d" (ry), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [rx] "d" (rx.pair), + [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw) + : "cc", "memory"); + return rc == 0 ? ry : -1; +} + +static int diag260(void) +{ + int rc, i; + + struct { + unsigned long start; + unsigned long end; + } storage_extents[8] __aligned(16); /* VM supports up to 8 extents */ + + memset(storage_extents, 0, sizeof(storage_extents)); + rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents)); + if (rc == -1) + return -1; + + for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++) + add_physmem_online_range(storage_extents[i].start, storage_extents[i].end + 1); + return 0; +} + +static int tprot(unsigned long addr) +{ + unsigned long reg1, reg2; + int rc = -EFAULT; + psw_t old; + + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " tprot 0(%[addr]),0\n" + " ipm %[rc]\n" + " srl %[rc],28\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + "=Q" (S390_lowcore.program_new_psw.addr), + "=Q" (old) + : [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw), + [addr] "a" (addr) + : "cc", "memory"); + return rc; +} + +static unsigned long search_mem_end(void) +{ + unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ + unsigned long offset = 0; + unsigned long pivot; + + while (range > 1) { + range >>= 1; + pivot = offset + range; + if (!tprot(pivot << 20)) + offset = pivot; + } + return (offset + 1) << 20; +} + +unsigned long detect_max_physmem_end(void) +{ + unsigned long max_physmem_end = 0; + + if (!sclp_early_get_memsize(&max_physmem_end)) { + physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO; + } else { + max_physmem_end = search_mem_end(); + physmem_info.info_source = MEM_DETECT_BIN_SEARCH; + } + return max_physmem_end; +} + +void detect_physmem_online_ranges(unsigned long max_physmem_end) +{ + if (!sclp_early_read_storage_info()) { + physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO; + } else if (!diag260()) { + physmem_info.info_source = MEM_DETECT_DIAG260; + } else if (max_physmem_end) { + add_physmem_online_range(0, max_physmem_end); + } +} + +void physmem_set_usable_limit(unsigned long limit) +{ + physmem_info.usable = limit; + physmem_alloc_pos = limit; +} + +static void die_oom(unsigned 
long size, unsigned long align, unsigned long min, unsigned long max) +{ + unsigned long start, end, total_mem = 0, total_reserved_mem = 0; + struct reserved_range *range; + enum reserved_range_type t; + int i; + + decompressor_printk("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + decompressor_printk("Kernel command line: %s\n", early_command_line); + decompressor_printk("Out of memory allocating %lx bytes %lx aligned in range %lx:%lx\n", + size, align, min, max); + decompressor_printk("Reserved memory ranges:\n"); + for_each_physmem_reserved_range(t, range, &start, &end) { + decompressor_printk("%016lx %016lx %s\n", start, end, get_rr_type_name(t)); + total_reserved_mem += end - start; + } + decompressor_printk("Usable online memory ranges (info source: %s [%x]):\n", + get_physmem_info_source(), physmem_info.info_source); + for_each_physmem_usable_range(i, &start, &end) { + decompressor_printk("%016lx %016lx\n", start, end); + total_mem += end - start; + } + decompressor_printk("Usable online memory total: %lx Reserved: %lx Free: %lx\n", + total_mem, total_reserved_mem, + total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0); + print_stacktrace(current_frame_address()); + sclp_early_printk("\n\n -- System halted\n"); + disabled_wait(); +} + +void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) +{ + physmem_info.reserved[type].start = addr; + physmem_info.reserved[type].end = addr + size; +} + +void physmem_free(enum reserved_range_type type) +{ + physmem_info.reserved[type].start = 0; + physmem_info.reserved[type].end = 0; +} + +static bool __physmem_alloc_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + unsigned long res_addr, res_size; + int t; + + for (t = 0; t < RR_MAX; t++) { + if (!get_physmem_reserved(t, &res_addr, &res_size)) + continue; + if (intersects(addr, size, res_addr, res_size)) { + *intersection_start = res_addr; + return true; + } + } + return ipl_report_certs_intersects(addr, size, intersection_start); +} + +static unsigned long __physmem_alloc_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max, + unsigned int from_ranges, unsigned int *ranges_left, + bool die_on_oom) +{ + unsigned int nranges = from_ranges ?: physmem_info.range_count; + unsigned long range_start, range_end; + unsigned long intersection_start; + unsigned long addr, pos = max; + + align = max(align, 8UL); + while (nranges) { + __get_physmem_range(nranges - 1, &range_start, &range_end, false); + pos = min(range_end, pos); + + if (round_up(min, align) + size > pos) + break; + addr = round_down(pos - size, align); + if (range_start > addr) { + nranges--; + continue; + } + if (__physmem_alloc_intersects(addr, size, &intersection_start)) { + pos = intersection_start; + continue; + } + + if (ranges_left) + *ranges_left = nranges; + return addr; + } + if (die_on_oom) + die_oom(size, align, min, max); + return 0; +} + +unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size, + unsigned long align, unsigned long min, unsigned long max, + bool die_on_oom) +{ + unsigned long addr; + + max = min(max, physmem_alloc_pos); + addr = __physmem_alloc_range(size, align, min, max, 0, NULL, die_on_oom); + if (addr) + physmem_reserve(type, addr, size); + return addr; +} + +unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size, + unsigned long align) +{ + struct reserved_range *range = 
&physmem_info.reserved[type]; + struct reserved_range *new_range; + unsigned int ranges_left; + unsigned long addr; + + addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, physmem_alloc_ranges, + &ranges_left, true); + /* if not a consecutive allocation of the same type or first allocation */ + if (range->start != addr + size) { + if (range->end) { + physmem_alloc_pos = __physmem_alloc_range( + sizeof(struct reserved_range), 0, 0, physmem_alloc_pos, + physmem_alloc_ranges, &ranges_left, true); + new_range = (struct reserved_range *)physmem_alloc_pos; + *new_range = *range; + range->chain = new_range; + addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, + ranges_left, &ranges_left, true); + } + range->end = addr + size; + } + range->start = addr; + physmem_alloc_pos = addr; + physmem_alloc_ranges = ranges_left; + return addr; +} + +unsigned long get_physmem_alloc_pos(void) +{ + return physmem_alloc_pos; +} diff --git a/arch/s390/boot/sclp_early_core.c b/arch/s390/boot/sclp_early_core.c index 5a19fd7020b5..6f30646afbd0 100644 --- a/arch/s390/boot/sclp_early_core.c +++ b/arch/s390/boot/sclp_early_core.c @@ -1,2 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include "boot.h" #include "../../../drivers/s390/char/sclp_early_core.c" + +/* SCLP early buffer must stay page-aligned and below 2GB */ +static char __sclp_early_sccb[EXT_SCCB_READ_SCP] __aligned(PAGE_SIZE); + +void sclp_early_setup_buffer(void) +{ + sclp_early_set_buffer(&__sclp_early_sccb); +} diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 3b3a11f95269..9cc76e631759 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,55 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/string.h> #include <linux/elf.h> +#include <asm/page-states.h> +#include <asm/boot_data.h> #include <asm/sections.h> +#include <asm/maccess.h> +#include <asm/cpu_mf.h> #include <asm/setup.h> +#include <asm/kasan.h> #include <asm/kexec.h> #include <asm/sclp.h> #include <asm/diag.h> #include <asm/uv.h> -#include "compressed/decompressor.h" +#include <asm/abs_lowcore.h> +#include <asm/physmem_info.h> +#include "decompressor.h" #include "boot.h" +#include "uv.h" -extern char __boot_data_start[], __boot_data_end[]; -extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata_preserved(__abs_lowcore); +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +unsigned long __bootdata_preserved(VMALLOC_START); +unsigned long __bootdata_preserved(VMALLOC_END); +struct page *__bootdata_preserved(vmemmap); +unsigned long __bootdata_preserved(vmemmap_size); +unsigned long __bootdata_preserved(MODULES_VADDR); +unsigned long __bootdata_preserved(MODULES_END); +unsigned long __bootdata_preserved(max_mappable); +unsigned long __bootdata(ident_map_size); -/* - * Some code and data needs to stay below 2 GB, even when the kernel would be - * relocated above 2 GB, because it has to use 31 bit addresses. - * Such code and data is part of the .dma section, and its location is passed - * over to the decompressed / relocated kernel via the .boot.preserved.data - * section. 
- */ -extern char _sdma[], _edma[]; -extern char _stext_dma[], _etext_dma[]; -extern struct exception_table_entry _start_dma_ex_table[]; -extern struct exception_table_entry _stop_dma_ex_table[]; -unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma); -unsigned long __bootdata_preserved(__edma) = __pa(&_edma); -unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma); -unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma); -struct exception_table_entry * - __bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table; -struct exception_table_entry * - __bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table; - -int _diag210_dma(struct diag210 *addr); -int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode); -int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode); -void _diag0c_dma(struct hypfs_diag0c_entry *entry); -void _diag308_reset_dma(void); -struct diag_ops __bootdata_preserved(diag_dma_ops) = { - .diag210 = _diag210_dma, - .diag26c = _diag26c_dma, - .diag14 = _diag14_dma, - .diag0c = _diag0c_dma, - .diag308_reset = _diag308_reset_dma -}; -static struct diag210 _diag210_tmp_dma __section(.dma.data); -struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma; -void _swsusp_reset_dma(void); -unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma); +u64 __bootdata_preserved(stfle_fac_list[16]); +u64 __bootdata_preserved(alt_stfle_fac_list[16]); +struct oldmem_data __bootdata_preserved(oldmem_data); + +struct machine_info machine; void error(char *x) { @@ -60,6 +46,68 @@ void error(char *x) disabled_wait(); } +static void detect_facilities(void) +{ + if (test_facility(8)) { + machine.has_edat1 = 1; + local_ctl_set_bit(0, CR0_EDAT_BIT); + } + if (test_facility(78)) + machine.has_edat2 = 1; + if (test_facility(130)) + machine.has_nx = 1; +} + +static int cmma_test_essa(void) +{ + unsigned long reg1, reg2, tmp = 0; + int rc = 1; + psw_t old; + + /* Test ESSA_GET_STATE */ + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" + " la %[rc],0\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + [tmp] "=&d" (tmp), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw), + [cmd] "i" (ESSA_GET_STATE) + : "cc", "memory"); + return rc; +} + +static void cmma_init(void) +{ + if (!cmma_flag) + return; + if (cmma_test_essa()) { + cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; +} + +static void setup_lpp(void) +{ + S390_lowcore.current_pid = 0; + S390_lowcore.lpp = LPP_MAGIC; + if (test_facility(40)) + lpp(&S390_lowcore.lpp); +} + #ifdef CONFIG_KERNEL_UNCOMPRESSED unsigned long mem_safe_offset(void) { @@ -67,16 +115,20 @@ unsigned long mem_safe_offset(void) } #endif -static void rescue_initrd(unsigned long addr) +static void rescue_initrd(unsigned long min, unsigned long max) { + unsigned long old_addr, addr, size; + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) return; - if (!INITRD_START || !INITRD_SIZE) + if (!get_physmem_reserved(RR_INITRD, &addr, &size)) return; - if (addr <= INITRD_START) + if (addr >= min && addr + size <= max) return; - memmove((void *)addr, (void *)INITRD_START, INITRD_SIZE); - INITRD_START = addr; + old_addr = addr; + 
physmem_free(RR_INITRD); + addr = physmem_alloc_top_down(RR_INITRD, size, 0); + memmove((void *)addr, (void *)old_addr, size); } static void copy_bootdata(void) @@ -120,64 +172,252 @@ static void handle_relocs(unsigned long offset) } } -static void clear_bss_section(void) +/* + * Merge information from several sources into a single ident_map_size value. + * "ident_map_size" represents the upper limit of physical memory we may ever + * reach. It might not be all online memory, but might also include standby + * (offline) memory. "ident_map_size" could be lower than the actual standby or + * even online memory present, due to limiting factors. We should never go + * above this limit. It is the size of our identity mapping. + * + * Consider the following factors: + * 1. max_physmem_end - end of physical memory online or standby. + * Always >= end of the last online memory range (get_physmem_online_end()). + * 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the + * kernel is able to support. + * 3. "mem=" kernel command line option which limits physical memory usage. + * 4. OLDMEM_BASE which is a kdump memory limit when the kernel is executed as + * a crash kernel. + * 5. "hsa" size which is a memory limit when the kernel is executed during + * zfcp/nvme dump. + */ +static void setup_ident_map_size(unsigned long max_physmem_end) +{ + unsigned long hsa_size; + + ident_map_size = max_physmem_end; + if (memory_limit) + ident_map_size = min(ident_map_size, memory_limit); + ident_map_size = min(ident_map_size, 1UL << MAX_PHYSMEM_BITS); + +#ifdef CONFIG_CRASH_DUMP + if (oldmem_data.start) { + __kaslr_enabled = 0; + ident_map_size = min(ident_map_size, oldmem_data.size); + } else if (ipl_block_valid && is_ipl_block_dump()) { + __kaslr_enabled = 0; + if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size) + ident_map_size = min(ident_map_size, hsa_size); + } +#endif +} + +static unsigned long setup_kernel_memory_layout(void) +{ + unsigned long vmemmap_start; + unsigned long asce_limit; + unsigned long rte_size; + unsigned long pages; + unsigned long vsize; + unsigned long vmax; + + pages = ident_map_size / PAGE_SIZE; + /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ + vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); + + /* choose kernel address space layout: 4 or 3 levels. */ + vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size + + MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE; + vsize = size_add(vsize, vmalloc_size); + if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) { + asce_limit = _REGION1_SIZE; + rte_size = _REGION2_SIZE; + } else { + asce_limit = _REGION2_SIZE; + rte_size = _REGION3_SIZE; + } + + /* + * Force modules and the vmalloc area below the ultravisor + * secure storage limit, so that any vmalloc allocation + * we do can be used to back secure guest storage. 
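+ * + * A minimal illustration (values assumed purely for the sake of example): + * on a prot_virt=1 host with asce_limit = _REGION2_SIZE and a machine + * reporting a smaller uv_info.max_sec_stor_addr, adjust_to_uv_max() below + * returns the latter, so everything laid out downwards from vmax stays + * addressable as backing for secure guest storage. 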
+ */ + vmax = adjust_to_uv_max(asce_limit); +#ifdef CONFIG_KASAN + /* force vmalloc and modules below kasan shadow */ + vmax = min(vmax, KASAN_SHADOW_START); +#endif + __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE); + __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, + sizeof(struct lowcore)); + MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE); + MODULES_VADDR = MODULES_END - MODULES_LEN; + VMALLOC_END = MODULES_VADDR; + + /* allow the vmalloc area to occupy up to about half of the remaining virtual space */ + vsize = round_down(VMALLOC_END / 2, _SEGMENT_SIZE); + vmalloc_size = min(vmalloc_size, vsize); + VMALLOC_START = VMALLOC_END - vmalloc_size; + + /* split remaining virtual space between 1:1 mapping & vmemmap array */ + pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + pages = SECTION_ALIGN_UP(pages); + /* keep vmemmap_start aligned to a top level region table entry */ + vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); + vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); + /* maximum mappable address as seen by arch_get_mappable_range() */ + max_mappable = vmemmap_start; + /* make sure the identity map doesn't overlap with vmemmap */ + ident_map_size = min(ident_map_size, vmemmap_start); + vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); + /* make sure vmemmap doesn't overlap with the vmalloc area */ + VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); + vmemmap = (struct page *)vmemmap_start; + + return asce_limit; +} + +/* + * This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's. + */ +static void clear_bss_section(unsigned long vmlinux_lma) +{ + memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size); +} + +/* + * Set the vmalloc area size to an eighth of the (potential) physical memory + * size, unless the size has been set by a kernel command line parameter. + */ +static void setup_vmalloc_size(void) +{ + unsigned long size; + + if (vmalloc_size_set) + return; + size = round_up(ident_map_size / 8, _SEGMENT_SIZE); + vmalloc_size = max(size, vmalloc_size); +} + +static void offset_vmlinux_info(unsigned long offset) { - memset((void *)vmlinux.default_lma + vmlinux.image_size, 0, vmlinux.bss_size); + *(unsigned long *)(&vmlinux.entry) += offset; + vmlinux.bootdata_off += offset; + vmlinux.bootdata_preserved_off += offset; + vmlinux.rela_dyn_start += offset; + vmlinux.rela_dyn_end += offset; + vmlinux.dynsym_start += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; +#ifdef CONFIG_KASAN + vmlinux.kasan_early_shadow_page_off += offset; + vmlinux.kasan_early_shadow_pte_off += offset; + vmlinux.kasan_early_shadow_pmd_off += offset; + vmlinux.kasan_early_shadow_pud_off += offset; + vmlinux.kasan_early_shadow_p4d_off += offset; +#endif } void startup_kernel(void) { - unsigned long random_lma; + unsigned long max_physmem_end; + unsigned long vmlinux_lma = 0; + unsigned long amode31_lma = 0; + unsigned long asce_limit; unsigned long safe_addr; void *img; + psw_t psw; - store_ipl_parmblock(); + setup_lpp(); safe_addr = mem_safe_offset(); - safe_addr = read_ipl_report(safe_addr); + + /* + * Reserve decompressor memory together with the decompression heap and + * buffer, and the memory which might be occupied by the uncompressed + * kernel at the default 1 MB position (if KASLR is off or has failed). 
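+ * + * Note that physmem_reserve() only records the range in + * physmem_info.reserved[]; nothing is mapped or zeroed here. Later + * allocations through __physmem_alloc_range() consult these records via + * __physmem_alloc_intersects() and step over any intersecting range. 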
+ */ + physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size) + physmem_reserve(RR_INITRD, parmarea.initrd_start, parmarea.initrd_size); + oldmem_data.start = parmarea.oldmem_base; + oldmem_data.size = parmarea.oldmem_size; + + store_ipl_parmblock(); + read_ipl_report(); uv_query_info(); - rescue_initrd(safe_addr); sclp_early_read_info(); setup_boot_command_line(); parse_boot_command_line(); - setup_memory_end(); - detect_memory(); - - random_lma = __kaslr_offset = 0; - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { - random_lma = get_random_base(safe_addr); - if (random_lma) { - __kaslr_offset = random_lma - vmlinux.default_lma; - img = (void *)vmlinux.default_lma; - vmlinux.default_lma += __kaslr_offset; - vmlinux.entry += __kaslr_offset; - vmlinux.bootdata_off += __kaslr_offset; - vmlinux.bootdata_preserved_off += __kaslr_offset; - vmlinux.rela_dyn_start += __kaslr_offset; - vmlinux.rela_dyn_end += __kaslr_offset; - vmlinux.dynsym_start += __kaslr_offset; + detect_facilities(); + cmma_init(); + sanitize_prot_virt_host(); + max_physmem_end = detect_max_physmem_end(); + setup_ident_map_size(max_physmem_end); + setup_vmalloc_size(); + asce_limit = setup_kernel_memory_layout(); + /* ident_map_size is final now, physmem allocations can be performed */ + physmem_set_usable_limit(ident_map_size); + detect_physmem_online_ranges(max_physmem_end); + save_ipl_cert_comp_list(); + rescue_initrd(safe_addr, ident_map_size); + + if (kaslr_enabled()) { + vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size, + THREAD_SIZE, vmlinux.default_lma, + ident_map_size); + if (vmlinux_lma) { + __kaslr_offset = vmlinux_lma - vmlinux.default_lma; + offset_vmlinux_info(__kaslr_offset); } } + vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma; + physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size); if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { img = decompress_kernel(); - memmove((void *)vmlinux.default_lma, img, vmlinux.image_size); - } else if (__kaslr_offset) - memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + memmove((void *)vmlinux_lma, img, vmlinux.image_size); + } else if (__kaslr_offset) { + img = (void *)vmlinux.default_lma; + memmove((void *)vmlinux_lma, img, vmlinux.image_size); + memset(img, 0, vmlinux.image_size); + } - clear_bss_section(); + /* vmlinux decompression is done, shrink reserved low memory */ + physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); + if (kaslr_enabled()) + amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G); + amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size; + physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); + + /* + * The order of the following operations is important: + * + * - handle_relocs() must follow clear_bss_section() to establish static + * memory references to data in .bss to be used by setup_vmem() + * (i.e. init_mm.pgd) + * + * - setup_vmem() must follow handle_relocs() to be able to use + * static memory references to data in .bss (i.e. init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes to + * bootdata made by setup_vmem() + */ + clear_bss_section(vmlinux_lma); + handle_relocs(__kaslr_offset); + setup_vmem(asce_limit); copy_bootdata(); - if (IS_ENABLED(CONFIG_RELOCATABLE)) - handle_relocs(__kaslr_offset); - - if (__kaslr_offset) { - /* - * Save KASLR offset for early dumps, before vmcore_info is set. 
- * Mark as uneven to distinguish from real vmcore_info pointer. - */ - S390_lowcore.vmcore_info = __kaslr_offset | 0x1UL; - /* Clear non-relocated kernel */ - if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) - memset(img, 0, vmlinux.image_size); - } - vmlinux.entry(); + + /* + * Save KASLR offset for early dumps, before vmcore_info is set. + * Mark as uneven to distinguish from real vmcore_info pointer. + */ + S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0; + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. + */ + psw.addr = vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + __load_psw(psw); } diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c index b11e8108773a..faccb33b462c 100644 --- a/arch/s390/boot/string.c +++ b/arch/s390/boot/string.c @@ -3,6 +3,7 @@ #include <linux/kernel.h> #include <linux/errno.h> #undef CONFIG_KASAN +#undef CONFIG_KASAN_GENERIC #include "../lib/string.c" int strncmp(const char *cs, const char *ct, size_t count) diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S deleted file mode 100644 index 9715715c4c28..000000000000 --- a/arch/s390/boot/text_dma.S +++ /dev/null @@ -1,184 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Code that needs to run below 2 GB. - * - * Copyright IBM Corp. 2019 - */ - -#include <linux/linkage.h> -#include <asm/errno.h> -#include <asm/sigp.h> - -#ifdef CC_USING_EXPOLINE - .pushsection .dma.text.__s390_indirect_jump_r14,"axG" -__dma__s390_indirect_jump_r14: - larl %r1,0f - ex 0,0(%r1) - j . -0: br %r14 - .popsection -#endif - - .section .dma.text,"ax" -/* - * Simplified version of expoline thunk. The normal thunks can not be used here, - * because they might be more than 2 GB away, and not reachable by the relative - * branch. No comdat, exrl, etc. optimizations used here, because it only - * affects a few functions that are not performance-relevant. 
- */ - .macro BR_EX_DMA_r14 -#ifdef CC_USING_EXPOLINE - jg __dma__s390_indirect_jump_r14 -#else - br %r14 -#endif - .endm - -/* - * int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode) - */ -ENTRY(_diag14_dma) - lgr %r1,%r2 - lgr %r2,%r3 - lgr %r3,%r4 - lhi %r5,-EIO - sam31 - diag %r1,%r2,0x14 -.Ldiag14_ex: - ipm %r5 - srl %r5,28 -.Ldiag14_fault: - sam64 - lgfr %r2,%r5 - BR_EX_DMA_r14 - EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault) -ENDPROC(_diag14_dma) - -/* - * int _diag210_dma(struct diag210 *addr) - */ -ENTRY(_diag210_dma) - lgr %r1,%r2 - lhi %r2,-1 - sam31 - diag %r1,%r0,0x210 -.Ldiag210_ex: - ipm %r2 - srl %r2,28 -.Ldiag210_fault: - sam64 - lgfr %r2,%r2 - BR_EX_DMA_r14 - EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault) -ENDPROC(_diag210_dma) - -/* - * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode) - */ -ENTRY(_diag26c_dma) - lghi %r5,-EOPNOTSUPP - sam31 - diag %r2,%r4,0x26c -.Ldiag26c_ex: - sam64 - lgfr %r2,%r5 - BR_EX_DMA_r14 - EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex) -ENDPROC(_diag26c_dma) - -/* - * void _diag0c_dma(struct hypfs_diag0c_entry *entry) - */ -ENTRY(_diag0c_dma) - sam31 - diag %r2,%r2,0x0c - sam64 - BR_EX_DMA_r14 -ENDPROC(_diag0c_dma) - -/* - * void _swsusp_reset_dma(void) - */ -ENTRY(_swsusp_reset_dma) - larl %r1,restart_entry - larl %r2,.Lrestart_diag308_psw - og %r1,0(%r2) - stg %r1,0(%r0) - lghi %r0,0 - diag %r0,%r0,0x308 -restart_entry: - lhi %r1,1 - sigp %r1,%r0,SIGP_SET_ARCHITECTURE - sam64 - BR_EX_DMA_r14 -ENDPROC(_swsusp_reset_dma) - -/* - * void _diag308_reset_dma(void) - * - * Calls diag 308 subcode 1 and continues execution - */ -ENTRY(_diag308_reset_dma) - larl %r4,.Lctlregs # Save control registers - stctg %c0,%c15,0(%r4) - lg %r2,0(%r4) # Disable lowcore protection - nilh %r2,0xefff - larl %r4,.Lctlreg0 - stg %r2,0(%r4) - lctlg %c0,%c0,0(%r4) - larl %r4,.Lfpctl # Floating point control register - stfpc 0(%r4) - larl %r4,.Lprefix # Save prefix register - stpx 0(%r4) - larl %r4,.Lprefix_zero # Set prefix register to 0 - spx 0(%r4) - larl %r4,.Lcontinue_psw # Save PSW flags - epsw %r2,%r3 - stm %r2,%r3,0(%r4) - larl %r4,restart_part2 # Setup restart PSW at absolute 0 - larl %r3,.Lrestart_diag308_psw - og %r4,0(%r3) # Save PSW - lghi %r3,0 - sturg %r4,%r3 # Use sturg, because of large pages - lghi %r1,1 - lghi %r0,0 - diag %r0,%r1,0x308 -restart_part2: - lhi %r0,0 # Load r0 with zero - lhi %r1,2 # Use mode 2 = ESAME (dump) - sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode - sam64 # Switch to 64 bit addressing mode - larl %r4,.Lctlregs # Restore control registers - lctlg %c0,%c15,0(%r4) - larl %r4,.Lfpctl # Restore floating point ctl register - lfpc 0(%r4) - larl %r4,.Lprefix # Restore prefix register - spx 0(%r4) - larl %r4,.Lcontinue_psw # Restore PSW flags - lpswe 0(%r4) -.Lcontinue: - BR_EX_DMA_r14 -ENDPROC(_diag308_reset_dma) - - .section .dma.data,"aw",@progbits -.align 8 -.Lrestart_diag308_psw: - .long 0x00080000,0x80000000 - -.align 8 -.Lcontinue_psw: - .quad 0,.Lcontinue - -.align 8 -.Lctlreg0: - .quad 0 -.Lctlregs: - .rept 16 - .quad 0 - .endr -.Lfpctl: - .long 0 -.Lprefix: - .long 0 -.Lprefix_zero: - .long 0 diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index ed007f4a6444..1e66d2cbb096 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -1,9 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/uv.h> +#include <asm/boot_data.h> #include <asm/facility.h> #include <asm/sections.h> +#include "boot.h" +#include "uv.h" + +/* will be used in arch/s390/kernel/uv.c */ +#ifdef 
CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); +#endif +#if IS_ENABLED(CONFIG_KVM) +int __bootdata_preserved(prot_virt_host); +#endif +struct uv_info __bootdata_preserved(uv_info); void uv_query_info(void) { @@ -15,10 +26,70 @@ void uv_query_info(void) if (!test_facility(158)) return; - if (uv_call(0, (uint64_t)&uvcb)) + /* rc==0x100 means that there is additional data we do not process */ + if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100) return; + if (IS_ENABLED(CONFIG_KVM)) { + memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list)); + uv_info.uv_base_stor_len = uvcb.uv_base_stor_len; + uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len; + uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len; + uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len; + uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len; + uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE); + uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; + uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id; + uv_info.uv_feature_indications = uvcb.uv_feature_indications; + uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions; + uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf; + uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len; + uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len; + uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver; + uv_info.supp_att_pflags = uvcb.supp_att_pflags; + uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver; + uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf; + uv_info.supp_secret_types = uvcb.supp_secret_types; + uv_info.max_secrets = uvcb.max_secrets; + } + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) prot_virt_guest = 1; +#endif +} + +#if IS_ENABLED(CONFIG_KVM) +unsigned long adjust_to_uv_max(unsigned long limit) +{ + if (is_prot_virt_host() && uv_info.max_sec_stor_addr) + limit = min_t(unsigned long, limit, uv_info.max_sec_stor_addr); + return limit; +} + +static int is_prot_virt_host_capable(void) +{ + /* disable if no prot_virt=1 given on command-line */ + if (!is_prot_virt_host()) + return 0; + /* disable if protected guest virtualization is enabled */ + if (is_prot_virt_guest()) + return 0; + /* disable if no hardware support */ + if (!test_facility(158)) + return 0; + /* disable if kdump */ + if (oldmem_data.start) + return 0; + /* disable if stand-alone dump */ + if (ipl_block_valid && is_ipl_block_dump()) + return 0; + return 1; +} + +void sanitize_prot_virt_host(void) +{ + prot_virt_host = is_prot_virt_host_capable(); } +#endif diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h new file mode 100644 index 000000000000..0f3070856f8d --- /dev/null +++ b/arch/s390/boot/uv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_UV_H +#define BOOT_UV_H + +#if IS_ENABLED(CONFIG_KVM) +unsigned long adjust_to_uv_max(unsigned long limit); +void sanitize_prot_virt_host(void); +#else +static inline unsigned long adjust_to_uv_max(unsigned long limit) +{ + return limit; +} +static inline void sanitize_prot_virt_host(void) {} +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +void uv_query_info(void); +#else +static inline void uv_query_info(void) {} +#endif + +#endif /* BOOT_UV_H */ diff --git 
a/arch/s390/boot/version.c b/arch/s390/boot/version.c index d32e58bdda6a..fd32f038777f 100644 --- a/arch/s390/boot/version.c +++ b/arch/s390/boot/version.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <generated/utsversion.h> #include <generated/utsrelease.h> #include <generated/compile.h> #include "boot.h" diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 000000000000..e3a4500a5a75 --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched/task.h> +#include <linux/pgtable.h> +#include <linux/kasan.h> +#include <asm/page-states.h> +#include <asm/pgalloc.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/ctlreg.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> +#include <asm/abs_lowcore.h> +#include "decompressor.h" +#include "boot.h" + +struct ctlreg __bootdata_preserved(s390_invalid_asce); + +#ifdef CONFIG_PROC_FS +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); +#endif + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +enum populate_mode { + POPULATE_NONE, + POPULATE_DIRECT, + POPULATE_ABS_LOWCORE, +#ifdef CONFIG_KASAN + POPULATE_KASAN_MAP_SHADOW, + POPULATE_KASAN_ZERO_SHADOW, + POPULATE_KASAN_SHALLOW +#endif +}; + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode); + +#ifdef CONFIG_KASAN + +#define kasan_early_shadow_page vmlinux.kasan_early_shadow_page_off +#define kasan_early_shadow_pte ((pte_t *)vmlinux.kasan_early_shadow_pte_off) +#define kasan_early_shadow_pmd ((pmd_t *)vmlinux.kasan_early_shadow_pmd_off) +#define kasan_early_shadow_pud ((pud_t *)vmlinux.kasan_early_shadow_pud_off) +#define kasan_early_shadow_p4d ((p4d_t *)vmlinux.kasan_early_shadow_p4d_off) +#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) + +static pte_t pte_z; + +static inline void kasan_populate(unsigned long start, unsigned long end, enum populate_mode mode) +{ + start = PAGE_ALIGN_DOWN(__sha(start)); + end = PAGE_ALIGN(__sha(end)); + pgtable_populate(start, end, mode); +} + +static void kasan_populate_shadow(void) +{ + pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); + pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); + p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); + unsigned long memgap_start = 0; + unsigned long untracked_end; + unsigned long start, end; + int i; + + pte_z = __pte(__pa(kasan_early_shadow_page) | pgprot_val(PAGE_KERNEL_RO)); + if (!machine.has_nx) + pte_z = clear_pte_bit(pte_z, __pgprot(_PAGE_NOEXEC)); + crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); + memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); + __arch_set_page_dat(kasan_early_shadow_p4d, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pud, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pte, 1); + + /* + * Current memory layout: + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... 
| / | kasan | + * | | / | zero page | + * +- vmalloc area -+ / | mapping | + * | vmalloc_size | / | (untracked) | + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ | unmapped | allocated per module + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ + * + * Current memory layout (KASAN_VMALLOC): + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... | / | kasan zero page| (untracked) + * | | / | mapping | + * +- vmalloc area -+ / +----------------+ + * | vmalloc_size | / |shallow populate| + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ |shallow populate| + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ + */ + + for_each_physmem_usable_range(i, &start, &end) { + kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW); + if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) + kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW); + memgap_start = end; + } + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + untracked_end = VMALLOC_START; + /* shallowly populate kasan shadow for vmalloc and modules */ + kasan_populate(VMALLOC_START, MODULES_END, POPULATE_KASAN_SHALLOW); + } else { + untracked_end = MODULES_VADDR; + } + /* populate kasan shadow for untracked memory */ + kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW); +} + +static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + pgd_populate(&init_mm, pgd, kasan_early_shadow_p4d); + return true; + } + return false; +} + +static bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + p4d_populate(&init_mm, p4d, kasan_early_shadow_pud); + return true; + } + return false; +} + +static bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pud_populate(&init_mm, pud, kasan_early_shadow_pmd); + return true; + } + return false; +} + +static bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate(&init_mm, pmd, kasan_early_shadow_pte); + return true; + } + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW) { + set_pte(pte, pte_z); + return true; + } + return false; +} +#else + +static inline void kasan_populate_shadow(void) {} + +static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool 
kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + return false; +} + +#endif + +/* + * Mimic virt_to_kpte() in lack of init_mm symbol. Skip pmd NULL check though. + */ +static inline pte_t *__virt_to_kpte(unsigned long va) +{ + return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va), va); +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long size = PAGE_SIZE << CRST_ALLOC_ORDER; + unsigned long *table; + + table = (unsigned long *)physmem_alloc_top_down(RR_VMEM, size, size); + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + /* + * handling pte_leftovers this way helps to avoid memory fragmentation + * during POPULATE_KASAN_MAP_SHADOW when EDAT is off + */ + if (!pte_leftover) { + pte_leftover = (void *)physmem_alloc_top_down(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + pte = pte_leftover + _PAGE_TABLE_SIZE; + __arch_set_page_dat(pte, 1); + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_mode mode) +{ + switch (mode) { + case POPULATE_NONE: + return -1; + case POPULATE_DIRECT: + return addr; + case POPULATE_ABS_LOWCORE: + return __abs_lowcore_pa(addr); +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: + addr = physmem_alloc_top_down(RR_VMEM, size, size); + memset((void *)addr, 0, size); + return addr; +#endif + default: + return -1; + } +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat2 && + IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; +} + +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat1 && + IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pages = 0; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + if (kasan_pte_populate_zero_shadow(pte, mode)) + continue; + entry = __pte(_pa(addr, PAGE_SIZE, mode)); + entry = set_pte_bit(entry, PAGE_KERNEL); + if (!machine.has_nx) + entry = clear_pte_bit(entry, __pgprot(_PAGE_NOEXEC)); + set_pte(pte, entry); + pages++; + } + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_4K, pages); +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next, pages = 0; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode)) + continue; + if (can_large_pmd(pmd, addr, next)) { + entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode)); + entry = set_pmd_bit(entry, SEGMENT_KERNEL); + if (!machine.has_nx) + entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + set_pmd(pmd, entry); 
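+ /* + * The can_large_pmd() check above amounts to a pure alignment and + * extent test, e.g. addr == 0x100000 with next == 0x200000 (one + * whole, naturally aligned 1 MB segment) qualifies when EDAT1 is + * available; any partial segment instead falls through to + * boot_pte_alloc() and is mapped with 4 KB ptes. + */ 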
+ pages++; + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next, mode); + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_1M, pages); +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next, pages = 0; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (kasan_pud_populate_zero_shadow(pud, addr, next, mode)) + continue; + if (can_large_pud(pud, addr, next)) { + entry = __pud(_pa(addr, _REGION3_SIZE, mode)); + entry = set_pud_bit(entry, REGION3_KERNEL); + if (!machine.has_nx) + entry = clear_pud_bit(entry, __pgprot(_REGION_ENTRY_NOEXEC)); + set_pud(pud, entry); + pages++; + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next, mode); + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_2G, pages); +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + if (kasan_p4d_populate_zero_shadow(p4d, addr, next, mode)) + continue; + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next, mode); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + if (kasan_pgd_populate_zero_shadow(pgd, addr, next, mode)) + continue; + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } +#ifdef CONFIG_KASAN + if (mode == POPULATE_KASAN_SHALLOW) + continue; +#endif + pgtable_p4d_populate(pgd, addr, next, mode); + } +} + +void setup_vmem(unsigned long asce_limit) +{ + unsigned long start, end; + unsigned long asce_type; + unsigned long asce_bits; + int i; + + /* + * Mark whole memory as no-dat. This must be done before any + * page tables are allocated, or kernel image builtin pages + * are marked as dat tables. + */ + for_each_physmem_online_range(i, &start, &end) + __arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT); + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce.val = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + __arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER); + + /* + * To allow prefixing the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. 
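+ * + * This works because sizeof(struct lowcore) is far smaller than + * _SEGMENT_SIZE: can_large_pmd() rejects the first populate below, so + * the segment at address 0 receives a pte table, and the usable-range + * loop afterwards only fills the remaining ptes of that segment instead + * of installing a large mapping over the lowcore. 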
+ */ + pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT); + for_each_physmem_usable_range(i, &start, &end) + pgtable_populate(start, end, POPULATE_DIRECT); + pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), + POPULATE_ABS_LOWCORE); + pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE, + POPULATE_NONE); + memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area); + + kasan_populate_shadow(); + + S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits; + S390_lowcore.user_asce = s390_invalid_asce; + + local_ctl_load(1, &S390_lowcore.kernel_asce); + local_ctl_load(7, &S390_lowcore.user_asce); + local_ctl_load(13, &S390_lowcore.kernel_asce); + + init_mm.context.asce = S390_lowcore.kernel_asce.val; +} diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 44561b2c3712..389df0e0d9e5 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -1,6 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm-generic/vmlinux.lds.h> #include <asm/vmlinux.lds.h> +#include <asm/thread_info.h> +#include <asm/page.h> +#include <asm/sclp.h> +#include "boot.h" OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) @@ -10,11 +14,19 @@ ENTRY(startup) SECTIONS { . = 0; + .ipldata : { + *(.ipldata) + } + . = IPL_START; .head.text : { _head = . ; HEAD_TEXT _ehead = . ; } + . = PARMAREA; + .parmarea : { + *(.parmarea) + } .text : { _text = .; /* Text */ *(.text) @@ -27,38 +39,42 @@ SECTIONS *(.rodata.*) _erodata = . ; } + NOTES .data : { _data = . ; *(.data) *(.data.*) _edata = . ; } - /* - * .dma section for code, data, ex_table that need to stay below 2 GB, - * even when the kernel is relocated above 2 GB. - */ - . = ALIGN(PAGE_SIZE); - _sdma = .; - .dma.text : { - _stext_dma = .; - *(.dma.text) - . = ALIGN(PAGE_SIZE); - _etext_dma = .; - } - . = ALIGN(16); - .dma.ex_table : { - _start_dma_ex_table = .; - KEEP(*(.dma.ex_table)) - _stop_dma_ex_table = .; - } - .dma.data : { *(.dma.data) } - . = ALIGN(PAGE_SIZE); - _edma = .; BOOT_DATA BOOT_DATA_PRESERVED /* + * This is the BSS section of the decompressor and not of the decompressed Linux kernel. + * It will consume space in the decompressor's image. + */ + . = ALIGN(8); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + /* + * Stacks for the decompressor + */ + . = ALIGN(PAGE_SIZE); + _dump_info_stack_start = .; + . += PAGE_SIZE; + _dump_info_stack_end = .; + . = ALIGN(PAGE_SIZE); + _stack_start = .; + . += BOOT_STACK_SIZE; + _stack_end = .; + _ebss = .; + } + + /* * uncompressed image info used by the decompressor; it should match * struct vmlinux_info. It comes from the .vmlinux.info section of the * uncompressed vmlinux in the form of info.o @@ -69,6 +85,16 @@ SECTIONS *(.vmlinux.info) } + .decompressor.syms : { + . += 1; /* make sure we have \0 before the first entry */ + . = ALIGN(2); + _decompressor_syms_start = .; + *(.decompressor.syms) + _decompressor_syms_end = .; + } + + _decompressor_end = .; + #ifdef CONFIG_KERNEL_UNCOMPRESSED . = 0x100000; #else @@ -78,17 +104,17 @@ SECTIONS _compressed_start = .; *(.vmlinux.bin.compressed) _compressed_end = .; - FILL(0xff); - . = ALIGN(4096); } - . = ALIGN(256); - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - . = ALIGN(8); /* For convenience during zeroing */ - _ebss = .; + +#define SB_TRAILER_SIZE 32 + /* Trailer needed for Secure Boot */ + . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */ + . 
= ALIGN(4096) - SB_TRAILER_SIZE; + .sb.trailer : { + QUAD(0) + QUAD(0) + QUAD(0) + QUAD(0x000000207a49504c) } _end = .; diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config new file mode 100644 index 000000000000..eb7f84f5925c --- /dev/null +++ b/arch/s390/configs/btf.config @@ -0,0 +1,2 @@ +# Help: Enable BTF debug info +CONFIG_DEBUG_INFO_BTF=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 2e60c80395ab..cae2dd34fbb4 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -1,9 +1,16 @@ +CONFIG_UAPI_HEADER_TEST=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y +CONFIG_WATCH_QUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_LSM=y CONFIG_PREEMPT=y +CONFIG_SCHED_CORE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y @@ -14,10 +21,8 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y @@ -27,55 +32,57 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_MISC=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y -CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set -CONFIG_BPF_SYSCALL=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y -CONFIG_TUNE_ZEC12=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_SIG=y +CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_CRASH_DUMP=y -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m -CONFIG_VHOST_NET=m -CONFIG_VHOST_VSOCK=m -CONFIG_OPROFILE=m +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y +CONFIG_SECCOMP_CACHE_DEBUG=y CONFIG_LOCK_EVENT_COUNTS=y +# CONFIG_GCC_PLUGINS is not set CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG_SHA256=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_CGROUP_IOPRIO=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_BSD_DISKLABEL=y @@ -83,25 +90,27 @@ CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y +CONFIG_ZSMALLOC_STAT=y +CONFIG_SLUB_STATS=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZBUD=m -CONFIG_ZSMALLOC=m 
-CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y -CONFIG_GUP_BENCHMARK=y +CONFIG_GUP_TEST=y +CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -129,6 +138,7 @@ CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m +CONFIG_INET_ESPINTCP=y CONFIG_INET_IPCOMP=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m @@ -143,6 +153,7 @@ CONFIG_TCP_CONG_ILLINOIS=m CONFIG_IPV6_ROUTER_PREF=y CONFIG_INET6_AH=m CONFIG_INET6_ESP=m +CONFIG_INET6_ESPINTCP=y CONFIG_INET6_IPCOMP=m CONFIG_IPV6_MIP6=m CONFIG_IPV6_VTI=m @@ -150,9 +161,14 @@ CONFIG_IPV6_SIT=m CONFIG_IPV6_GRE=m CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_MPTCP=y CONFIG_NETFILTER=y +CONFIG_BRIDGE_NETFILTER=m +CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -169,13 +185,16 @@ CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m CONFIG_NFT_NAT=m +CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m +CONFIG_NFT_FIB_INET=m +CONFIG_NETFILTER_XTABLES_COMPAT=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -264,10 +283,12 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y +CONFIG_NF_LOG_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -278,7 +299,6 @@ CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -286,7 +306,7 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m CONFIG_IP6_NF_MATCH_EUI64=m @@ -315,11 +335,11 @@ CONFIG_L2TP_DEBUGFS=m CONFIG_L2TP_V3=y CONFIG_L2TP_IP=m CONFIG_L2TP_ETH=m -CONFIG_BRIDGE=m +CONFIG_BRIDGE=y +CONFIG_BRIDGE_MRP=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m @@ -330,7 +350,6 @@ CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_DRR=m CONFIG_NET_SCH_MQPRIO=m @@ -340,15 +359,13 @@ CONFIG_NET_SCH_CODEL=m CONFIG_NET_SCH_FQ_CODEL=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m @@ -357,30 +374,33 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m 
CONFIG_VIRTIO_VSOCKETS=m CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y CONFIG_CGROUP_NET_PRIO=y -CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m -# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y +# CONFIG_PCIEASPM is not set CONFIG_PCI_DEBUG=y +CONFIG_PCI_IOV=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set CONFIG_CONNECTOR=y -CONFIG_ZRAM=m +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y @@ -401,12 +421,12 @@ CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m CONFIG_SCSI_SRP_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y +CONFIG_ZFCP=m CONFIG_SCSI_VIRTIO=m CONFIG_SCSI_DH=y CONFIG_SCSI_DH_RDAC=m @@ -415,12 +435,13 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not set CONFIG_MD_LINEAR=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m -CONFIG_BLK_DEV_DM=m +CONFIG_BLK_DEV_DM=y CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m @@ -434,12 +455,16 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m CONFIG_DM_DELAY=m +CONFIG_DM_INIT=y CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m +CONFIG_DM_INTEGRITY=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m @@ -447,6 +472,9 @@ CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +CONFIG_VXLAN=m +CONFIG_BAREUDP=m +CONFIG_AMT=m CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m @@ -460,41 +488,44 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AMD is not set # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_ASIX is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set -# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_CISCO is not set # CONFIG_NET_VENDOR_CORTINA is not set +# CONFIG_NET_VENDOR_DAVICOM is not set # CONFIG_NET_VENDOR_DEC is not set # CONFIG_NET_VENDOR_DLINK is not set # CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_FUNGIBLE is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y -# CONFIG_MLXFW is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set # CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set -# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set # CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_QUALCOMM is 
not set # CONFIG_NET_VENDOR_RDC is not set # CONFIG_NET_VENDOR_REALTEK is not set @@ -502,9 +533,9 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_ROCKER is not set # CONFIG_NET_VENDOR_SAMSUNG is not set # CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SILAN is not set # CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SMSC is not set # CONFIG_NET_VENDOR_SOCIONEXT is not set # CONFIG_NET_VENDOR_STMICRO is not set @@ -512,8 +543,11 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set # CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_XILINX is not set CONFIG_PPP=m CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m @@ -531,9 +565,9 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_NULL_TTY=m +# CONFIG_LEGACY_TIOCSTI is not set +CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m CONFIG_TN3270_FS=y CONFIG_PPS=m @@ -543,10 +577,12 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m +# CONFIG_DRM_DEBUG_MODESET_LOCK is not set CONFIG_FB=y +# CONFIG_FB_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -# CONFIG_HID is not set +# CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m @@ -555,13 +591,12 @@ CONFIG_MLX5_INFINIBAND=m CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_MLX5_VFIO_PCI=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y -CONFIG_S390_CCW_IOMMU=y -CONFIG_S390_AP_IOMMU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -583,6 +618,9 @@ CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_BTRFS_DEBUG=y CONFIG_BTRFS_ASSERT=y CONFIG_NILFS2_FS=m +CONFIG_BCACHEFS_FS=y +CONFIG_BCACHEFS_QUOTA=y +CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y @@ -594,12 +632,14 @@ CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QUOTA_DEBUG=y CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_NETFS_STATS=y +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y @@ -607,16 +647,19 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y CONFIG_HUGETLBFS=y -CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZ4=y CONFIG_SQUASHFS_LZO=y @@ -632,13 +675,12 @@ CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y CONFIG_CIFS=m -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_WEAK_PW_HASH=y CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y # CONFIG_CIFS_DEBUG is not set CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -649,23 +691,26 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y 
CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y +CONFIG_HARDENED_USERCOPY=y CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_LOCKDOWN_LSM=y CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y +CONFIG_SECURITY_LANDLOCK=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_INIT_STACK_NONE=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_PCRYPT=m @@ -673,42 +718,45 @@ CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_SM2=m +CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_GENERIC=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_VMAC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m @@ -719,41 +767,45 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_STATS=y -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_CHACHA_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y CONFIG_CORDIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_CRC32_SELFTEST=y CONFIG_CRC4=m CONFIG_CRC7=m CONFIG_CRC8=m CONFIG_RANDOM32_SELFTEST=y +CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 -CONFIG_DMA_API_DEBUG=y -CONFIG_STRING_SELFTEST=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y -CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_DWARF4=y 
CONFIG_GDB_SCRIPTS=y -CONFIG_FRAME_WARN=1024 CONFIG_HEADERS_INSTALL=y -CONFIG_HEADERS_CHECK=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_PAGEALLOC=y +CONFIG_SLUB_DEBUG_ON=y CONFIG_PAGE_OWNER=y CONFIG_DEBUG_RODATA_TEST=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_SELFTEST=y CONFIG_DEBUG_OBJECTS_FREE=y @@ -761,58 +813,77 @@ CONFIG_DEBUG_OBJECTS_TIMERS=y CONFIG_DEBUG_OBJECTS_WORK=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y -CONFIG_SLUB_DEBUG_ON=y -CONFIG_SLUB_STATS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_VM=y -CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_KFENCE=y +CONFIG_KFENCE_DEFERRABLE=y +CONFIG_KFENCE_STATIC_KEYS=y CONFIG_DEBUG_SHIRQ=y +CONFIG_PANIC_ON_OOPS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_WQ_WATCHDOG=y -CONFIG_PANIC_ON_OOPS=y -CONFIG_DEBUG_TIMEKEEPING=y +CONFIG_TEST_LOCKUP=m +CONFIG_DEBUG_PREEMPT=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y -CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y -CONFIG_DEBUG_CREDENTIALS=y CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=300 +# CONFIG_RCU_TRACE is not set +CONFIG_LATENCYTOP=y +CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FPROBE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +CONFIG_IRQSOFF_TRACER=y +CONFIG_PREEMPT_TRACER=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_USER_EVENTS=y +CONFIG_HIST_TRIGGERS=y +CONFIG_FTRACE_STARTUP_TEST=y +# CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m +CONFIG_DEBUG_ENTRY=y +CONFIG_CIO_INJECT=y +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y CONFIG_FAILSLAB=y CONFIG_FAIL_PAGE_ALLOC=y +CONFIG_FAULT_INJECTION_USERCOPY=y CONFIG_FAIL_MAKE_REQUEST=y CONFIG_FAIL_IO_TIMEOUT=y CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y +CONFIG_FAULT_INJECTION_CONFIGFS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y -CONFIG_LATENCYTOP=y -CONFIG_IRQSOFF_TRACER=y -CONFIG_PREEMPT_TRACER=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_HIST_TRIGGERS=y CONFIG_LKDTM=m -CONFIG_TEST_LIST_SORT=y -CONFIG_TEST_SORT=y -CONFIG_KPROBES_SANITY_TEST=y +CONFIG_TEST_MIN_HEAP=y +CONFIG_KPROBES_SANITY_TEST=m CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y +CONFIG_STRING_SELFTEST=y +CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y +CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 25f799849582..42b988873e54 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -1,8 +1,14 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y +CONFIG_WATCH_QUEUE=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_LSM=y +CONFIG_SCHED_CORE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y @@ -13,10 +19,8 @@ CONFIG_IKCONFIG=y 
CONFIG_IKCONFIG_PROC=y CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y @@ -26,53 +30,54 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_MISC=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y -CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set -CONFIG_BPF_SYSCALL=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DUMP=y CONFIG_LIVEPATCH=y -CONFIG_TUNE_ZEC12=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y -# CONFIG_NUMA_EMU is not set CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_SIG=y +CONFIG_CERT_STORE=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_CRASH_DUMP=y -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m -CONFIG_VHOST_NET=m -CONFIG_VHOST_VSOCK=m -CONFIG_OPROFILE=m +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y +# CONFIG_GCC_PLUGINS is not set CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG_SHA256=y -CONFIG_UNUSED_SYMBOLS=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_CGROUP_IOPRIO=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_BSD_DISKLABEL=y @@ -80,23 +85,23 @@ CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y +CONFIG_ZSMALLOC_STAT=y +# CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZBUD=m -CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y -CONFIG_GUP_BENCHMARK=y +CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -124,6 +129,7 @@ CONFIG_SYN_COOKIES=y CONFIG_NET_IPVTI=m CONFIG_INET_AH=m CONFIG_INET_ESP=m +CONFIG_INET_ESPINTCP=y CONFIG_INET_IPCOMP=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m @@ -138,6 +144,7 @@ CONFIG_TCP_CONG_ILLINOIS=m CONFIG_IPV6_ROUTER_PREF=y CONFIG_INET6_AH=m CONFIG_INET6_ESP=m +CONFIG_INET6_ESPINTCP=y CONFIG_INET6_IPCOMP=m CONFIG_IPV6_MIP6=m CONFIG_IPV6_VTI=m @@ -145,9 +152,14 @@ CONFIG_IPV6_SIT=m CONFIG_IPV6_GRE=m CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_MPTCP=y CONFIG_NETFILTER=y +CONFIG_BRIDGE_NETFILTER=m +CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_TIMEOUT=y CONFIG_NF_CONNTRACK_TIMESTAMP=y @@ -164,13 +176,16 @@ CONFIG_NF_CONNTRACK_TFTP=m CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_NETLINK_TIMEOUT=m 
CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m CONFIG_NFT_NAT=m +CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m CONFIG_NFT_HASH=m +CONFIG_NFT_FIB_INET=m +CONFIG_NETFILTER_XTABLES_COMPAT=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m @@ -259,10 +274,12 @@ CONFIG_IP_VS_DH=m CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m CONFIG_IP_VS_FTP=m CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y +CONFIG_NF_LOG_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -273,7 +290,6 @@ CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -281,7 +297,7 @@ CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_FIB_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m CONFIG_IP6_NF_MATCH_EUI64=m @@ -309,11 +325,11 @@ CONFIG_L2TP_DEBUGFS=m CONFIG_L2TP_V3=y CONFIG_L2TP_IP=m CONFIG_L2TP_ETH=m -CONFIG_BRIDGE=m +CONFIG_BRIDGE=y +CONFIG_BRIDGE_MRP=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m @@ -324,7 +340,6 @@ CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_DRR=m CONFIG_NET_SCH_MQPRIO=m @@ -334,15 +349,13 @@ CONFIG_NET_SCH_CODEL=m CONFIG_NET_SCH_FQ_CODEL=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m @@ -351,35 +364,37 @@ CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y CONFIG_OPENVSWITCH=m CONFIG_VSOCKETS=m CONFIG_VIRTIO_VSOCKETS=m CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y CONFIG_CGROUP_NET_PRIO=y -CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m -# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y +# CONFIG_PCIEASPM is not set +CONFIG_PCI_IOV=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_UEVENT_HELPER=y CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set CONFIG_CONNECTOR=y -CONFIG_ZRAM=m +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 -# CONFIG_BLK_DEV_XPRAM is not set CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_RBD=m CONFIG_BLK_DEV_NVME=m @@ -396,12 +411,12 @@ CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m CONFIG_SCSI_SRP_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y +CONFIG_ZFCP=m CONFIG_SCSI_VIRTIO=m CONFIG_SCSI_DH=y CONFIG_SCSI_DH_RDAC=m @@ -410,12 +425,13 @@ CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not 
set CONFIG_MD_LINEAR=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m -CONFIG_BLK_DEV_DM=m +CONFIG_BLK_DEV_DM=y CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m @@ -429,7 +445,10 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m CONFIG_DM_DELAY=m +CONFIG_DM_INIT=y CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m @@ -443,6 +462,9 @@ CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +CONFIG_VXLAN=m +CONFIG_BAREUDP=m +CONFIG_AMT=m CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m @@ -456,41 +478,44 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_AMD is not set # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_ASIX is not set # CONFIG_NET_VENDOR_ATHEROS is not set -# CONFIG_NET_VENDOR_AURORA is not set # CONFIG_NET_VENDOR_BROADCOM is not set -# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_CISCO is not set # CONFIG_NET_VENDOR_CORTINA is not set +# CONFIG_NET_VENDOR_DAVICOM is not set # CONFIG_NET_VENDOR_DEC is not set # CONFIG_NET_VENDOR_DLINK is not set # CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set # CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_FUNGIBLE is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_LITEX is not set # CONFIG_NET_VENDOR_MARVELL is not set CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y -# CONFIG_MLXFW is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set # CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set # CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NATSEMI is not set # CONFIG_NET_VENDOR_NETERION is not set # CONFIG_NET_VENDOR_NETRONOME is not set -# CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set # CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set # CONFIG_NET_VENDOR_QUALCOMM is not set # CONFIG_NET_VENDOR_RDC is not set # CONFIG_NET_VENDOR_REALTEK is not set @@ -498,9 +523,9 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_ROCKER is not set # CONFIG_NET_VENDOR_SAMSUNG is not set # CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SILAN is not set # CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set # CONFIG_NET_VENDOR_SMSC is not set # CONFIG_NET_VENDOR_SOCIONEXT is not set # CONFIG_NET_VENDOR_STMICRO is not set @@ -508,8 +533,11 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set # CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set # CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_XILINX is not set CONFIG_PPP=m CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m @@ -527,9 +555,9 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_NULL_TTY=m +# CONFIG_LEGACY_TIOCSTI is not set +CONFIG_VIRTIO_CONSOLE=m 
CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m CONFIG_TN3270_FS=y # CONFIG_PTP_1588_CLOCK is not set @@ -540,9 +568,10 @@ CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m CONFIG_FB=y +# CONFIG_FB_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -# CONFIG_HID is not set +# CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m @@ -551,13 +580,12 @@ CONFIG_MLX5_INFINIBAND=m CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_MLX5_VFIO_PCI=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y -CONFIG_S390_CCW_IOMMU=y -CONFIG_S390_AP_IOMMU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -576,6 +604,9 @@ CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m +CONFIG_BCACHEFS_FS=m +CONFIG_BCACHEFS_QUOTA=y +CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y @@ -586,12 +617,14 @@ CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_NETFS_STATS=y +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y @@ -599,16 +632,20 @@ CONFIG_ZISOFS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m CONFIG_NTFS_FS=m CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=m CONFIG_ECRYPT_FS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZ4=y CONFIG_SQUASHFS_LZO=y @@ -624,13 +661,12 @@ CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_V4_SECURITY_LABEL=y CONFIG_CIFS=m -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_WEAK_PW_HASH=y CONFIG_CIFS_UPCALL=y CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y # CONFIG_CIFS_DEBUG is not set CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -641,22 +677,24 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_LOCKDOWN_LSM=y CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y +CONFIG_SECURITY_LANDLOCK=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_INIT_STACK_NONE=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set @@ -665,43 +703,46 @@ CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m 
-CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_SM2=m +CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_FCRYPT=m CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_SM4_GENERIC=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_VMAC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m @@ -712,45 +753,66 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_STATS=y -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_SHA3_256_S390=m CONFIG_CRYPTO_SHA3_512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_CHACHA_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y CONFIG_CORDIC=m +CONFIG_PRIME_NUMBERS=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_CRC4=m CONFIG_CRC7=m CONFIG_CRC8=m +CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y +CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y -CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_PANIC_ON_OOPS=y +CONFIG_TEST_LOCKUP=m CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y +CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FPROBE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y +CONFIG_USER_EVENTS=y CONFIG_HIST_TRIGGERS=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_LKDTM=m +CONFIG_KPROBES_SANITY_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y +CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config new file mode 100644 index 000000000000..84c2b551e992 --- /dev/null +++ b/arch/s390/configs/kasan.config @@ -0,0 +1,4 @@ +# Help: Enable KASan for debugging +CONFIG_KASAN=y 
+CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 20c51e5d9353..30d2a1687665 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -1,38 +1,39 @@ -# CONFIG_SWAP is not set CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y # CONFIG_CPU_ISOLATION is not set # CONFIG_UTS_NS is not set +# CONFIG_TIME_NS is not set # CONFIG_PID_NS is not set # CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -# CONFIG_COMPAT_BRK is not set -CONFIG_TUNE_ZEC12=y -# CONFIG_COMPAT is not set +CONFIG_CRASH_DUMP=y +CONFIG_MARCH_Z13=y CONFIG_NR_CPUS=2 CONFIG_HZ_100=y -# CONFIG_ARCH_RANDOM is not set -# CONFIG_RELOCATABLE is not set # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set -CONFIG_CRASH_DUMP=y -# CONFIG_SECCOMP is not set # CONFIG_PFAULT is not set -# CONFIG_S390_HYPFS_FS is not set +# CONFIG_S390_HYPFS is not set # CONFIG_VIRTUALIZATION is not set # CONFIG_S390_GUEST is not set +# CONFIG_SECCOMP is not set +# CONFIG_GCC_PLUGINS is not set +# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set CONFIG_PARTITION_ADVANCED=y -CONFIG_IBM_PARTITION=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_SWAP is not set +# CONFIG_COMPAT_BRK is not set # CONFIG_COMPACTION is not set # CONFIG_MIGRATION is not set -# CONFIG_BOUNCE is not set CONFIG_NET=y # CONFIG_IUCV is not set +# CONFIG_PCPU_DEV_REFCNT is not set +# CONFIG_ETHTOOL_NETLINK is not set CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y CONFIG_BLK_DEV_RAM=y -# CONFIG_BLK_DEV_XPRAM is not set # CONFIG_DCSSBLK is not set # CONFIG_DASD is not set CONFIG_ENCLOSURE_SERVICES=y @@ -46,28 +47,34 @@ CONFIG_ZFCP=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set +# CONFIG_LEGACY_TIOCSTI is not set # CONFIG_HVC_IUCV is not set # CONFIG_HW_RANDOM_S390 is not set -CONFIG_RAW_DRIVER=y # CONFIG_HMC_DRV is not set # CONFIG_S390_TAPE is not set # CONFIG_VMCP is not set # CONFIG_MONWRITER is not set # CONFIG_S390_VMUR is not set -# CONFIG_HID is not set +# CONFIG_HID_SUPPORT is not set +# CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set # CONFIG_IOMMU_SUPPORT is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set -CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_LSM="yama,loadpin,safesetid,integrity" +CONFIG_INIT_STACK_NONE=y +# CONFIG_ZLIB_DFLTCC is not set +CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y +# CONFIG_SYMBOLIC_ERRNAME is not set CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y # CONFIG_SCHED_DEBUG is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig new file mode 100644 index 000000000000..06ee706b0d78 --- /dev/null +++ b/arch/s390/crypto/Kconfig @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Accelerated Cryptographic Algorithms for CPU (s390)" + +config CRYPTO_CRC32_S390 + tristate "CRC32c and CRC32" + depends on S390 + select CRYPTO_HASH + select CRC32 + help + CRC32c and CRC32 CRC algorithms + + Architecture: s390 + + It is available with IBM z13 or later. 
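The entry above only changes how the accelerated driver is built and selected; in-kernel consumers are unaffected, since the driver registers under the generic "crc32" and "crc32c" algorithm names and wins over the software implementation by priority. The following is a minimal, hypothetical sketch of such a consumer (a demo module, not code from this diff), assuming only the standard crypto shash API:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical demo (not part of this diff): one-shot CRC32c digest
 * through the generic shash API. With CONFIG_CRYPTO_CRC32_S390=y the
 * "crc32c" name resolves to the vector-accelerated s390 driver on
 * machines that support it. */
#include <linux/module.h>
#include <crypto/hash.h>

static int __init crc32c_demo_init(void)
{
	static const u8 data[] = "123456789";	/* standard CRC check vector */
	struct crypto_shash *tfm;
	u8 digest[4];				/* CRC32c digest is 4 bytes */
	int ret;

	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		ret = crypto_shash_digest(desc, data, sizeof(data) - 1,
					  digest);
	}
	if (!ret)
		pr_info("crc32c via %s: %*phN\n",
			crypto_shash_driver_name(tfm), 4, digest);

	crypto_free_shash(tfm);
	return ret;
}

static void __exit crc32c_demo_exit(void)
{
}

module_init(crc32c_demo_init);
module_exit(crc32c_demo_exit);
MODULE_DESCRIPTION("crc32c shash usage sketch");
MODULE_LICENSE("GPL");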
+ +config CRYPTO_SHA512_S390 + tristate "Hash functions: SHA-384 and SHA-512" + depends on S390 + select CRYPTO_HASH + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: s390 + + It is available as of z10. + +config CRYPTO_SHA1_S390 + tristate "Hash functions: SHA-1" + depends on S390 + select CRYPTO_HASH + help + SHA-1 secure hash algorithm (FIPS 180) + + Architecture: s390 + + It is available as of z990. + +config CRYPTO_SHA256_S390 + tristate "Hash functions: SHA-224 and SHA-256" + depends on S390 + select CRYPTO_HASH + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: s390 + + It is available as of z9. + +config CRYPTO_SHA3_256_S390 + tristate "Hash functions: SHA3-224 and SHA3-256" + depends on S390 + select CRYPTO_HASH + help + SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_SHA3_512_S390 + tristate "Hash functions: SHA3-384 and SHA3-512" + depends on S390 + select CRYPTO_HASH + help + SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_GHASH_S390 + tristate "Hash functions: GHASH" + depends on S390 + select CRYPTO_HASH + help + GCM GHASH hash function (NIST SP800-38D) + + Architecture: s390 + + It is available as of z196. + +config CRYPTO_AES_S390 + tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM" + depends on S390 + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + help + Block cipher: AES cipher algorithms (FIPS 197) + AEAD cipher: AES with GCM + Length-preserving ciphers: AES with ECB, CBC, XTS, and CTR modes + + Architecture: s390 + + As of z9 the ECB and CBC modes are hardware accelerated + for 128 bit keys. + + As of z10 the ECB and CBC modes are hardware accelerated + for all AES key sizes. + + As of z196 the CTR mode is hardware accelerated for all AES + key sizes and XTS mode is hardware accelerated for 256 and + 512 bit keys. + +config CRYPTO_DES_S390 + tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR" + depends on S390 + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + select CRYPTO_LIB_DES + help + Block ciphers: DES (FIPS 46-2) cipher algorithm + Block ciphers: Triple DES EDE (FIPS 46-3) cipher algorithm + Length-preserving ciphers: DES with ECB, CBC, and CTR modes + Length-preserving ciphers: Triple DES EDE with ECB, CBC, and CTR modes + + Architecture: s390 + + As of z990 the ECB and CBC modes are hardware accelerated. + As of z196 the CTR mode is hardware accelerated. + +config CRYPTO_CHACHA_S390 + tristate "Ciphers: ChaCha20" + depends on S390 + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving cipher: ChaCha20 stream cipher (RFC 7539) + + Architecture: s390 + + It is available as of z13.
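Because all of these entries register ordinary shash/skcipher drivers under the standard algorithm names, callers need no changes to pick up the acceleration: a request for "chacha20", for instance, is served by the vectorized chacha20-s390 driver added later in this diff when it is loaded, and by the generic C code otherwise. A minimal, hypothetical sketch of such a caller (a demo module, not code from this diff), assuming only the standard skcipher API:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical demo (not part of this diff): encrypt one buffer with
 * the "chacha20" skcipher. With CRYPTO_CHACHA_S390 enabled on z13 or
 * later this resolves to the higher-priority chacha20-s390 driver. */
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <crypto/skcipher.h>
#include <crypto/chacha.h>

static int __init chacha_demo_init(void)
{
	u8 key[CHACHA_KEY_SIZE] = {};		/* demo key: all zero */
	u8 iv[CHACHA_IV_SIZE] = {};		/* demo nonce + counter */
	struct skcipher_request *req = NULL;
	struct crypto_skcipher *tfm;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	u8 *buf;
	int ret;

	tfm = crypto_alloc_skcipher("chacha20", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* scatterlists need page-backed memory, so no stack buffers here */
	buf = kzalloc(256, GFP_KERNEL);
	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	ret = -ENOMEM;
	if (!buf || !req)
		goto out;

	ret = crypto_skcipher_setkey(tfm, key, sizeof(key));
	if (ret)
		goto out;

	sg_init_one(&sg, buf, 256);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, 256, iv);	/* in place */
	ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
	skcipher_request_free(req);
	kfree(buf);
	crypto_free_skcipher(tfm);
	return ret;
}

static void __exit chacha_demo_exit(void)
{
}

module_init(chacha_demo_init);
module_exit(chacha_demo_exit);
MODULE_DESCRIPTION("chacha20 skcipher usage sketch");
MODULE_LICENSE("GPL");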
+ +endmenu diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 12889d4652cc..1b1cc478fa94 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -11,9 +11,11 @@ obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o +obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o -obj-$(CONFIG_ARCH_RANDOM) += arch_random.o +obj-y += arch_random.o crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o +chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index ead0b2c9881d..c6fe5405de4a 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -21,6 +21,7 @@ #include <crypto/algapi.h> #include <crypto/ghash.h> #include <crypto/internal/aead.h> +#include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> @@ -72,19 +73,12 @@ static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - int ret; sctx->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; sctx->fallback.cip->base.crt_flags |= (tfm->crt_flags & CRYPTO_TFM_REQ_MASK); - ret = crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (sctx->fallback.cip->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } - return ret; + return crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); } static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, @@ -182,18 +176,13 @@ static int setkey_fallback_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); - int ret; crypto_skcipher_clear_flags(sctx->fallback.skcipher, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(sctx->fallback.skcipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - ret = crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(sctx->fallback.skcipher) & - CRYPTO_TFM_RES_MASK); - return ret; + return crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); } static int fallback_skcipher_crypt(struct s390_aes_ctx *sctx, @@ -354,6 +343,7 @@ static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier) memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(¶m, sizeof(param)); return ret; } @@ -389,17 +379,12 @@ static int xts_fallback_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); - int ret; crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(xts_ctx->fallback, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len); - crypto_skcipher_set_flags(tfm, - crypto_skcipher_get_flags(xts_ctx->fallback) & - CRYPTO_TFM_RES_MASK); - return ret; + return crypto_skcipher_setkey(xts_ctx->fallback, key, len); } static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, @@ -413,12 +398,6 @@ static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (err) 
return err; - /* In fips mode only 128 bit or 256 bit keys are valid */ - if (fips_enabled && key_len != 32 && key_len != 64) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - /* Pick the correct function code based on the key length */ fc = (key_len == 32) ? CPACF_KM_XTS_128 : (key_len == 64) ? CPACF_KM_XTS_256 : 0; @@ -489,6 +468,8 @@ static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier) walk.dst.virt.addr, walk.src.virt.addr, n); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(&pcc_param, sizeof(pcc_param)); + memzero_explicit(&xts_param, sizeof(xts_param)); return ret; } @@ -616,7 +597,9 @@ static int ctr_aes_crypt(struct skcipher_request *req) * final block may be < AES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { - cpacf_kmctr(sctx->fc, sctx->key, buf, walk.src.virt.addr, + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk.src.virt.addr, nbytes); + cpacf_kmctr(sctx->fc, sctx->key, buf, buf, AES_BLOCK_SIZE, walk.iv); memcpy(walk.dst.virt.addr, buf, nbytes); crypto_inc(walk.iv, AES_BLOCK_SIZE); @@ -716,7 +699,7 @@ static inline void _gcm_sg_unmap_and_advance(struct gcm_sg_walk *gw, unsigned int nbytes) { gw->walk_bytes_remain -= nbytes; - scatterwalk_unmap(&gw->walk); + scatterwalk_unmap(gw->walk_ptr); scatterwalk_advance(&gw->walk, nbytes); scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain); gw->walk_ptr = NULL; @@ -791,7 +774,7 @@ static int gcm_out_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) goto out; } - scatterwalk_unmap(&gw->walk); + scatterwalk_unmap(gw->walk_ptr); gw->walk_ptr = NULL; gw->ptr = gw->buf; @@ -1064,10 +1047,11 @@ out_err: return ret; } -module_cpu_feature_match(MSA, aes_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, aes_s390_init); module_exit(aes_s390_fini); MODULE_ALIAS_CRYPTO("aes-all"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(CRYPTO_INTERNAL); diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index dd95cdbd22ce..a8a2407381af 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -2,122 +2,18 @@ /* * s390 arch random implementation. * - * Copyright IBM Corp. 2017, 2018 + * Copyright IBM Corp. 2017, 2020 * Author(s): Harald Freudenberger - * - * The s390_arch_random_generate() function may be called from random.c - * in interrupt context. So this implementation does the best to be very - * fast. There is a buffer of random data which is asynchronously checked - * and filled by a workqueue thread. - * If there are enough bytes in the buffer the s390_arch_random_generate() - * just delivers these bytes. Otherwise false is returned until the - * worker thread refills the buffer. - * The worker fills the rng buffer by pulling fresh entropy from the - * high quality (but slow) true hardware random generator. This entropy - * is then spread over the buffer with an pseudo random generator PRNG. - * As the arch_get_random_seed_long() fetches 8 bytes and the calling - * function add_interrupt_randomness() counts this as 1 bit entropy the - * distribution needs to make sure there is in fact 1 bit entropy contained - * in 8 bytes of the buffer. The current values pull 32 byte entropy - * and scatter this into a 2048 byte buffer. So 8 byte in the buffer - * will contain 1 bit of entropy. - * The worker thread is rescheduled based on the charge level of the - * buffer but at least with 500 ms delay to avoid too much CPU consumption. - * So the max. 
amount of rng data delivered via arch_get_random_seed is - * limited to 4k bytes per second. */ #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/random.h> -#include <linux/slab.h> #include <linux/static_key.h> -#include <linux/workqueue.h> +#include <asm/archrandom.h> #include <asm/cpacf.h> DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0); EXPORT_SYMBOL(s390_arch_random_counter); - -#define ARCH_REFILL_TICKS (HZ/2) -#define ARCH_PRNG_SEED_SIZE 32 -#define ARCH_RNG_BUF_SIZE 2048 - -static DEFINE_SPINLOCK(arch_rng_lock); -static u8 *arch_rng_buf; -static unsigned int arch_rng_buf_idx; - -static void arch_rng_refill_buffer(struct work_struct *); -static DECLARE_DELAYED_WORK(arch_rng_work, arch_rng_refill_buffer); - -bool s390_arch_random_generate(u8 *buf, unsigned int nbytes) -{ - /* lock rng buffer */ - if (!spin_trylock(&arch_rng_lock)) - return false; - - /* try to resolve the requested amount of bytes from the buffer */ - arch_rng_buf_idx -= nbytes; - if (arch_rng_buf_idx < ARCH_RNG_BUF_SIZE) { - memcpy(buf, arch_rng_buf + arch_rng_buf_idx, nbytes); - atomic64_add(nbytes, &s390_arch_random_counter); - spin_unlock(&arch_rng_lock); - return true; - } - - /* not enough bytes in rng buffer, refill is done asynchronously */ - spin_unlock(&arch_rng_lock); - - return false; -} -EXPORT_SYMBOL(s390_arch_random_generate); - -static void arch_rng_refill_buffer(struct work_struct *unused) -{ - unsigned int delay = ARCH_REFILL_TICKS; - - spin_lock(&arch_rng_lock); - if (arch_rng_buf_idx > ARCH_RNG_BUF_SIZE) { - /* buffer is exhausted and needs refill */ - u8 seed[ARCH_PRNG_SEED_SIZE]; - u8 prng_wa[240]; - /* fetch ARCH_PRNG_SEED_SIZE bytes of entropy */ - cpacf_trng(NULL, 0, seed, sizeof(seed)); - /* blow this entropy up to ARCH_RNG_BUF_SIZE with PRNG */ - memset(prng_wa, 0, sizeof(prng_wa)); - cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_wa, NULL, 0, seed, sizeof(seed)); - cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, - &prng_wa, arch_rng_buf, ARCH_RNG_BUF_SIZE, NULL, 0); - arch_rng_buf_idx = ARCH_RNG_BUF_SIZE; - } - delay += (ARCH_REFILL_TICKS * arch_rng_buf_idx) / ARCH_RNG_BUF_SIZE; - spin_unlock(&arch_rng_lock); - - /* kick next check */ - queue_delayed_work(system_long_wq, &arch_rng_work, delay); -} - -static int __init s390_arch_random_init(void) -{ - /* all the needed PRNO subfunctions available ? */ - if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG) && - cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) { - - /* alloc arch random working buffer */ - arch_rng_buf = kmalloc(ARCH_RNG_BUF_SIZE, GFP_KERNEL); - if (!arch_rng_buf) - return -ENOMEM; - - /* kick worker queue job to fill the random buffer */ - queue_delayed_work(system_long_wq, - &arch_rng_work, ARCH_REFILL_TICKS); - - /* enable arch random to the outside world */ - static_branch_enable(&s390_arch_random_available); - } - - return 0; -} -arch_initcall(s390_arch_random_init); diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c new file mode 100644 index 000000000000..ed9959e6f714 --- /dev/null +++ b/arch/s390/crypto/chacha-glue.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 
2021 + */ + +#define KMSG_COMPONENT "chacha_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <crypto/internal/chacha.h> +#include <crypto/internal/skcipher.h> +#include <crypto/algapi.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <asm/fpu/api.h> +#include "chacha-s390.h" + +static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src, + unsigned int nbytes, const u32 *key, + u32 *counter) +{ + struct kernel_fpu vxstate; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, nbytes, key, counter); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static int chacha20_s390(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 state[CHACHA_STATE_WORDS] __aligned(16); + struct skcipher_walk walk; + unsigned int nbytes; + int rc; + + rc = skcipher_walk_virt(&walk, req, false); + chacha_init_generic(state, ctx->key, req->iv); + + while (walk.nbytes > 0) { + nbytes = walk.nbytes; + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + if (nbytes <= CHACHA_BLOCK_SIZE) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { + chacha20_crypt_s390(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + &state[4], &state[12]); + } + rc = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + return rc; +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + /* TODO: implement hchacha_block_arch() in assembly */ + hchacha_block_generic(state, stream, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) + chacha_crypt_generic(state, dst, src, bytes, nrounds); + else + chacha20_crypt_s390(state, dst, src, bytes, + &state[4], &state[12]); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static struct skcipher_alg chacha_algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-s390", + .base.cra_priority = 900, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha20_s390, + .decrypt = chacha20_s390, + } +}; + +static int __init chacha_mod_init(void) +{ + return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? 
+ crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0; +} + +static void __exit chacha_mod_fini(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) + crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)); +} + +module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init); +module_exit(chacha_mod_fini); + +MODULE_DESCRIPTION("ChaCha20 stream cipher"); +MODULE_LICENSE("GPL v2"); + +MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S new file mode 100644 index 000000000000..37cb63f25b17 --- /dev/null +++ b/arch/s390/crypto/chacha-s390.S @@ -0,0 +1,908 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Original implementation written by Andy Polyakov, @dot-asm. + * This is an adaptation of the original code for kernel use. + * + * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. + */ + +#include <linux/linkage.h> +#include <asm/nospec-insn.h> +#include <asm/vx-insn.h> + +#define SP %r15 +#define FRAME (16 * 8 + 4 * 8) + + .data + .balign 32 + +SYM_DATA_START_LOCAL(sigma) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 1,0,0,0 + .long 2,0,0,0 + .long 3,0,0,0 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap + + .long 0,1,2,3 + .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 +SYM_DATA_END(sigma) + + .previous + + GEN_BR_THUNK %r14 + + .text + +############################################################################# +# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 +#define CTR %v26 + +#define K0 %v16 +#define K1 %v17 +#define K2 %v18 +#define K3 %v19 + +#define XA0 %v0 +#define XA1 %v1 +#define XA2 %v2 +#define XA3 %v3 + +#define XB0 %v4 +#define XB1 %v5 +#define XB2 %v6 +#define XB3 %v7 + +#define XC0 %v8 +#define XC1 %v9 +#define XC2 %v10 +#define XC3 %v11 + +#define XD0 %v12 +#define XD1 %v13 +#define XD2 %v14 +#define XD3 %v15 + +#define XT0 %v27 +#define XT1 %v28 +#define XT2 %v29 +#define XT3 %v30 + +SYM_FUNC_START(chacha20_vx_4x) + stmg %r6,%r7,6*8(SP) + + larl %r7,sigma + lhi %r0,10 + lhi %r1,0 + + VL K0,0,,%r7 # load sigma + VL K1,0,,KEY # load key + VL K2,16,,KEY + VL K3,0,,COUNTER # load counter + + VL BEPERM,0x40,,%r7 + VL CTR,0x50,,%r7 + + VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma + + VREPF XB0,K1,0 # smash the key + VREPF XB1,K1,1 + VREPF XB2,K1,2 + VREPF XB3,K1,3 + + VREPF XD0,K3,0 + VREPF XD1,K3,1 + VREPF XD2,K3,2 + VREPF XD3,K3,3 + VAF XD0,XD0,CTR + + VREPF XC0,K2,0 + VREPF XC1,K2,1 + VREPF XC2,K2,2 + VREPF XC3,K2,3 + +.Loop_4x: + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,16 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,16 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,16 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,16 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,12 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,12 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,12 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,12 + + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,8 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,8 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,8 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + 
VERLLF XD3,XD3,8 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,7 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,7 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,7 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,7 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,16 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,16 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,16 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,16 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,12 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,12 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,12 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,12 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,8 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,8 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,8 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,8 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,7 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,7 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,7 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,7 + brct %r0,.Loop_4x + + VAF XD0,XD0,CTR + + VMRHF XT0,XA0,XA1 # transpose data + VMRHF XT1,XA2,XA3 + VMRLF XT2,XA0,XA1 + VMRLF XT3,XA2,XA3 + VPDI XA0,XT0,XT1,0b0000 + VPDI XA1,XT0,XT1,0b0101 + VPDI XA2,XT2,XT3,0b0000 + VPDI XA3,XT2,XT3,0b0101 + + VMRHF XT0,XB0,XB1 + VMRHF XT1,XB2,XB3 + VMRLF XT2,XB0,XB1 + VMRLF XT3,XB2,XB3 + VPDI XB0,XT0,XT1,0b0000 + VPDI XB1,XT0,XT1,0b0101 + VPDI XB2,XT2,XT3,0b0000 + VPDI XB3,XT2,XT3,0b0101 + + VMRHF XT0,XC0,XC1 + VMRHF XT1,XC2,XC3 + VMRLF XT2,XC0,XC1 + VMRLF XT3,XC2,XC3 + VPDI XC0,XT0,XT1,0b0000 + VPDI XC1,XT0,XT1,0b0101 + VPDI XC2,XT2,XT3,0b0000 + VPDI XC3,XT2,XT3,0b0101 + + VMRHF XT0,XD0,XD1 + VMRHF XT1,XD2,XD3 + VMRLF XT2,XD0,XD1 + VMRLF XT3,XD2,XD3 + VPDI XD0,XT0,XT1,0b0000 + VPDI XD1,XT0,XT1,0b0101 + VPDI XD2,XT2,XT3,0b0000 + VPDI XD3,XT2,XT3,0b0101 + + VAF XA0,XA0,K0 + VAF XB0,XB0,K1 + VAF XC0,XC0,K2 + VAF XD0,XD0,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + + VAF XA0,XA1,K0 + VAF XB0,XB1,K1 + VAF XC0,XC1,K2 + VAF XD0,XD1,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA2,K0 + VAF XB0,XB2,K1 + VAF XC0,XC2,K2 + VAF XD0,XD2,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA3,K0 + VAF XB0,XB3,K1 + VAF XC0,XC3,K2 + VAF XD0,XD3,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + +.Ldone_4x: + lmg %r6,%r7,6*8(SP) + BR_EX %r14 + +.Ltail_4x: + VLR XT0,XC0 + VLR XT1,XD0 + + VST 
XA0,8*8+0x00,,SP + VST XB0,8*8+0x10,,SP + VST XT0,8*8+0x20,,SP + VST XT1,8*8+0x30,,SP + + lghi %r1,0 + +.Loop_tail_4x: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_4x + + lmg %r6,%r7,6*8(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx_4x) + +#undef OUT +#undef INP +#undef LEN +#undef KEY +#undef COUNTER + +#undef BEPERM + +#undef K0 +#undef K1 +#undef K2 +#undef K3 + + +############################################################################# +# void chacha20_vx(u8 *out, const u8 *inp, size_t len, +# const u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 + +#define K0 %v27 +#define K1 %v24 +#define K2 %v25 +#define K3 %v26 + +#define A0 %v0 +#define B0 %v1 +#define C0 %v2 +#define D0 %v3 + +#define A1 %v4 +#define B1 %v5 +#define C1 %v6 +#define D1 %v7 + +#define A2 %v8 +#define B2 %v9 +#define C2 %v10 +#define D2 %v11 + +#define A3 %v12 +#define B3 %v13 +#define C3 %v14 +#define D3 %v15 + +#define A4 %v16 +#define B4 %v17 +#define C4 %v18 +#define D4 %v19 + +#define A5 %v20 +#define B5 %v21 +#define C5 %v22 +#define D5 %v23 + +#define T0 %v27 +#define T1 %v28 +#define T2 %v29 +#define T3 %v30 + +SYM_FUNC_START(chacha20_vx) + clgfi LEN,256 + jle chacha20_vx_4x + stmg %r6,%r7,6*8(SP) + + lghi %r1,-FRAME + lgr %r0,SP + la SP,0(%r1,SP) + stg %r0,0(SP) # back-chain + + larl %r7,sigma + lhi %r0,10 + + VLM K1,K2,0,KEY,0 # load key + VL K3,0,,COUNTER # load counter + + VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... + +.Loop_outer_vx: + VLR A0,K0 + VLR B0,K1 + VLR A1,K0 + VLR B1,K1 + VLR A2,K0 + VLR B2,K1 + VLR A3,K0 + VLR B3,K1 + VLR A4,K0 + VLR B4,K1 + VLR A5,K0 + VLR B5,K1 + + VLR D0,K3 + VAF D1,K3,T1 # K[3]+1 + VAF D2,K3,T2 # K[3]+2 + VAF D3,K3,T3 # K[3]+3 + VAF D4,D2,T2 # K[3]+4 + VAF D5,D2,T3 # K[3]+5 + + VLR C0,K2 + VLR C1,K2 + VLR C2,K2 + VLR C3,K2 + VLR C4,K2 + VLR C5,K2 + + VLR T1,D1 + VLR T2,D2 + VLR T3,D3 + +.Loop_vx: + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,4 + VSLDB B1,B1,B1,4 + VSLDB B2,B2,B2,4 + VSLDB B3,B3,B3,4 + VSLDB B4,B4,B4,4 + VSLDB B5,B5,B5,4 + VSLDB D0,D0,D0,12 + VSLDB D1,D1,D1,12 + VSLDB D2,D2,D2,12 + VSLDB D3,D3,D3,12 + VSLDB D4,D4,D4,12 + VSLDB D5,D5,D5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF 
A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,12 + VSLDB B1,B1,B1,12 + VSLDB B2,B2,B2,12 + VSLDB B3,B3,B3,12 + VSLDB B4,B4,B4,12 + VSLDB B5,B5,B5,12 + VSLDB D0,D0,D0,4 + VSLDB D1,D1,D1,4 + VSLDB D2,D2,D2,4 + VSLDB D3,D3,D3,4 + VSLDB D4,D4,D4,4 + VSLDB D5,D5,D5,4 + brct %r0,.Loop_vx + + VAF A0,A0,K0 + VAF B0,B0,K1 + VAF C0,C0,K2 + VAF D0,D0,K3 + VAF A1,A1,K0 + VAF D1,D1,T1 # +K[3]+1 + + VPERM A0,A0,A0,BEPERM + VPERM B0,B0,B0,BEPERM + VPERM C0,C0,C0,BEPERM + VPERM D0,D0,D0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D2,D2,T2 # +K[3]+2 + VAF D3,D3,T3 # +K[3]+3 + VLM T0,T3,0,INP,0 + + VX A0,A0,T0 + VX B0,B0,T1 + VX C0,C0,T2 + VX D0,D0,T3 + + VLM K0,T3,0,%r7,4 # re-load sigma and increments + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF B1,B1,K1 + VAF C1,C1,K2 + + VPERM A0,A1,A1,BEPERM + VPERM B0,B1,B1,BEPERM + VPERM C0,C1,C1,BEPERM + VPERM D0,D1,D1,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A2,A2,K0 + VAF B2,B2,K1 + VAF C2,C2,K2 + + VPERM A0,A2,A2,BEPERM + VPERM B0,B2,B2,BEPERM + VPERM C0,C2,C2,BEPERM + VPERM D0,D2,D2,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A3,A3,K0 + VAF B3,B3,K1 + VAF C3,C3,K2 + VAF D2,K3,T3 # K[3]+3 + + VPERM A0,A3,A3,BEPERM + VPERM B0,B3,B3,BEPERM + VPERM C0,C3,C3,BEPERM + VPERM D0,D3,D3,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D3,D2,T1 # K[3]+4 + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A4,A4,K0 + VAF B4,B4,K1 + VAF C4,C4,K2 + VAF D4,D4,D3 # +K[3]+4 + VAF D3,D3,T1 # K[3]+5 + VAF K3,D2,T3 # K[3]+=6 + + VPERM A0,A4,A4,BEPERM + VPERM B0,B4,B4,BEPERM + VPERM C0,C4,C4,BEPERM + VPERM D0,D4,D4,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A5,A5,K0 + VAF B5,B5,K1 + VAF C5,C5,K2 + VAF D5,D5,D3 # +K[3]+5 + + 
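Between the two half-rounds above, the VSLDB (VECTOR SHIFT LEFT DOUBLE BY BYTE) blocks rotate the b, c and d rows of each 4x4 state left by one, two and three 32-bit lanes respectively, which turns the column quarter-round into the diagonal quarter-round; the second VSLDB block rotates the rows back. A scalar C sketch of that lane rotation (illustrative helper, not from the patch):

#include <stdint.h>
#include <string.h>

/* Rotate a four-lane state row left by n lanes (VSLDB by 4*n bytes) */
static void rotate_row(uint32_t row[4], int n)
{
	uint32_t tmp[4];
	int i;

	for (i = 0; i < 4; i++)
		tmp[i] = row[(i + n) & 3];
	memcpy(row, tmp, sizeof(tmp));
}

/*
 * Column half-round; then rotate_row(b, 1), rotate_row(c, 2),
 * rotate_row(d, 3); diagonal half-round; then rotate back with
 * rotate_row(b, 3), rotate_row(c, 2), rotate_row(d, 1).
 */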
VPERM A0,A5,A5,BEPERM + VPERM B0,B5,B5,BEPERM + VPERM C0,C5,C5,BEPERM + VPERM D0,D5,D5,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + lhi %r0,10 + aghi LEN,-0x40 + jne .Loop_outer_vx + +.Ldone_vx: + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 + +.Ltail_vx: + VSTM A0,D0,8*8,SP,3 + lghi %r1,0 + +.Loop_tail_vx: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_vx + + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx) + +.previous diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/crypto/chacha-s390.h new file mode 100644 index 000000000000..733744ce30f5 --- /dev/null +++ b/arch/s390/crypto/chacha-s390.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 2021 + */ + +#ifndef _CHACHA_S390_H +#define _CHACHA_S390_H + +void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, + const u32 *counter); + +#endif /* _CHACHA_S390_H */ diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c index 423ee05887e6..017143e9cef7 100644 --- a/arch/s390/crypto/crc32-vx.c +++ b/arch/s390/crypto/crc32-vx.c @@ -111,10 +111,8 @@ static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, { struct crc_ctx *mctx = crypto_shash_ctx(tfm); - if (newkeylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (newkeylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = le32_to_cpu(*(__le32 *)newkey); return 0; } @@ -124,10 +122,8 @@ static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, { struct crc_ctx *mctx = crypto_shash_ctx(tfm); - if (newkeylen != sizeof(mctx->key)) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (newkeylen != sizeof(mctx->key)) return -EINVAL; - } mctx->key = be32_to_cpu(*(__be32 *)newkey); return 0; } @@ -302,7 +298,7 @@ static void __exit crc_vx_mod_exit(void) crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs)); } -module_cpu_feature_match(VXRS, crc_vx_mod_init); +module_cpu_feature_match(S390_CPU_FEATURE_VXRS, crc_vx_mod_init); module_exit(crc_vx_mod_exit); MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>"); diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S index 0099044e2c86..34ee47926891 100644 --- a/arch/s390/crypto/crc32be-vx.S +++ b/arch/s390/crypto/crc32be-vx.S @@ -24,15 +24,15 @@ #define CONST_RU_POLY %v13 #define CONST_CRC_POLY %v14 -.data -.align 8 + .data + .balign 8 /* * The CRC-32 constant block contains reduction constants to fold and * process particular chunks of the input data stream in parallel. * * For the CRC-32 variants, the constants are precomputed according to - * these defintions: + * these definitions: * * R1 = x4*128+64 mod P(x) * R2 = x4*128 mod P(x) @@ -48,7 +48,7 @@ * * Note that the constant definitions below are extended in order to compute * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. - * The righmost doubleword can be 0 to prevent contribution to the result or + * The rightmost doubleword can be 0 to prevent contribution to the result or * can be multiplied by 1 to perform an XOR without the need for a separate * VECTOR EXCLUSIVE OR instruction. 
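The reduction constants described above feed VECTOR GALOIS FIELD MULTIPLY (SUM), i.e. carry-less multiplication of polynomials over GF(2). For reference, a bitwise scalar equivalent of that multiply, which makes the folding step "rem = clmul(hi, R1) ^ clmul(lo, R2)" concrete (sketch only, not part of the patch):

#include <stdint.h>

/* Carry-less (GF(2)[x]) multiply of two 32-bit polynomials */
static uint64_t clmul32(uint32_t a, uint32_t b)
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 32; i++)
		if (b & (1U << i))
			r ^= (uint64_t)a << i;
	return r;
}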
* @@ -58,19 +58,20 @@ * P'(x) = 0xEDB88320 */ -.Lconstants_CRC_32_BE: +SYM_DATA_START_LOCAL(constants_CRC_32_BE) .quad 0x08833794c, 0x0e6228b11 # R1, R2 .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4 .quad 0x0f200aa66, 1 << 32 # R5, x32 .quad 0x0490d678d, 1 # R6, 1 .quad 0x104d101df, 0 # u .quad 0x104C11DB7, 0 # P(x) +SYM_DATA_END(constants_CRC_32_BE) -.previous + .previous GEN_BR_THUNK %r14 -.text + .text /* * The CRC-32 function(s) use these calling conventions: * @@ -90,9 +91,9 @@ * * V9..V14: CRC-32 constants. */ -ENTRY(crc32_be_vgfm_16) +SYM_FUNC_START(crc32_be_vgfm_16) /* Load CRC-32 constants */ - larl %r5,.Lconstants_CRC_32_BE + larl %r5,constants_CRC_32_BE VLM CONST_R1R2,CONST_CRC_POLY,0,%r5 /* Load the initial CRC value into the leftmost word of V0. */ @@ -189,7 +190,7 @@ ENTRY(crc32_be_vgfm_16) * Note: To compensate the division by x^32, use the vector unpack * instruction to move the leftmost word into the leftmost doubleword * of the vector register. The rightmost doubleword is multiplied - * with zero to not contribute to the intermedate results. + * with zero to not contribute to the intermediate results. */ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ @@ -207,6 +208,6 @@ ENTRY(crc32_be_vgfm_16) .Ldone: VLGVF %r2,%v2,3 BR_EX %r14 -ENDPROC(crc32_be_vgfm_16) +SYM_FUNC_END(crc32_be_vgfm_16) .previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S index 71caf0f4ec08..5a819ae09a0b 100644 --- a/arch/s390/crypto/crc32le-vx.S +++ b/arch/s390/crypto/crc32le-vx.S @@ -25,8 +25,8 @@ #define CONST_RU_POLY %v13 #define CONST_CRC_POLY %v14 -.data -.align 8 + .data + .balign 8 /* * The CRC-32 constant block contains reduction constants to fold and @@ -59,27 +59,29 @@ * P'(x) = 0x82F63B78 */ -.Lconstants_CRC_32_LE: +SYM_DATA_START_LOCAL(constants_CRC_32_LE) .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask .quad 0x1c6e41596, 0x154442bd4 # R2, R1 .quad 0x0ccaa009e, 0x1751997d0 # R4, R3 .octa 0x163cd6124 # R5 .octa 0x1F7011641 # u' .octa 0x1DB710641 # P'(x) << 1 +SYM_DATA_END(constants_CRC_32_LE) -.Lconstants_CRC_32C_LE: +SYM_DATA_START_LOCAL(constants_CRC_32C_LE) .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask .quad 0x09e4addf8, 0x740eef02 # R2, R1 .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3 .octa 0x0dd45aab8 # R5 .octa 0x0dea713f1 # u' .octa 0x105ec76f0 # P'(x) << 1 +SYM_DATA_END(constants_CRC_32C_LE) -.previous + .previous GEN_BR_THUNK %r14 -.text + .text /* * The CRC-32 functions use these calling conventions: @@ -102,17 +104,17 @@ * V10..V14: CRC-32 constants. */ -ENTRY(crc32_le_vgfm_16) - larl %r5,.Lconstants_CRC_32_LE +SYM_FUNC_START(crc32_le_vgfm_16) + larl %r5,constants_CRC_32_LE j crc32_le_vgfm_generic -ENDPROC(crc32_le_vgfm_16) +SYM_FUNC_END(crc32_le_vgfm_16) -ENTRY(crc32c_le_vgfm_16) - larl %r5,.Lconstants_CRC_32C_LE +SYM_FUNC_START(crc32c_le_vgfm_16) + larl %r5,constants_CRC_32C_LE j crc32_le_vgfm_generic -ENDPROC(crc32c_le_vgfm_16) +SYM_FUNC_END(crc32c_le_vgfm_16) -ENTRY(crc32_le_vgfm_generic) +SYM_FUNC_START(crc32_le_vgfm_generic) /* Load CRC-32 constants */ VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 @@ -268,6 +270,6 @@ ENTRY(crc32_le_vgfm_generic) .Ldone: VLGVF %r2,%v2,2 BR_EX %r14 -ENDPROC(crc32_le_vgfm_generic) +SYM_FUNC_END(crc32_le_vgfm_generic) .previous diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index bfbafd35bcbd..8e75b83a5ddc 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -194,7 +194,7 @@ static struct skcipher_alg cbc_des_alg = { * same as DES. 
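The uniqueness requirement this comment goes on to describe can be sketched with crypto_memneq() (hypothetical helper name; the in-tree code relies on the common DES key verification helpers):

#include <crypto/algapi.h>	/* crypto_memneq() */
#include <crypto/des.h>		/* DES_KEY_SIZE */

/* Reject 3DES keys where any two of K1, K2, K3 are identical */
static int des3_keys_unique(const u8 *key)
{
	if (!crypto_memneq(key, key + DES_KEY_SIZE, DES_KEY_SIZE) ||
	    !crypto_memneq(key + DES_KEY_SIZE, key + 2 * DES_KEY_SIZE,
			   DES_KEY_SIZE) ||
	    !crypto_memneq(key, key + 2 * DES_KEY_SIZE, DES_KEY_SIZE))
		return -EINVAL;
	return 0;
}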
Implementers MUST reject keys that exhibit this * property. * - * In fips mode additinally check for all 3 keys are unique. + * In fips mode additionally check for all 3 keys are unique. * */ static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, @@ -492,7 +492,7 @@ out_err: return ret; } -module_cpu_feature_match(MSA, des_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init); module_exit(des_s390_exit); MODULE_ALIAS_CRYPTO("des"); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index a3e7400e031c..0800a2a5799f 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -43,10 +43,8 @@ static int ghash_setkey(struct crypto_shash *tfm, { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } memcpy(ctx->key, key, GHASH_BLOCK_SIZE); @@ -147,7 +145,7 @@ static void __exit ghash_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_cpu_feature_match(MSA, ghash_mod_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init); module_exit(ghash_mod_exit); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index c7119c617b6e..55ee5567a5ea 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -5,7 +5,7 @@ * s390 implementation of the AES Cipher Algorithm with protected keys. * * s390 Version: - * Copyright IBM Corp. 2017,2019 + * Copyright IBM Corp. 2017, 2023 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * Harald Freudenberger <freude@de.ibm.com> */ @@ -20,7 +20,9 @@ #include <linux/module.h> #include <linux/cpufeature.h> #include <linux/init.h> +#include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/delay.h> #include <crypto/internal/skcipher.h> #include <crypto/xts.h> #include <asm/cpacf.h> @@ -32,11 +34,11 @@ * is called. As paes can handle different kinds of key blobs * and padding is also possible, the limits need to be generous. 
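As a concrete illustration of the key-blob handling described above, the rework below also accepts raw 16/24/32-byte AES clear keys by wrapping them into a pkey clear-key token. The header built in front of the raw key looks like this (field comments inferred from the hunk, not authoritative):

struct clearkey_header {
	u8  type;
	u8  res0[3];
	u8  version;	/* 0x02 == TOKVER_CLEAR_KEY */
	u8  res1[3];
	u32 keytype;	/* (keylen - 8) >> 3: 1, 2 or 3 for AES 128/192/256 */
	u32 len;	/* clear key length in bytes */
} __packed;		/* the raw key value follows the header */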
*/ -#define PAES_MIN_KEYSIZE 64 -#define PAES_MAX_KEYSIZE 256 +#define PAES_MIN_KEYSIZE 16 +#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE static u8 *ctrblk; -static DEFINE_SPINLOCK(ctrblk_lock); +static DEFINE_MUTEX(ctrblk_lock); static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; @@ -53,19 +55,46 @@ struct key_blob { unsigned int keylen; }; -static inline int _copy_key_to_kb(struct key_blob *kb, - const u8 *key, - unsigned int keylen) -{ - if (keylen <= sizeof(kb->keybuf)) +static inline int _key_to_kb(struct key_blob *kb, + const u8 *key, + unsigned int keylen) +{ + struct clearkey_header { + u8 type; + u8 res0[3]; + u8 version; + u8 res1[3]; + u32 keytype; + u32 len; + } __packed * h; + + switch (keylen) { + case 16: + case 24: + case 32: + /* clear key value, prepare pkey clear key token in keybuf */ + memset(kb->keybuf, 0, sizeof(kb->keybuf)); + h = (struct clearkey_header *) kb->keybuf; + h->version = 0x02; /* TOKVER_CLEAR_KEY */ + h->keytype = (keylen - 8) >> 3; + h->len = keylen; + memcpy(kb->keybuf + sizeof(*h), key, keylen); + kb->keylen = sizeof(*h) + keylen; kb->key = kb->keybuf; - else { - kb->key = kmalloc(keylen, GFP_KERNEL); - if (!kb->key) - return -ENOMEM; + break; + default: + /* other key material, let pkey handle this */ + if (keylen <= sizeof(kb->keybuf)) + kb->key = kb->keybuf; + else { + kb->key = kmalloc(keylen, GFP_KERNEL); + if (!kb->key) + return -ENOMEM; + } + memcpy(kb->key, key, keylen); + kb->keylen = keylen; + break; } - memcpy(kb->key, key, keylen); - kb->keylen = keylen; return 0; } @@ -74,7 +103,7 @@ static inline void _free_kb_keybuf(struct key_blob *kb) { if (kb->key && kb->key != kb->keybuf && kb->keylen > sizeof(kb->keybuf)) { - kfree(kb->key); + kfree_sensitive(kb->key); kb->key = NULL; } } @@ -82,23 +111,29 @@ static inline void _free_kb_keybuf(struct key_blob *kb) struct s390_paes_ctx { struct key_blob kb; struct pkey_protkey pk; + spinlock_t pk_lock; unsigned long fc; }; struct s390_pxts_ctx { struct key_blob kb[2]; struct pkey_protkey pk[2]; + spinlock_t pk_lock; unsigned long fc; }; -static inline int __paes_convert_key(struct key_blob *kb, +static inline int __paes_keyblob2pkey(struct key_blob *kb, struct pkey_protkey *pk) { int i, ret; /* try three times in case of failure */ for (i = 0; i < 3; i++) { - ret = pkey_keyblob2pkey(kb->key, kb->keylen, pk); + if (i > 0 && ret == -EAGAIN && in_task()) + if (msleep_interruptible(1000)) + return -EINTR; + ret = pkey_keyblob2pkey(kb->key, kb->keylen, + pk->protkey, &pk->len, &pk->type); if (ret == 0) break; } @@ -106,22 +141,21 @@ static inline int __paes_convert_key(struct key_blob *kb, return ret; } -static int __paes_set_key(struct s390_paes_ctx *ctx) +static inline int __paes_convert_key(struct s390_paes_ctx *ctx) { - unsigned long fc; - - if (__paes_convert_key(&ctx->kb, &ctx->pk)) - return -EINVAL; + int ret; + struct pkey_protkey pkey; - /* Pick the correct function code based on the protected key type */ - fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 : - (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 : - (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0; + pkey.len = sizeof(pkey.protkey); + ret = __paes_keyblob2pkey(&ctx->kb, &pkey); + if (ret) + return ret; - /* Check if the function code is available */ - ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + spin_lock_bh(&ctx->pk_lock); + memcpy(&ctx->pk, &pkey, sizeof(pkey)); + spin_unlock_bh(&ctx->pk_lock); - return ctx->fc ? 
0 : -EINVAL; + return 0; } static int ecb_paes_init(struct crypto_skcipher *tfm) @@ -129,6 +163,7 @@ static int ecb_paes_init(struct crypto_skcipher *tfm) struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -140,6 +175,26 @@ static void ecb_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb); } +static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx) +{ + int rc; + unsigned long fc; + + rc = __paes_convert_key(ctx); + if (rc) + return rc; + + /* Pick the correct function code based on the protected key type */ + fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 : + (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 : + (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0; + + /* Check if the function code is available */ + ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + + return ctx->fc ? 0 : -EINVAL; +} + static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -147,15 +202,11 @@ static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); - rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); + rc = _key_to_kb(&ctx->kb, in_key, key_len); if (rc) return rc; - if (__paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __ecb_paes_set_key(ctx); } static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) @@ -165,18 +216,31 @@ static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) struct skcipher_walk walk; unsigned int nbytes, n, k; int ret; + struct { + u8 key[MAXPROTKEYSIZE]; + } param; ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - k = cpacf_km(ctx->fc | modifier, ctx->pk.protkey, + k = cpacf_km(ctx->fc | modifier, ¶m, walk.dst.virt.addr, walk.src.virt.addr, n); if (k) ret = skcipher_walk_done(&walk, nbytes - k); if (k < n) { - if (__paes_set_key(ctx) != 0) + if (__paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } } return ret; @@ -214,6 +278,7 @@ static int cbc_paes_init(struct crypto_skcipher *tfm) struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -225,12 +290,14 @@ static void cbc_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb); } -static int __cbc_paes_set_key(struct s390_paes_ctx *ctx) +static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx) { + int rc; unsigned long fc; - if (__paes_convert_key(&ctx->kb, &ctx->pk)) - return -EINVAL; + rc = __paes_convert_key(ctx); + if (rc) + return rc; /* Pick the correct function code based on the protected key type */ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? 
CPACF_KMC_PAES_128 : @@ -250,15 +317,11 @@ static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); - rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); + rc = _key_to_kb(&ctx->kb, in_key, key_len); if (rc) return rc; - if (__cbc_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __cbc_paes_set_key(ctx); } static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) @@ -276,8 +339,12 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) ret = skcipher_walk_virt(&walk, req, false); if (ret) return ret; + memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); + spin_lock_bh(&ctx->pk_lock); memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + while ((nbytes = walk.nbytes) != 0) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); @@ -288,9 +355,11 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) ret = skcipher_walk_done(&walk, nbytes - k); } if (k < n) { - if (__cbc_paes_set_key(ctx) != 0) + if (__paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } } return ret; @@ -330,6 +399,7 @@ static int xts_paes_init(struct crypto_skcipher *tfm) ctx->kb[0].key = NULL; ctx->kb[1].key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -342,12 +412,30 @@ static void xts_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb[1]); } -static int __xts_paes_set_key(struct s390_pxts_ctx *ctx) +static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx) +{ + struct pkey_protkey pkey0, pkey1; + + pkey0.len = sizeof(pkey0.protkey); + pkey1.len = sizeof(pkey1.protkey); + + if (__paes_keyblob2pkey(&ctx->kb[0], &pkey0) || + __paes_keyblob2pkey(&ctx->kb[1], &pkey1)) + return -EINVAL; + + spin_lock_bh(&ctx->pk_lock); + memcpy(&ctx->pk[0], &pkey0, sizeof(pkey0)); + memcpy(&ctx->pk[1], &pkey1, sizeof(pkey1)); + spin_unlock_bh(&ctx->pk_lock); + + return 0; +} + +static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx) { unsigned long fc; - if (__paes_convert_key(&ctx->kb[0], &ctx->pk[0]) || - __paes_convert_key(&ctx->kb[1], &ctx->pk[1])) + if (__xts_paes_convert_key(ctx)) return -EINVAL; if (ctx->pk[0].type != ctx->pk[1].type) @@ -379,20 +467,19 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, _free_kb_keybuf(&ctx->kb[0]); _free_kb_keybuf(&ctx->kb[1]); - rc = _copy_key_to_kb(&ctx->kb[0], in_key, key_len); + rc = _key_to_kb(&ctx->kb[0], in_key, key_len); if (rc) return rc; - rc = _copy_key_to_kb(&ctx->kb[1], in_key + key_len, key_len); + rc = _key_to_kb(&ctx->kb[1], in_key + key_len, key_len); if (rc) return rc; - if (__xts_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } + rc = __xts_paes_set_key(ctx); + if (rc) + return rc; /* - * xts_check_key verifies the key length is not odd and makes + * xts_verify_key verifies the key length is not odd and makes * sure that the two keys are not the same. This can be done * on the two protected keys as well */ @@ -425,15 +512,17 @@ static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) ret = skcipher_walk_virt(&walk, req, false); if (ret) return ret; + keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 
48 : 64; offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0; -retry: + memset(&pcc_param, 0, sizeof(pcc_param)); memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); + spin_lock_bh(&ctx->pk_lock); memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); - cpacf_pcc(ctx->fc, pcc_param.key + offset); - memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); + cpacf_pcc(ctx->fc, pcc_param.key + offset); memcpy(xts_param.init, pcc_param.xts, 16); while ((nbytes = walk.nbytes) != 0) { @@ -444,11 +533,15 @@ retry: if (k) ret = skcipher_walk_done(&walk, nbytes - k); if (k < n) { - if (__xts_paes_set_key(ctx) != 0) + if (__xts_paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); - goto retry; + spin_lock_bh(&ctx->pk_lock); + memcpy(xts_param.key + offset, + ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); } } + return ret; } @@ -485,6 +578,7 @@ static int ctr_paes_init(struct crypto_skcipher *tfm) struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); return 0; } @@ -496,12 +590,14 @@ static void ctr_paes_exit(struct crypto_skcipher *tfm) _free_kb_keybuf(&ctx->kb); } -static int __ctr_paes_set_key(struct s390_paes_ctx *ctx) +static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx) { + int rc; unsigned long fc; - if (__paes_convert_key(&ctx->kb, &ctx->pk)) - return -EINVAL; + rc = __paes_convert_key(ctx); + if (rc) + return rc; /* Pick the correct function code based on the protected key type */ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 : @@ -522,15 +618,11 @@ static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); _free_kb_keybuf(&ctx->kb); - rc = _copy_key_to_kb(&ctx->kb, in_key, key_len); + rc = _key_to_kb(&ctx->kb, in_key, key_len); if (rc) return rc; - if (__ctr_paes_set_key(ctx)) { - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return 0; + return __ctr_paes_set_key(ctx); } static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) @@ -556,49 +648,67 @@ static int ctr_paes_crypt(struct skcipher_request *req) struct skcipher_walk walk; unsigned int nbytes, n, k; int ret, locked; - - locked = spin_trylock(&ctrblk_lock); + struct { + u8 key[MAXPROTKEYSIZE]; + } param; ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + + locked = mutex_trylock(&ctrblk_lock); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { n = AES_BLOCK_SIZE; if (nbytes >= 2*AES_BLOCK_SIZE && locked) n = __ctrblk_init(ctrblk, walk.iv, nbytes); ctrptr = (n > AES_BLOCK_SIZE) ? 
ctrblk : walk.iv; - k = cpacf_kmctr(ctx->fc, ctx->pk.protkey, walk.dst.virt.addr, + k = cpacf_kmctr(ctx->fc, ¶m, walk.dst.virt.addr, walk.src.virt.addr, n, ctrptr); if (k) { if (ctrptr == ctrblk) memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE, AES_BLOCK_SIZE); crypto_inc(walk.iv, AES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, nbytes - n); + ret = skcipher_walk_done(&walk, nbytes - k); } if (k < n) { - if (__ctr_paes_set_key(ctx) != 0) { + if (__paes_convert_key(ctx)) { if (locked) - spin_unlock(&ctrblk_lock); + mutex_unlock(&ctrblk_lock); return skcipher_walk_done(&walk, -EIO); } + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } } if (locked) - spin_unlock(&ctrblk_lock); + mutex_unlock(&ctrblk_lock); /* * final block may be < AES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk.src.virt.addr, nbytes); while (1) { - if (cpacf_kmctr(ctx->fc, ctx->pk.protkey, buf, - walk.src.virt.addr, AES_BLOCK_SIZE, + if (cpacf_kmctr(ctx->fc, ¶m, buf, + buf, AES_BLOCK_SIZE, walk.iv) == AES_BLOCK_SIZE) break; - if (__ctr_paes_set_key(ctx) != 0) + if (__paes_convert_key(ctx)) return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); } memcpy(walk.dst.virt.addr, buf, nbytes); crypto_inc(walk.iv, AES_BLOCK_SIZE); - ret = skcipher_walk_done(&walk, 0); + ret = skcipher_walk_done(&walk, nbytes); } return ret; @@ -631,12 +741,12 @@ static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg) static void paes_s390_fini(void) { - if (ctrblk) - free_page((unsigned long) ctrblk); __crypto_unregister_skcipher(&ctr_paes_alg); __crypto_unregister_skcipher(&xts_paes_alg); __crypto_unregister_skcipher(&cbc_paes_alg); __crypto_unregister_skcipher(&ecb_paes_alg); + if (ctrblk) + free_page((unsigned long) ctrblk); } static int __init paes_s390_init(void) @@ -674,14 +784,14 @@ static int __init paes_s390_init(void) if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) || cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) { - ret = crypto_register_skcipher(&ctr_paes_alg); - if (ret) - goto out_err; ctrblk = (u8 *) __get_free_page(GFP_KERNEL); if (!ctrblk) { ret = -ENOMEM; goto out_err; } + ret = crypto_register_skcipher(&ctr_paes_alg); + if (ret) + goto out_err; } return 0; diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index d977643fa627..a077087bc6cc 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -249,7 +249,7 @@ static void prng_tdes_deinstantiate(void) { pr_debug("The prng module stopped " "after running in triple DES mode\n"); - kzfree(prng_data); + kfree_sensitive(prng_data); } @@ -414,7 +414,7 @@ static int __init prng_sha512_instantiate(void) } /* append the seed by 16 bytes of unique nonce */ - get_tod_clock_ext(seed + seedlen); + store_tod_clock_ext((union tod_clock *)(seed + seedlen)); seedlen += 16; /* now initial seed of the prno drng */ @@ -442,7 +442,7 @@ outfree: static void prng_sha512_deinstantiate(void) { pr_debug("The prng module stopped after running in SHA-512 mode\n"); - kzfree(prng_data); + kfree_sensitive(prng_data); } @@ -528,7 +528,7 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, /* give mutex free before calling schedule() */ mutex_unlock(&prng_data->mutex); schedule(); - /* occopy mutex again */ + /* occupy mutex again */ if 
(mutex_lock_interruptible(&prng_data->mutex)) { if (ret == 0) ret = -ERESTARTSYS; @@ -674,26 +674,12 @@ static const struct file_operations prng_tdes_fops = { .llseek = noop_llseek, }; -static struct miscdevice prng_sha512_dev = { - .name = "prandom", - .minor = MISC_DYNAMIC_MINOR, - .mode = 0644, - .fops = &prng_sha512_fops, -}; -static struct miscdevice prng_tdes_dev = { - .name = "prandom", - .minor = MISC_DYNAMIC_MINOR, - .mode = 0644, - .fops = &prng_tdes_fops, -}; - - /* chunksize attribute (ro) */ static ssize_t prng_chunksize_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size); + return scnprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size); } static DEVICE_ATTR(chunksize, 0444, prng_chunksize_show, NULL); @@ -712,7 +698,7 @@ static ssize_t prng_counter_show(struct device *dev, counter = prng_data->prngws.byte_counter; mutex_unlock(&prng_data->mutex); - return snprintf(buf, PAGE_SIZE, "%llu\n", counter); + return scnprintf(buf, PAGE_SIZE, "%llu\n", counter); } static DEVICE_ATTR(byte_counter, 0444, prng_counter_show, NULL); @@ -721,7 +707,7 @@ static ssize_t prng_errorflag_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag); + return scnprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag); } static DEVICE_ATTR(errorflag, 0444, prng_errorflag_show, NULL); @@ -731,9 +717,9 @@ static ssize_t prng_mode_show(struct device *dev, char *buf) { if (prng_mode == PRNG_MODE_TDES) - return snprintf(buf, PAGE_SIZE, "TDES\n"); + return scnprintf(buf, PAGE_SIZE, "TDES\n"); else - return snprintf(buf, PAGE_SIZE, "SHA512\n"); + return scnprintf(buf, PAGE_SIZE, "SHA512\n"); } static DEVICE_ATTR(mode, 0444, prng_mode_show, NULL); @@ -756,7 +742,7 @@ static ssize_t prng_reseed_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit); + return scnprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit); } static ssize_t prng_reseed_limit_store(struct device *dev, struct device_attribute *attr, @@ -787,7 +773,7 @@ static ssize_t prng_strength_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "256\n"); + return scnprintf(buf, PAGE_SIZE, "256\n"); } static DEVICE_ATTR(strength, 0444, prng_strength_show, NULL); @@ -801,18 +787,30 @@ static struct attribute *prng_sha512_dev_attrs[] = { &dev_attr_strength.attr, NULL }; +ATTRIBUTE_GROUPS(prng_sha512_dev); + static struct attribute *prng_tdes_dev_attrs[] = { &dev_attr_chunksize.attr, &dev_attr_byte_counter.attr, &dev_attr_mode.attr, NULL }; +ATTRIBUTE_GROUPS(prng_tdes_dev); -static struct attribute_group prng_sha512_dev_attr_group = { - .attrs = prng_sha512_dev_attrs +static struct miscdevice prng_sha512_dev = { + .name = "prandom", + .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, + .fops = &prng_sha512_fops, + .groups = prng_sha512_dev_groups, }; -static struct attribute_group prng_tdes_dev_attr_group = { - .attrs = prng_tdes_dev_attrs + +static struct miscdevice prng_tdes_dev = { + .name = "prandom", + .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, + .fops = &prng_tdes_fops, + .groups = prng_tdes_dev_groups, }; @@ -867,13 +865,6 @@ static int __init prng_init(void) prng_sha512_deinstantiate(); goto out; } - ret = sysfs_create_group(&prng_sha512_dev.this_device->kobj, - &prng_sha512_dev_attr_group); - if (ret) { - misc_deregister(&prng_sha512_dev); - prng_sha512_deinstantiate(); - goto out; - } } else { 
@@ -898,14 +889,6 @@ static int __init prng_init(void) prng_tdes_deinstantiate(); goto out; } - ret = sysfs_create_group(&prng_tdes_dev.this_device->kobj, - &prng_tdes_dev_attr_group); - if (ret) { - misc_deregister(&prng_tdes_dev); - prng_tdes_deinstantiate(); - goto out; - } - } out: @@ -916,17 +899,13 @@ out: static void __exit prng_exit(void) { if (prng_mode == PRNG_MODE_SHA512) { - sysfs_remove_group(&prng_sha512_dev.this_device->kobj, - &prng_sha512_dev_attr_group); misc_deregister(&prng_sha512_dev); prng_sha512_deinstantiate(); } else { - sysfs_remove_group(&prng_tdes_dev.this_device->kobj, - &prng_tdes_dev_attr_group); misc_deregister(&prng_tdes_dev); prng_tdes_deinstantiate(); } } -module_cpu_feature_match(MSA, prng_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, prng_init); module_exit(prng_exit); diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h index ada2f98c27b7..65ea12fc87a1 100644 --- a/arch/s390/crypto/sha.h +++ b/arch/s390/crypto/sha.h @@ -11,7 +11,8 @@ #define _CRYPTO_ARCH_S390_SHA_H #include <linux/crypto.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> +#include <crypto/sha2.h> #include <crypto/sha3.h> /* must be big enough for the largest SHA variant */ diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 7c15542d3685..bc3a22704e09 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -22,12 +22,12 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> #include <asm/cpacf.h> #include "sha.h" -static int sha1_init(struct shash_desc *desc) +static int s390_sha1_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); @@ -42,7 +42,7 @@ static int sha1_init(struct shash_desc *desc) return 0; } -static int sha1_export(struct shash_desc *desc, void *out) +static int s390_sha1_export(struct shash_desc *desc, void *out) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); struct sha1_state *octx = out; @@ -53,7 +53,7 @@ static int sha1_export(struct shash_desc *desc, void *out) return 0; } -static int sha1_import(struct shash_desc *desc, const void *in) +static int s390_sha1_import(struct shash_desc *desc, const void *in) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); const struct sha1_state *ictx = in; @@ -67,11 +67,11 @@ static int sha1_import(struct shash_desc *desc, const void *in) static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_init, + .init = s390_sha1_init, .update = s390_sha_update, .final = s390_sha_final, - .export = sha1_export, - .import = sha1_import, + .export = s390_sha1_export, + .import = s390_sha1_import, .descsize = sizeof(struct s390_sha_ctx), .statesize = sizeof(struct sha1_state), .base = { @@ -95,7 +95,7 @@ static void __exit sha1_s390_fini(void) crypto_unregister_shash(&alg); } -module_cpu_feature_match(MSA, sha1_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init); module_exit(sha1_s390_fini); MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index b52c87e44939..6f1ccdf93d3e 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <asm/cpacf.h> #include "sha.h" @@ -134,7 +134,7 @@ static void __exit sha256_s390_fini(void) crypto_unregister_shash(&sha256_alg); } 
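The prng rework above replaces open-coded sysfs_create_group() calls and their error unwinding with attribute groups attached directly to the miscdevice, so misc_register() creates and removes the sysfs files. The resulting pattern, reduced to a minimal sketch (the example_* names and fops are placeholders):

static struct attribute *example_attrs[] = {
	&dev_attr_chunksize.attr,
	NULL
};
ATTRIBUTE_GROUPS(example);		/* defines example_groups[] */

static struct miscdevice example_dev = {
	.name	= "prandom",
	.minor	= MISC_DYNAMIC_MINOR,
	.mode	= 0644,
	.fops	= &example_fops,	/* placeholder */
	.groups	= example_groups,	/* sysfs files managed by misc core */
};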
-module_cpu_feature_match(MSA, sha256_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init); module_exit(sha256_s390_fini); MODULE_ALIAS_CRYPTO("sha256"); diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c index 460cbbbaa44a..e1350e033a32 100644 --- a/arch/s390/crypto/sha3_256_s390.c +++ b/arch/s390/crypto/sha3_256_s390.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> #include <crypto/sha3.h> #include <asm/cpacf.h> @@ -138,7 +137,7 @@ static void __exit sha3_256_s390_fini(void) crypto_unregister_shash(&sha3_256_alg); } -module_cpu_feature_match(MSA, sha3_256_s390_init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); module_exit(sha3_256_s390_fini); MODULE_ALIAS_CRYPTO("sha3-256"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c index 72cf460a53e5..06c142ed9bb1 100644 --- a/arch/s390/crypto/sha3_512_s390.c +++ b/arch/s390/crypto/sha3_512_s390.c @@ -11,7 +11,6 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/cpufeature.h> -#include <crypto/sha.h> #include <crypto/sha3.h> #include <asm/cpacf.h> @@ -148,7 +147,7 @@ static void __exit fini(void) crypto_unregister_shash(&sha3_384_alg); } -module_cpu_feature_match(MSA, init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index ad29db085a18..04f11c407763 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -8,7 +8,7 @@ * Author(s): Jan Glauber (jang@de.ibm.com) */ #include <crypto/internal/hash.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> @@ -22,14 +22,14 @@ static int sha512_init(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - *(__u64 *)&ctx->state[0] = 0x6a09e667f3bcc908ULL; - *(__u64 *)&ctx->state[2] = 0xbb67ae8584caa73bULL; - *(__u64 *)&ctx->state[4] = 0x3c6ef372fe94f82bULL; - *(__u64 *)&ctx->state[6] = 0xa54ff53a5f1d36f1ULL; - *(__u64 *)&ctx->state[8] = 0x510e527fade682d1ULL; - *(__u64 *)&ctx->state[10] = 0x9b05688c2b3e6c1fULL; - *(__u64 *)&ctx->state[12] = 0x1f83d9abfb41bd6bULL; - *(__u64 *)&ctx->state[14] = 0x5be0cd19137e2179ULL; + *(__u64 *)&ctx->state[0] = SHA512_H0; + *(__u64 *)&ctx->state[2] = SHA512_H1; + *(__u64 *)&ctx->state[4] = SHA512_H2; + *(__u64 *)&ctx->state[6] = SHA512_H3; + *(__u64 *)&ctx->state[8] = SHA512_H4; + *(__u64 *)&ctx->state[10] = SHA512_H5; + *(__u64 *)&ctx->state[12] = SHA512_H6; + *(__u64 *)&ctx->state[14] = SHA512_H7; ctx->count = 0; ctx->func = CPACF_KIMD_SHA_512; @@ -87,14 +87,14 @@ static int sha384_init(struct shash_desc *desc) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - *(__u64 *)&ctx->state[0] = 0xcbbb9d5dc1059ed8ULL; - *(__u64 *)&ctx->state[2] = 0x629a292a367cd507ULL; - *(__u64 *)&ctx->state[4] = 0x9159015a3070dd17ULL; - *(__u64 *)&ctx->state[6] = 0x152fecd8f70e5939ULL; - *(__u64 *)&ctx->state[8] = 0x67332667ffc00b31ULL; - *(__u64 *)&ctx->state[10] = 0x8eb44a8768581511ULL; - *(__u64 *)&ctx->state[12] = 0xdb0c2e0d64f98fa7ULL; - *(__u64 *)&ctx->state[14] = 0x47b5481dbefa4fa4ULL; + *(__u64 *)&ctx->state[0] = SHA384_H0; + *(__u64 *)&ctx->state[2] = SHA384_H1; + *(__u64 *)&ctx->state[4] = SHA384_H2; + *(__u64 *)&ctx->state[6] = SHA384_H3; + *(__u64 *)&ctx->state[8] = SHA384_H4; + *(__u64 *)&ctx->state[10] = SHA384_H5; + *(__u64 
*)&ctx->state[12] = SHA384_H6; + *(__u64 *)&ctx->state[14] = SHA384_H7; ctx->count = 0; ctx->func = CPACF_KIMD_SHA_512; @@ -142,7 +142,7 @@ static void __exit fini(void) crypto_unregister_shash(&sha384_alg); } -module_cpu_feature_match(MSA, init); +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile index 06f601509ce9..c34854d298f8 100644 --- a/arch/s390/hypfs/Makefile +++ b/arch/s390/hypfs/Makefile @@ -3,7 +3,12 @@ # Makefile for the linux hypfs filesystem routines. # -obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_dbfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS) += hypfs_sprp.o +obj-$(CONFIG_S390_HYPFS) += hypfs_vm.o -s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o hypfs_sprp.o -s390_hypfs-objs += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_diag_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_vm_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += inode.o diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h index 05f3f9aee5fc..65f4036fd541 100644 --- a/arch/s390/hypfs/hypfs.h +++ b/arch/s390/hypfs/hypfs.h @@ -46,6 +46,15 @@ void hypfs_diag0c_exit(void); void hypfs_sprp_init(void); void hypfs_sprp_exit(void); +int __hypfs_fs_init(void); + +static inline int hypfs_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_fs_init(); + return 0; +} + /* debugfs interface */ struct hypfs_dbfs_file; @@ -69,7 +78,6 @@ struct hypfs_dbfs_file { struct dentry *dentry; }; -extern void hypfs_dbfs_init(void); extern void hypfs_dbfs_exit(void); extern void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df); extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df); diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c index f4c7dbfaf8ee..4024599eb448 100644 --- a/arch/s390/hypfs/hypfs_dbfs.c +++ b/arch/s390/hypfs/hypfs_dbfs.c @@ -90,12 +90,33 @@ void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df) debugfs_remove(df->dentry); } -void hypfs_dbfs_init(void) +static int __init hypfs_dbfs_init(void) { - dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); -} + int rc = -ENODATA; -void hypfs_dbfs_exit(void) -{ + dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); + if (hypfs_diag_init()) + goto fail_dbfs_exit; + if (hypfs_vm_init()) + goto fail_hypfs_diag_exit; + hypfs_sprp_init(); + if (hypfs_diag0c_init()) + goto fail_hypfs_sprp_exit; + rc = hypfs_fs_init(); + if (rc) + goto fail_hypfs_diag0c_exit; + return 0; + +fail_hypfs_diag0c_exit: + hypfs_diag0c_exit(); +fail_hypfs_sprp_exit: + hypfs_sprp_exit(); + hypfs_vm_exit(); +fail_hypfs_diag_exit: + hypfs_diag_exit(); + pr_err("Initialization of hypfs failed with rc=%i\n", rc); +fail_dbfs_exit: debugfs_remove(dbfs_dir); + return rc; } +device_initcall(hypfs_dbfs_init) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index f0bc4dc3e9bf..279b7bba4d43 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -18,196 +18,27 @@ #include <linux/mm.h> #include <asm/diag.h> #include <asm/ebcdic.h> +#include "hypfs_diag.h" #include "hypfs.h" -#define TMP_SIZE 64 /* size of temporary buffers */ - #define DBFS_D204_HDR_VERSION 0 -static char *diag224_cpu_names; /* diag 224 name table */ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ static void *diag204_buf; /* 4K 
aligned buffer for diag204 data */ -static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */ static int diag204_buf_pages; /* number of pages for diag204 data */ static struct dentry *dbfs_d204_file; -/* - * DIAG 204 member access functions. - * - * Since we have two different diag 204 data formats for old and new s390 - * machines, we do not access the structs directly, but use getter functions for - * each struct member instead. This should make the code more readable. - */ - -/* Time information block */ - -static inline int info_blk_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_info_blk_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_info_blk_hdr); -} - -static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->npar; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->npar; -} - -static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->flags; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->flags; -} - -static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus; -} - -/* Partition header */ - -static inline int part_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_part_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_part_hdr); -} - -static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_part_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_part_hdr *)hdr)->rcpus; -} - -static inline void part_hdr__part_name(enum diag204_format type, void *hdr, - char *name) -{ - if (type == DIAG204_INFO_SIMPLE) - memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - else /* DIAG204_INFO_EXT */ - memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - EBCASC(name, DIAG204_LPAR_NAME_LEN); - name[DIAG204_LPAR_NAME_LEN] = 0; - strim(name); -} - -/* CPU info block */ - -static inline int cpu_info__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_cpu_info); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_cpu_info); -} - -static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->ctidx; -} - -static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; -} - -static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->acc_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->acc_time; -} - -static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) -{ - 
if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->lp_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->lp_time; -} - -static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return 0; /* online_time not available in simple info */ - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->online_time; -} - -/* Physical header */ - -static inline int phys_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_hdr); -} - -static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_hdr *)hdr)->cpus; -} - -/* Physical CPU info block */ - -static inline int phys_cpu__size(enum diag204_format type) +enum diag204_format diag204_get_info_type(void) { - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_cpu); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_cpu); + return diag204_info_type; } -static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) +static void diag204_set_info_type(enum diag204_format type) { - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; -} - -static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->mgm_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; -} - -static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->ctidx; + diag204_info_type = type; } /* Diagnose 204 functions */ @@ -220,43 +51,11 @@ static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) static void diag204_free_buffer(void) { - if (!diag204_buf) - return; - if (diag204_buf_vmalloc) { - vfree(diag204_buf_vmalloc); - diag204_buf_vmalloc = NULL; - } else { - free_pages((unsigned long) diag204_buf, 0); - } + vfree(diag204_buf); diag204_buf = NULL; } -static void *page_align_ptr(void *ptr) -{ - return (void *) PAGE_ALIGN((unsigned long) ptr); -} - -static void *diag204_alloc_vbuf(int pages) -{ - /* The buffer has to be page aligned! 
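Both the removed helper below and its replacement address the same constraint: the diag204 data buffer must be page aligned. The old code over-allocated with vmalloc() and aligned the pointer by hand; the rework later in this patch requests the alignment directly, roughly as follows (sketch of the new allocation, as it appears further down):

buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
		     PAGE_SIZE,		/* alignment */
		     GFP_KERNEL, NUMA_NO_NODE,
		     __builtin_return_address(0));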
*/ - diag204_buf_vmalloc = vmalloc(array_size(PAGE_SIZE, (pages + 1))); - if (!diag204_buf_vmalloc) - return ERR_PTR(-ENOMEM); - diag204_buf = page_align_ptr(diag204_buf_vmalloc); - diag204_buf_pages = pages; - return diag204_buf; -} - -static void *diag204_alloc_rbuf(void) -{ - diag204_buf = (void*)__get_free_pages(GFP_KERNEL,0); - if (!diag204_buf) - return ERR_PTR(-ENOMEM); - diag204_buf_pages = 1; - return diag204_buf; -} - -static void *diag204_get_buffer(enum diag204_format fmt, int *pages) +void *diag204_get_buffer(enum diag204_format fmt, int *pages) { if (diag204_buf) { *pages = diag204_buf_pages; @@ -264,15 +63,19 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages) } if (fmt == DIAG204_INFO_SIMPLE) { *pages = 1; - return diag204_alloc_rbuf(); } else {/* DIAG204_INFO_EXT */ *pages = diag204((unsigned long)DIAG204_SUBC_RSI | (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) - return ERR_PTR(-ENOSYS); - else - return diag204_alloc_vbuf(*pages); + return ERR_PTR(-EOPNOTSUPP); } + diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return ERR_PTR(-ENOMEM); + diag204_buf_pages = *pages; + return diag204_buf; } /* @@ -299,13 +102,13 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB7 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB7; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } if (diag204((unsigned long)DIAG204_SUBC_STIB6 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB6; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } diag204_free_buffer(); @@ -321,10 +124,10 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB4 | (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB4; - diag204_info_type = DIAG204_INFO_SIMPLE; + diag204_set_info_type(DIAG204_INFO_SIMPLE); goto out; } else { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto fail_store; } out: @@ -335,58 +138,13 @@ fail_alloc: return rc; } -static int diag204_do_store(void *buf, int pages) +int diag204_store(void *buf, int pages) { int rc; - rc = diag204((unsigned long) diag204_store_sc | - (unsigned long) diag204_info_type, pages, buf); - return rc < 0 ? 
-ENOSYS : 0; -} - -static void *diag204_store(void) -{ - void *buf; - int pages, rc; - - buf = diag204_get_buffer(diag204_info_type, &pages); - if (IS_ERR(buf)) - goto out; - rc = diag204_do_store(buf, pages); - if (rc) - return ERR_PTR(rc); -out: - return buf; -} - -/* Diagnose 224 functions */ - -static int diag224_get_name_table(void) -{ - /* memory must be below 2GB */ - diag224_cpu_names = (char *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (!diag224_cpu_names) - return -ENOMEM; - if (diag224(diag224_cpu_names)) { - free_page((unsigned long) diag224_cpu_names); - return -EOPNOTSUPP; - } - EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); - return 0; -} - -static void diag224_delete_name_table(void) -{ - free_page((unsigned long) diag224_cpu_names); -} - -static int diag224_idx2name(int index, char *name) -{ - memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), - DIAG204_CPU_NAME_LEN); - name[DIAG204_CPU_NAME_LEN] = 0; - strim(name); - return 0; + rc = diag204((unsigned long)diag204_store_sc | + (unsigned long)diag204_get_info_type(), pages, buf); + return rc < 0 ? -EOPNOTSUPP : 0; } struct dbfs_d204_hdr { @@ -411,8 +169,8 @@ static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) base = vzalloc(buf_size); if (!base) return -ENOMEM; - d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr); - rc = diag204_do_store(d204->buf, diag204_buf_pages); + d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr); + rc = diag204_store(d204->buf, diag204_buf_pages); if (rc) { vfree(base); return rc; @@ -437,180 +195,25 @@ __init int hypfs_diag_init(void) int rc; if (diag204_probe()) { - pr_err("The hardware system does not support hypfs\n"); + pr_info("The hardware system does not support hypfs\n"); return -ENODATA; } - if (diag204_info_type == DIAG204_INFO_EXT) + if (diag204_get_info_type() == DIAG204_INFO_EXT) hypfs_dbfs_create_file(&dbfs_file_d204); - if (MACHINE_IS_LPAR) { - rc = diag224_get_name_table(); - if (rc) { - pr_err("The hardware system does not provide all " - "functions required by hypfs\n"); - debugfs_remove(dbfs_d204_file); - return rc; - } + rc = hypfs_diag_fs_init(); + if (rc) { + pr_err("The hardware system does not provide all functions required by hypfs\n"); + debugfs_remove(dbfs_d204_file); } - return 0; + return rc; } void hypfs_diag_exit(void) { debugfs_remove(dbfs_d204_file); - diag224_delete_name_table(); + hypfs_diag_fs_exit(); diag204_free_buffer(); hypfs_dbfs_remove_file(&dbfs_file_d204); } - -/* - * Functions to create the directory structure - * ******************************************* - */ - -static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - cpu_info__acc_time(diag204_info_type, cpu_info) - - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - rc = hypfs_create_u64(cpu_dir, "cputime", - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - if (diag204_info_type == DIAG204_INFO_EXT) { - rc = hypfs_create_u64(cpu_dir, "onlinetime", - cpu_info__online_time(diag204_info_type, - cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - } - diag224_idx2name(cpu_info__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", 
buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) -{ - struct dentry *cpus_dir; - struct dentry *lpar_dir; - char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; - void *cpu_info; - int i; - - part_hdr__part_name(diag204_info_type, part_hdr, lpar_name); - lpar_name[DIAG204_LPAR_NAME_LEN] = 0; - lpar_dir = hypfs_mkdir(systems_dir, lpar_name); - if (IS_ERR(lpar_dir)) - return lpar_dir; - cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = part_hdr + part_hdr__size(diag204_info_type); - for (i = 0; i < part_hdr__rcpus(diag204_info_type, part_hdr); i++) { - int rc; - rc = hypfs_create_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += cpu_info__size(diag204_info_type); - } - return cpu_info; -} - -static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - if (IS_ERR(cpu_dir)) - return PTR_ERR(cpu_dir); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - phys_cpu__mgm_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - diag224_idx2name(phys_cpu__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) -{ - int i; - void *cpu_info; - struct dentry *cpus_dir; - - cpus_dir = hypfs_mkdir(parent_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = phys_hdr + phys_hdr__size(diag204_info_type); - for (i = 0; i < phys_hdr__cpus(diag204_info_type, phys_hdr); i++) { - int rc; - rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += phys_cpu__size(diag204_info_type); - } - return cpu_info; -} - -int hypfs_diag_create_files(struct dentry *root) -{ - struct dentry *systems_dir, *hyp_dir; - void *time_hdr, *part_hdr; - int i, rc; - void *buffer, *ptr; - - buffer = diag204_store(); - if (IS_ERR(buffer)) - return PTR_ERR(buffer); - - systems_dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(systems_dir)) { - rc = PTR_ERR(systems_dir); - goto err_out; - } - time_hdr = (struct x_info_blk_hdr *)buffer; - part_hdr = time_hdr + info_blk_hdr__size(diag204_info_type); - for (i = 0; i < info_blk_hdr__npar(diag204_info_type, time_hdr); i++) { - part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); - if (IS_ERR(part_hdr)) { - rc = PTR_ERR(part_hdr); - goto err_out; - } - } - if (info_blk_hdr__flags(diag204_info_type, time_hdr) & - DIAG204_LPAR_PHYS_FLG) { - ptr = hypfs_create_phys_files(root, part_hdr); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - } - hyp_dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(hyp_dir)) { - rc = PTR_ERR(hyp_dir); - goto err_out; - } - ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - rc = 0; - -err_out: - return rc; -} diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h new file mode 100644 index 000000000000..7090eff27fef --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 
2006, 2008 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#ifndef _S390_HYPFS_DIAG_H_ +#define _S390_HYPFS_DIAG_H_ + +#include <asm/diag.h> + +enum diag204_format diag204_get_info_type(void); +void *diag204_get_buffer(enum diag204_format fmt, int *pages); +int diag204_store(void *buf, int pages); + +int __hypfs_diag_fs_init(void); +void __hypfs_diag_fs_exit(void); + +static inline int hypfs_diag_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_diag_fs_init(); + return 0; +} + +static inline void hypfs_diag_fs_exit(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + __hypfs_diag_fs_exit(); +} + +#endif /* _S390_HYPFS_DIAG_H_ */ diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c index 3235e4d82f2d..9a2786079e3a 100644 --- a/arch/s390/hypfs/hypfs_diag0c.c +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -21,7 +21,7 @@ static void diag0c_fn(void *data) { diag_stat_inc(DIAG_STAT_X00C); - diag_dma_ops.diag0c(((void **) data)[smp_processor_id()]); + diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]); } /* @@ -33,12 +33,12 @@ static void *diag0c_store(unsigned int *count) unsigned int cpu_count, cpu, i; void **cpu_vec; - get_online_cpus(); + cpus_read_lock(); cpu_count = num_online_cpus(); cpu_vec = kmalloc_array(num_possible_cpus(), sizeof(*cpu_vec), GFP_KERNEL); if (!cpu_vec) - goto fail_put_online_cpus; + goto fail_unlock_cpus; /* Note: Diag 0c needs 8 byte alignment and real storage */ diag0c_data = kzalloc(struct_size(diag0c_data, entry, cpu_count), GFP_KERNEL | GFP_DMA); @@ -54,13 +54,13 @@ static void *diag0c_store(unsigned int *count) on_each_cpu(diag0c_fn, cpu_vec, 1); *count = cpu_count; kfree(cpu_vec); - put_online_cpus(); + cpus_read_unlock(); return diag0c_data; fail_kfree_cpu_vec: kfree(cpu_vec); -fail_put_online_cpus: - put_online_cpus(); +fail_unlock_cpus: + cpus_read_unlock(); return ERR_PTR(-ENOMEM); } @@ -84,7 +84,7 @@ static int dbfs_diag0c_create(void **data, void **data_free_ptr, size_t *size) if (IS_ERR(diag0c_data)) return PTR_ERR(diag0c_data); memset(&diag0c_data->hdr, 0, sizeof(diag0c_data->hdr)); - get_tod_clock_ext(diag0c_data->hdr.tod_ext); + store_tod_clock_ext((union tod_clock *)diag0c_data->hdr.tod_ext); diag0c_data->hdr.len = count * sizeof(struct hypfs_diag0c_entry); diag0c_data->hdr.version = DBFS_D0C_HDR_VERSION; diag0c_data->hdr.count = count; diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c new file mode 100644 index 000000000000..00a6d370a280 --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag_fs.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#define KMSG_COMPONENT "hypfs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include "hypfs_diag.h" +#include "hypfs.h" + +#define TMP_SIZE 64 /* size of temporary buffers */ + +static char *diag224_cpu_names; /* diag 224 name table */ +static int diag224_idx2name(int index, char *name); + +/* + * DIAG 204 member access functions. + * + * Since we have two different diag 204 data formats for old and new s390 + * machines, we do not access the structs directly, but use getter functions for + * each struct member instead. 
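For example, instead of dereferencing struct diag204_cpu_info or
+ * struct diag204_x_cpu_info by hand, a caller reads a member through
+ * the accessor that matches the currently probed format, e.g.:
+ *
+ *	addr = cpu_info__cpu_addr(diag204_get_info_type(), cpu_info);
+ *
+ * (addr being a local variable of the caller.)
+ *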
This should make the code more readable. + */ + +/* Time information block */ + +static inline int info_blk_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); +} + +static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; +} + +static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; +} + +/* Partition header */ + +static inline int part_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); +} + +static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; +} + +static inline void part_hdr__part_name(enum diag204_format type, void *hdr, + char *name) +{ + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; + strim(name); +} + +/* CPU info block */ + +static inline int cpu_info__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); +} + +static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; +} + +static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; +} + +static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; +} + +static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; +} + +static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return 0; /* online_time not available in simple info */ + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; +} + +/* Physical header */ + +static inline int phys_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); +} + +static inline __u8 
phys_hdr__cpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; +} + +/* Physical CPU info block */ + +static inline int phys_cpu__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); +} + +static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; +} + +static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; +} + +static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; +} + +/* + * Functions to create the directory structure + * ******************************************* + */ + +static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + cpu_info__acc_time(diag204_get_info_type(), cpu_info) - + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + rc = hypfs_create_u64(cpu_dir, "cputime", + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + if (diag204_get_info_type() == DIAG204_INFO_EXT) { + rc = hypfs_create_u64(cpu_dir, "onlinetime", + cpu_info__online_time(diag204_get_info_type(), + cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + } + diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) +{ + struct dentry *cpus_dir; + struct dentry *lpar_dir; + char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; + void *cpu_info; + int i; + + part_hdr__part_name(diag204_get_info_type(), part_hdr, lpar_name); + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; + lpar_dir = hypfs_mkdir(systems_dir, lpar_name); + if (IS_ERR(lpar_dir)) + return lpar_dir; + cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = part_hdr + part_hdr__size(diag204_get_info_type()); + for (i = 0; i < part_hdr__rcpus(diag204_get_info_type(), part_hdr); i++) { + int rc; + + rc = hypfs_create_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += cpu_info__size(diag204_get_info_type()); + } + return cpu_info; +} + +static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + if (IS_ERR(cpu_dir)) + return PTR_ERR(cpu_dir); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + 
phys_cpu__mgm_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) +{ + int i; + void *cpu_info; + struct dentry *cpus_dir; + + cpus_dir = hypfs_mkdir(parent_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = phys_hdr + phys_hdr__size(diag204_get_info_type()); + for (i = 0; i < phys_hdr__cpus(diag204_get_info_type(), phys_hdr); i++) { + int rc; + + rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += phys_cpu__size(diag204_get_info_type()); + } + return cpu_info; +} + +int hypfs_diag_create_files(struct dentry *root) +{ + struct dentry *systems_dir, *hyp_dir; + void *time_hdr, *part_hdr; + void *buffer, *ptr; + int i, rc, pages; + + buffer = diag204_get_buffer(diag204_get_info_type(), &pages); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); + rc = diag204_store(buffer, pages); + if (rc) + return rc; + + systems_dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(systems_dir)) { + rc = PTR_ERR(systems_dir); + goto err_out; + } + time_hdr = (struct x_info_blk_hdr *)buffer; + part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type()); + for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) { + part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); + if (IS_ERR(part_hdr)) { + rc = PTR_ERR(part_hdr); + goto err_out; + } + } + if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) & + DIAG204_LPAR_PHYS_FLG) { + ptr = hypfs_create_phys_files(root, part_hdr); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + } + hyp_dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(hyp_dir)) { + rc = PTR_ERR(hyp_dir); + goto err_out; + } + ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + rc = 0; + +err_out: + return rc; +} + +/* Diagnose 224 functions */ + +static int diag224_idx2name(int index, char *name) +{ + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; + strim(name); + return 0; +} + +static int diag224_get_name_table(void) +{ + /* memory must be below 2GB */ + diag224_cpu_names = (char *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_cpu_names) + return -ENOMEM; + if (diag224(diag224_cpu_names)) { + free_page((unsigned long)diag224_cpu_names); + return -EOPNOTSUPP; + } + EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); + return 0; +} + +static void diag224_delete_name_table(void) +{ + free_page((unsigned long)diag224_cpu_names); +} + +int __init __hypfs_diag_fs_init(void) +{ + if (MACHINE_IS_LPAR) + return diag224_get_name_table(); + return 0; +} + +void __hypfs_diag_fs_exit(void) +{ + diag224_delete_name_table(); +} diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c index 7d9fb496d155..f5f7e78ddc0c 100644 --- a/arch/s390/hypfs/hypfs_sprp.c +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -25,14 +25,13 @@ static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd) { - register unsigned long _data asm("2") = (unsigned long) data; - register unsigned long _rc asm("3"); - register unsigned long _cmd asm("4") = cmd; + union register_pair r1 = { .even = (unsigned long)data, }; - asm volatile("diag %1,%2,0x304\n" - : "=d" (_rc) : 
"d" (_data), "d" (_cmd) : "memory"); - - return _rc; + asm volatile("diag %[r1],%[r3],0x304\n" + : [r1] "+&d" (r1.pair) + : [r3] "d" (cmd) + : "memory"); + return r1.odd; } static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd) diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index e1fcc03159ef..3db40ad853e0 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -10,49 +10,19 @@ #include <linux/errno.h> #include <linux/string.h> #include <linux/vmalloc.h> +#include <asm/extable.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/timex.h> +#include "hypfs_vm.h" #include "hypfs.h" -#define NAME_LEN 8 #define DBFS_D2FC_HDR_VERSION 0 static char local_guest[] = " "; static char all_guests[] = "* "; -static char *guest_query; - -struct diag2fc_data { - __u32 version; - __u32 flags; - __u64 used_cpu; - __u64 el_time; - __u64 mem_min_kb; - __u64 mem_max_kb; - __u64 mem_share_kb; - __u64 mem_used_kb; - __u32 pcpus; - __u32 lcpus; - __u32 vcpus; - __u32 ocpus; - __u32 cpu_max; - __u32 cpu_shares; - __u32 cpu_use_samp; - __u32 cpu_delay_samp; - __u32 page_wait_samp; - __u32 idle_samp; - __u32 other_samp; - __u32 total_samp; - char guest_name[NAME_LEN]; -}; - -struct diag2fc_parm_list { - char userid[NAME_LEN]; - char aci_grp[NAME_LEN]; - __u64 addr; - __u32 size; - __u32 fmt; -}; +static char *all_groups = all_guests; +char *diag2fc_guest_query; static int diag2fc(int size, char* query, void *addr) { @@ -60,12 +30,13 @@ static int diag2fc(int size, char* query, void *addr) unsigned long rc; struct diag2fc_parm_list parm_list; - memcpy(parm_list.userid, query, NAME_LEN); - ASCEBC(parm_list.userid, NAME_LEN); - parm_list.addr = (unsigned long) addr ; + memcpy(parm_list.userid, query, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.userid, DIAG2FC_NAME_LEN); + memcpy(parm_list.aci_grp, all_groups, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.aci_grp, DIAG2FC_NAME_LEN); + parm_list.addr = (unsigned long)addr; parm_list.size = size; parm_list.fmt = 0x02; - memset(parm_list.aci_grp, 0x40, NAME_LEN); rc = -1; diag_stat_inc(DIAG_STAT_X2FC); @@ -84,7 +55,7 @@ static int diag2fc(int size, char* query, void *addr) /* * Allocate buffer for "query" and store diag 2fc at "offset" */ -static void *diag2fc_store(char *query, unsigned int *count, int offset) +void *diag2fc_store(char *query, unsigned int *count, int offset) { void *data; int size; @@ -105,136 +76,15 @@ static void *diag2fc_store(char *query, unsigned int *count, int offset) return data; } -static void diag2fc_free(const void *data) +void diag2fc_free(const void *data) { vfree(data); } -#define ATTRIBUTE(dir, name, member) \ -do { \ - void *rc; \ - rc = hypfs_create_u64(dir, name, member); \ - if (IS_ERR(rc)) \ - return PTR_ERR(rc); \ -} while(0) - -static int hypfs_vm_create_guest(struct dentry *systems_dir, - struct diag2fc_data *data) -{ - char guest_name[NAME_LEN + 1] = {}; - struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; - int dedicated_flag, capped_value; - - capped_value = (data->flags & 0x00000006) >> 1; - dedicated_flag = (data->flags & 0x00000008) >> 3; - - /* guest dir */ - memcpy(guest_name, data->guest_name, NAME_LEN); - EBCASC(guest_name, NAME_LEN); - strim(guest_name); - guest_dir = hypfs_mkdir(systems_dir, guest_name); - if (IS_ERR(guest_dir)) - return PTR_ERR(guest_dir); - ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); - - /* logical cpu information */ - cpus_dir = hypfs_mkdir(guest_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return PTR_ERR(cpus_dir); - 
ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); - ATTRIBUTE(cpus_dir, "capped", capped_value); - ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); - ATTRIBUTE(cpus_dir, "count", data->vcpus); - /* - * Note: The "weight_min" attribute got the wrong name. - * The value represents the number of non-stopped (operating) - * CPUS. - */ - ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); - ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); - ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); - - /* memory information */ - mem_dir = hypfs_mkdir(guest_dir, "mem"); - if (IS_ERR(mem_dir)) - return PTR_ERR(mem_dir); - ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); - ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); - ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); - ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); - - /* samples */ - samples_dir = hypfs_mkdir(guest_dir, "samples"); - if (IS_ERR(samples_dir)) - return PTR_ERR(samples_dir); - ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); - ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); - ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); - ATTRIBUTE(samples_dir, "idle", data->idle_samp); - ATTRIBUTE(samples_dir, "other", data->other_samp); - ATTRIBUTE(samples_dir, "total", data->total_samp); - return 0; -} - -int hypfs_vm_create_files(struct dentry *root) -{ - struct dentry *dir, *file; - struct diag2fc_data *data; - unsigned int count = 0; - int rc, i; - - data = diag2fc_store(guest_query, &count, 0); - if (IS_ERR(data)) - return PTR_ERR(data); - - /* Hpervisor Info */ - dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* physical cpus */ - dir = hypfs_mkdir(root, "cpus"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_u64(dir, "count", data->lcpus); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* guests */ - dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - - for (i = 0; i < count; i++) { - rc = hypfs_vm_create_guest(dir, &(data[i])); - if (rc) - goto failed; - } - diag2fc_free(data); - return 0; - -failed: - diag2fc_free(data); - return rc; -} - struct dbfs_d2fc_hdr { u64 len; /* Length of d2fc buffer without header */ u16 version; /* Version of header */ - char tod_ext[STORE_CLOCK_EXT_SIZE]; /* TOD clock for d2fc */ + union tod_clock tod_ext; /* TOD clock for d2fc */ u64 count; /* Number of VM guests in d2fc buffer */ char reserved[30]; } __attribute__ ((packed)); @@ -249,10 +99,10 @@ static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size) struct dbfs_d2fc *d2fc; unsigned int count; - d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr)); + d2fc = diag2fc_store(diag2fc_guest_query, &count, sizeof(d2fc->hdr)); if (IS_ERR(d2fc)) return PTR_ERR(d2fc); - get_tod_clock_ext(d2fc->hdr.tod_ext); + store_tod_clock_ext(&d2fc->hdr.tod_ext); d2fc->hdr.len = count * sizeof(struct diag2fc_data); d2fc->hdr.version = DBFS_D2FC_HDR_VERSION; d2fc->hdr.count = count; @@ -274,9 +124,9 @@ int hypfs_vm_init(void) if (!MACHINE_IS_VM) return 0; if (diag2fc(0, all_guests, NULL) > 0) - guest_query = all_guests; + diag2fc_guest_query = all_guests; else if (diag2fc(0, local_guest, NULL) > 0) - guest_query = local_guest; + diag2fc_guest_query = local_guest; else return -EACCES; hypfs_dbfs_create_file(&dbfs_file_2fc); diff --git 
a/arch/s390/hypfs/hypfs_vm.h b/arch/s390/hypfs/hypfs_vm.h new file mode 100644 index 000000000000..fe2e5851addd --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#ifndef _S390_HYPFS_VM_H_ +#define _S390_HYPFS_VM_H_ + +#define DIAG2FC_NAME_LEN 8 + +struct diag2fc_data { + __u32 version; + __u32 flags; + __u64 used_cpu; + __u64 el_time; + __u64 mem_min_kb; + __u64 mem_max_kb; + __u64 mem_share_kb; + __u64 mem_used_kb; + __u32 pcpus; + __u32 lcpus; + __u32 vcpus; + __u32 ocpus; + __u32 cpu_max; + __u32 cpu_shares; + __u32 cpu_use_samp; + __u32 cpu_delay_samp; + __u32 page_wait_samp; + __u32 idle_samp; + __u32 other_samp; + __u32 total_samp; + char guest_name[DIAG2FC_NAME_LEN]; +}; + +struct diag2fc_parm_list { + char userid[DIAG2FC_NAME_LEN]; + char aci_grp[DIAG2FC_NAME_LEN]; + __u64 addr; + __u32 size; + __u32 fmt; +}; + +void *diag2fc_store(char *query, unsigned int *count, int offset); +void diag2fc_free(const void *data); +extern char *diag2fc_guest_query; + +#endif /* _S390_HYPFS_VM_H_ */ diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c new file mode 100644 index 000000000000..6011289afa8c --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm_fs.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/vmalloc.h> +#include <asm/extable.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/timex.h> +#include "hypfs_vm.h" +#include "hypfs.h" + +#define ATTRIBUTE(dir, name, member) \ +do { \ + void *rc; \ + rc = hypfs_create_u64(dir, name, member); \ + if (IS_ERR(rc)) \ + return PTR_ERR(rc); \ +} while (0) + +static int hypfs_vm_create_guest(struct dentry *systems_dir, + struct diag2fc_data *data) +{ + char guest_name[DIAG2FC_NAME_LEN + 1] = {}; + struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; + int dedicated_flag, capped_value; + + capped_value = (data->flags & 0x00000006) >> 1; + dedicated_flag = (data->flags & 0x00000008) >> 3; + + /* guest dir */ + memcpy(guest_name, data->guest_name, DIAG2FC_NAME_LEN); + EBCASC(guest_name, DIAG2FC_NAME_LEN); + strim(guest_name); + guest_dir = hypfs_mkdir(systems_dir, guest_name); + if (IS_ERR(guest_dir)) + return PTR_ERR(guest_dir); + ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); + + /* logical cpu information */ + cpus_dir = hypfs_mkdir(guest_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return PTR_ERR(cpus_dir); + ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); + ATTRIBUTE(cpus_dir, "capped", capped_value); + ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); + ATTRIBUTE(cpus_dir, "count", data->vcpus); + /* + * Note: The "weight_min" attribute got the wrong name. + * The value represents the number of non-stopped (operating) + * CPUS. 
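+	 * For example, a guest with three operating virtual CPUs reports
+	 * weight_min=3 here, independent of its actual share weight.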
+ */ + ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); + ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); + ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); + + /* memory information */ + mem_dir = hypfs_mkdir(guest_dir, "mem"); + if (IS_ERR(mem_dir)) + return PTR_ERR(mem_dir); + ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); + ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); + ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); + ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); + + /* samples */ + samples_dir = hypfs_mkdir(guest_dir, "samples"); + if (IS_ERR(samples_dir)) + return PTR_ERR(samples_dir); + ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); + ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); + ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); + ATTRIBUTE(samples_dir, "idle", data->idle_samp); + ATTRIBUTE(samples_dir, "other", data->other_samp); + ATTRIBUTE(samples_dir, "total", data->total_samp); + return 0; +} + +int hypfs_vm_create_files(struct dentry *root) +{ + struct dentry *dir, *file; + struct diag2fc_data *data; + unsigned int count = 0; + int rc, i; + + data = diag2fc_store(diag2fc_guest_query, &count, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + /* Hypervisor Info */ + dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* physical cpus */ + dir = hypfs_mkdir(root, "cpus"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_u64(dir, "count", data->lcpus); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* guests */ + dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + + for (i = 0; i < count; i++) { + rc = hypfs_vm_create_guest(dir, &data[i]); + if (rc) + goto failed; + } + diag2fc_free(data); + return 0; + +failed: + diag2fc_free(data); + return rc; +} diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 70139d0791b6..858beaf4a8cb 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -53,7 +53,7 @@ static void hypfs_update_update(struct super_block *sb) struct inode *inode = d_inode(sb_info->update_file); sb_info->last_update = ktime_get_seconds(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } /* directory tree removal functions */ @@ -101,7 +101,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, umode_t mode) ret->i_mode = mode; ret->i_uid = hypfs_info->uid; ret->i_gid = hypfs_info->gid; - ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret); + simple_inode_init_ts(ret); if (S_ISDIR(mode)) set_nlink(ret, 2); } @@ -209,17 +209,12 @@ static int hypfs_release(struct inode *inode, struct file *filp) enum { Opt_uid, Opt_gid, }; -static const struct fs_parameter_spec hypfs_param_specs[] = { +static const struct fs_parameter_spec hypfs_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_u32("uid", Opt_uid), {} }; -static const struct fs_parameter_description hypfs_fs_parameters = { - .name = "hypfs", - .specs = hypfs_param_specs, -}; - static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hypfs_sb_info *hypfs_info = fc->s_fs_info; @@ -228,7 +223,7 @@ static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) kgid_t gid; int opt; - opt = fs_parse(fc, &hypfs_fs_parameters, param, &result); + opt 
= fs_parse(fc, hypfs_fs_parameters, param, &result); if (opt < 0) return opt; @@ -455,7 +450,7 @@ static struct file_system_type hypfs_type = { .owner = THIS_MODULE, .name = "s390_hypfs", .init_fs_context = hypfs_init_fs_context, - .parameters = &hypfs_fs_parameters, + .parameters = hypfs_fs_parameters, .kill_sb = hypfs_kill_super }; @@ -465,45 +460,18 @@ static const struct super_operations hypfs_s_ops = { .show_options = hypfs_show_options, }; -static int __init hypfs_init(void) +int __init __hypfs_fs_init(void) { int rc; - hypfs_dbfs_init(); - - if (hypfs_diag_init()) { - rc = -ENODATA; - goto fail_dbfs_exit; - } - if (hypfs_vm_init()) { - rc = -ENODATA; - goto fail_hypfs_diag_exit; - } - hypfs_sprp_init(); - if (hypfs_diag0c_init()) { - rc = -ENODATA; - goto fail_hypfs_sprp_exit; - } rc = sysfs_create_mount_point(hypervisor_kobj, "s390"); if (rc) - goto fail_hypfs_diag0c_exit; + return rc; rc = register_filesystem(&hypfs_type); if (rc) - goto fail_filesystem; + goto fail; return 0; - -fail_filesystem: +fail: sysfs_remove_mount_point(hypervisor_kobj, "s390"); -fail_hypfs_diag0c_exit: - hypfs_diag0c_exit(); -fail_hypfs_sprp_exit: - hypfs_sprp_exit(); - hypfs_vm_exit(); -fail_hypfs_diag_exit: - hypfs_diag_exit(); -fail_dbfs_exit: - hypfs_dbfs_exit(); - pr_err("Initialization of hypfs failed with rc=%i\n", rc); return rc; } -device_initcall(hypfs_init) diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 2531f673f099..4b904110d27c 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,22 +5,5 @@ generated-y += syscall_table.h generated-y += unistd_nr.h generic-y += asm-offsets.h -generic-y += cacheflush.h -generic-y += device.h -generic-y += dma-contiguous.h -generic-y += dma-mapping.h -generic-y += div64.h -generic-y += emergency-restart.h -generic-y += export.h -generic-y += fb.h -generic-y += irq_regs.h -generic-y += irq_work.h -generic-y += kmap_types.h -generic-y += local.h -generic-y += local64.h +generic-y += kvm_types.h generic-y += mcs_spinlock.h -generic-y += mm-arch-hooks.h -generic-y += mmiowb.h -generic-y += trace_clock.h -generic-y += unaligned.h -generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h new file mode 100644 index 000000000000..6f264b79e377 --- /dev/null +++ b/arch/s390/include/asm/abs_lowcore.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ABS_LOWCORE_H +#define _ASM_S390_ABS_LOWCORE_H + +#include <asm/lowcore.h> + +#define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore)) + +extern unsigned long __abs_lowcore; + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc); +void abs_lowcore_unmap(int cpu); + +static inline struct lowcore *get_abs_lowcore(void) +{ + int cpu; + + cpu = get_cpu(); + return ((struct lowcore *)__abs_lowcore) + cpu; +} + +static inline void put_abs_lowcore(struct lowcore *lc) +{ + put_cpu(); +} + +#endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 01936fdfaddb..c4c28c2609a5 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -12,12 +12,12 @@ #include <linux/bit_spinlock.h> #include <linux/dma-mapping.h> +#include <asm/tpi.h> struct airq_struct { struct hlist_node list; /* Handler queueing. 
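Entries are hooked into this list via register_adapter_interrupt().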
*/ - void (*handler)(struct airq_struct *airq, bool floating); + void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ - u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ u8 flags; }; @@ -46,8 +46,10 @@ struct airq_iv { #define AIRQ_IV_PTR 4 /* Allocate the ptr array */ #define AIRQ_IV_DATA 8 /* Allocate the data array */ #define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ +#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */ -struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); +struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags, + unsigned long *vec); void airq_iv_release(struct airq_iv *iv); unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h index 955d620db23e..7db046596b93 100644 --- a/arch/s390/include/asm/alternative-asm.h +++ b/arch/s390/include/asm/alternative-asm.h @@ -5,19 +5,6 @@ #ifdef __ASSEMBLY__ /* - * Check the length of an instruction sequence. The length may not be larger - * than 254 bytes and it has to be divisible by 2. - */ -.macro alt_len_check start,end - .if ( \end - \start ) > 254 - .error "cpu alternatives does not support instructions blocks > 254 bytes\n" - .endif - .if ( \end - \start ) % 2 - .error "cpu alternatives instructions length is odd\n" - .endif -.endm - -/* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains * enough information for the alternatives patching code to patch an @@ -28,60 +15,29 @@ .long \alt_start - . .word \feature .byte \orig_end - \orig_start - .byte \alt_end - \alt_start -.endm - -/* - * Fill up @bytes with nops. The macro emits 6-byte nop instructions - * for the bulk of the area, possibly followed by a 4-byte and/or - * a 2-byte nop if the size of the area is not divisible by 6. - */ -.macro alt_pad_fill bytes - .fill ( \bytes ) / 6, 6, 0xc0040000 - .fill ( \bytes ) % 6 / 4, 4, 0x47000000 - .fill ( \bytes ) % 6 % 4 / 2, 2, 0x0700 -.endm - -/* - * Fill up @bytes with nops. If the number of bytes is larger - * than 6, emit a jg instruction to branch over all nops, then - * fill an area of size (@bytes - 6) with nop instructions. - */ -.macro alt_pad bytes - .if ( \bytes > 0 ) - .if ( \bytes > 6 ) - jg . + \bytes - alt_pad_fill \bytes - 6 - .else - alt_pad_fill \bytes - .endif - .endif + .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) + .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) .endm /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. + * @newinstr. */ .macro ALTERNATIVE oldinstr, newinstr, feature .pushsection .altinstr_replacement,"ax" 770: \newinstr 771: .popsection 772: \oldinstr -773: alt_len_check 770b, 771b - alt_len_check 772b, 773b - alt_pad ( ( 771b - 770b ) - ( 773b - 772b ) ) -774: .pushsection .altinstructions,"a" - alt_entry 772b, 774b, 770b, 771b, \feature +773: .pushsection .altinstructions,"a" + alt_entry 772b, 773b, 770b, 771b, \feature .popsection .endm /* * Define an alternative between two instructions. 
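Both instruction sequences must have the
 * same size; the paired .org directives emitted by alt_entry turn any
 * size mismatch into an assembly-time error.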
If @feature is * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. + * @newinstr. */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 .pushsection .altinstr_replacement,"ax" @@ -89,17 +45,9 @@ 771: \newinstr2 772: .popsection 773: \oldinstr -774: alt_len_check 770b, 771b - alt_len_check 771b, 772b - alt_len_check 773b, 774b - .if ( 771b - 770b > 772b - 771b ) - alt_pad ( ( 771b - 770b ) - ( 774b - 773b ) ) - .else - alt_pad ( ( 772b - 771b ) - ( 774b - 773b ) ) - .endif -775: .pushsection .altinstructions,"a" - alt_entry 773b, 775b, 770b, 771b,\feature1 - alt_entry 773b, 775b, 771b, 772b,\feature2 +774: .pushsection .altinstructions,"a" + alt_entry 773b, 774b, 770b, 771b,\feature1 + alt_entry 773b, 774b, 771b, 772b,\feature2 .popsection .endm diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 1c8a38f762a3..904dd049f954 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -13,32 +13,25 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 facility; /* facility bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction */ } __packed; void apply_alternative_instructions(void); void apply_alternatives(struct alt_instr *start, struct alt_instr *end); /* - * |661: |662: |6620 |663: - * +-----------+---------------------+ - * | oldinstr | oldinstr_padding | - * | +----------+----------+ - * | | | | - * | | >6 bytes |6/4/2 nops| - * | |6 bytes jg-----------> - * +-----------+---------------------+ - * ^^ static padding ^^ + * +---------------------------------+ + * |661: |662: + * | oldinstr | + * +---------------------------------+ * * .altinstr_replacement section - * +---------------------+-----------+ + * +---------------------------------+ * |6641: |6651: * | alternative instr 1 | - * +-----------+---------+- - - - - -+ - * |6642: |6652: | - * | alternative instr 2 | padding - * +---------------------+- - - - - -+ - * ^ runtime ^ + * +---------------------------------+ + * |6642: |6652: + * | alternative instr 2 | + * +---------------------------------+ * * .altinstructions section * +---------------------------------+ @@ -47,70 +40,31 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); * +---------------------------------+ */ -#define b_altinstr(num) "664"#num -#define e_altinstr(num) "665"#num - -#define e_oldinstr_pad_end "663" +#define b_altinstr(num) "664"#num +#define e_altinstr(num) "665"#num #define oldinstr_len "662b-661b" -#define oldinstr_total_len e_oldinstr_pad_end"b-661b" #define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b" -#define oldinstr_pad_len(num) \ - "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \ - "((" altinstr_len(num) ")-(" oldinstr_len "))" - -#define INSTR_LEN_SANITY_CHECK(len) \ - ".if " len " > 254\n" \ - "\t.error \"cpu alternatives does not support instructions " \ - "blocks > 254 bytes\"\n" \ - ".endif\n" \ - ".if (" len ") %% 2\n" \ - "\t.error \"cpu alternatives instructions length is odd\"\n" \ - ".endif\n" - -#define OLDINSTR_PADDING(oldinstr, num) \ - ".if " oldinstr_pad_len(num) " > 6\n" \ - "\tjg " e_oldinstr_pad_end "f\n" \ - "6620:\n" \ - "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \ - ".else\n" \ - "\t.fill " oldinstr_pad_len(num) " / 6, 6, 
0xc0040000\n" \ - "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \ - "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \ - ".endif\n" - -#define OLDINSTR(oldinstr, num) \ - "661:\n\t" oldinstr "\n662:\n" \ - OLDINSTR_PADDING(oldinstr, num) \ - e_oldinstr_pad_end ":\n" \ - INSTR_LEN_SANITY_CHECK(oldinstr_len) - -#define OLDINSTR_2(oldinstr, num1, num2) \ - "661:\n\t" oldinstr "\n662:\n" \ - ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \ - OLDINSTR_PADDING(oldinstr, num2) \ - ".else\n" \ - OLDINSTR_PADDING(oldinstr, num1) \ - ".endif\n" \ - e_oldinstr_pad_end ":\n" \ - INSTR_LEN_SANITY_CHECK(oldinstr_len) + +#define OLDINSTR(oldinstr) \ + "661:\n\t" oldinstr "\n662:\n" #define ALTINSTR_ENTRY(facility, num) \ "\t.long 661b - .\n" /* old instruction */ \ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ "\t.word " __stringify(facility) "\n" /* facility bit */ \ - "\t.byte " oldinstr_total_len "\n" /* source len */ \ - "\t.byte " altinstr_len(num) "\n" /* alt instruction len */ + "\t.byte " oldinstr_len "\n" /* instruction len */ \ + "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \ + "\t.org . - (" altinstr_len(num) ") + (" oldinstr_len ")\n" #define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \ - b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \ - INSTR_LEN_SANITY_CHECK(altinstr_len(num)) + b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, altinstr, facility) \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(altinstr, 1) \ ".popsection\n" \ - OLDINSTR(oldinstr, 1) \ + OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(facility, 1) \ ".popsection\n" @@ -120,7 +74,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); ALTINSTR_REPLACEMENT(altinstr1, 1) \ ALTINSTR_REPLACEMENT(altinstr2, 2) \ ".popsection\n" \ - OLDINSTR_2(oldinstr, 1, 2) \ + OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(facility1, 1) \ ALTINSTR_ENTRY(facility2, 2) \ @@ -145,6 +99,22 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \ altinstr2, facility2) ::: "memory") +/* Alternative inline assembly with input. */ +#define alternative_input(oldinstr, newinstr, feature, input...) \ + asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : : input) + +/* Like alternative_input, but with a single output argument */ +#define alternative_io(oldinstr, altinstr, facility, output, input...) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) \ + : output : input) + +/* Use this macro if more than one output parameter is needed. */ +#define ASM_OUTPUT2(a...) a + +/* Use this macro if clobbers are needed without inputs. */ +#define ASM_NO_INPUT_CLOBBER(clobber...) : clobber + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_ALTERNATIVE_H */ diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index aea32dda3d14..43ac4a64f49b 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -12,6 +12,9 @@ #ifndef _ASM_S390_AP_H_ #define _ASM_S390_AP_H_ +#include <linux/io.h> +#include <asm/asm-extable.h> + /** * The ap_qid_t identifier of an ap queue. 
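It encodes the adapter number and the
 * domain index and is typically built with the AP_MKQID() macro.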
* If the AP facilities test (APFT) facility is available, @@ -40,10 +43,24 @@ struct ap_queue_status { unsigned int queue_empty : 1; unsigned int replies_waiting : 1; unsigned int queue_full : 1; - unsigned int _pad1 : 4; + unsigned int : 3; + unsigned int async : 1; unsigned int irq_enabled : 1; unsigned int response_code : 8; - unsigned int _pad2 : 16; + unsigned int : 16; +}; + +/* + * AP queue status reg union to access the reg1 + * register with the lower 32 bits comprising the + * ap queue status. + */ +union ap_queue_status_reg { + unsigned long value; + struct { + u32 _pad; + struct ap_queue_status status; + }; }; /** @@ -53,54 +70,98 @@ struct ap_queue_status { */ static inline bool ap_instructions_available(void) { - register unsigned long reg0 asm ("0") = AP_MKQID(0, 0); - register unsigned long reg1 asm ("1") = 0; - register unsigned long reg2 asm ("2") = 0; + unsigned long reg0 = AP_MKQID(0, 0); + unsigned long reg1 = 0; asm volatile( - " .long 0xb2af0000\n" /* PQAP(TAPQ) */ - "0: la %0,1\n" + " lgr 0,%[reg0]\n" /* qid into gr0 */ + " lghi 1,0\n" /* 0 into gr1 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ + "0: la %[reg1],1\n" /* 1 into reg1 */ "1:\n" EX_TABLE(0b, 1b) - : "+d" (reg1), "+d" (reg2) - : "d" (reg0) - : "cc"); + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); return reg1 != 0; } +/* TAPQ register GR2 response struct */ +struct ap_tapq_hwinfo { + union { + unsigned long value; + struct { + unsigned int fac : 32; /* facility bits */ + unsigned int apinfo : 32; /* ap type, ... */ + }; + struct { + unsigned int apsc : 1; /* APSC */ + unsigned int mex4k : 1; /* AP4KM */ + unsigned int crt4k : 1; /* AP4KC */ + unsigned int cca : 1; /* D */ + unsigned int accel : 1; /* A */ + unsigned int ep11 : 1; /* X */ + unsigned int apxa : 1; /* APXA */ + unsigned int : 1; + unsigned int class : 8; + unsigned int bs : 2; /* SE bind/assoc */ + unsigned int : 14; + unsigned int at : 8; /* ap type */ + unsigned int nd : 8; /* nr of domains */ + unsigned int : 4; + unsigned int ml : 4; /* apxl ml */ + unsigned int : 4; + unsigned int qd : 4; /* queue depth */ + }; + }; +}; + +/* + * Convenience defines to be used with the bs field from struct ap_tapq_gr2 + */ +#define AP_BS_Q_USABLE 0 +#define AP_BS_Q_USABLE_NO_SECURE_KEY 1 +#define AP_BS_Q_AVAIL_FOR_BINDING 2 +#define AP_BS_Q_UNUSABLE 3 + /** * ap_tapq(): Test adjunct processor queue. * @qid: The AP queue number - * @info: Pointer to queue descriptor + * @info: Pointer to tapq hwinfo struct * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info) +static inline struct ap_queue_status ap_tapq(ap_qid_t qid, + struct ap_tapq_hwinfo *info) { - register unsigned long reg0 asm ("0") = qid; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm ("2"); - - asm volatile(".long 0xb2af0000" /* PQAP(TAPQ) */ - : "=d" (reg1), "=d" (reg2) - : "d" (reg0) - : "cc"); + union ap_queue_status_reg reg1; + unsigned long reg2; + + asm volatile( + " lgr 0,%[qid]\n" /* qid into gr0 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 into reg2 */ + : [reg1] "=&d" (reg1.value), [reg2] "=&d" (reg2) + : [qid] "d" (qid) + : "cc", "0", "1", "2"); if (info) - *info = reg2; - return reg1; + info->value = reg2; + return reg1.status; } /** * ap_test_queue(): Test adjunct processor queue. 
* @qid: The AP queue number * @tbit: Test facilities bit - * @info: Pointer to queue descriptor + * @info: Ptr to tapq gr2 struct * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, - int tbit, - unsigned long *info) +static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, int tbit, + struct ap_tapq_hwinfo *info) { if (tbit) qid |= 1UL << 23; /* set T bit*/ @@ -110,39 +171,51 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, /** * ap_pqap_rapq(): Reset adjunct processor queue. * @qid: The AP queue number + * @fbit: if != 0 set F bit * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_rapq(ap_qid_t qid) +static inline struct ap_queue_status ap_rapq(ap_qid_t qid, int fbit) { - register unsigned long reg0 asm ("0") = qid | (1UL << 24); - register struct ap_queue_status reg1 asm ("1"); + unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */ + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; asm volatile( - ".long 0xb2af0000" /* PQAP(RAPQ) */ - : "=d" (reg1) - : "d" (reg0) - : "cc"); - return reg1; + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(RAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + return reg1.status; } /** * ap_pqap_zapq(): Reset and zeroize adjunct processor queue. * @qid: The AP queue number + * @fbit: if != 0 set F bit * * Returns AP queue status structure. */ -static inline struct ap_queue_status ap_zapq(ap_qid_t qid) +static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit) { - register unsigned long reg0 asm ("0") = qid | (2UL << 24); - register struct ap_queue_status reg1 asm ("1"); + unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */ + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; asm volatile( - ".long 0xb2af0000" /* PQAP(ZAPQ) */ - : "=d" (reg1) - : "d" (reg0) - : "cc"); - return reg1; + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(ZAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + return reg1.status; } /** @@ -154,15 +227,16 @@ struct ap_config_info { unsigned int apxa : 1; /* N bit */ unsigned int qact : 1; /* C bit */ unsigned int rc8a : 1; /* R bit */ - unsigned char _reserved1 : 4; - unsigned char _reserved2[3]; - unsigned char Na; /* max # of APs - 1 */ - unsigned char Nd; /* max # of Domains - 1 */ - unsigned char _reserved3[10]; + unsigned int : 4; + unsigned int apsb : 1; /* B bit */ + unsigned int : 23; + unsigned char na; /* max # of APs - 1 */ + unsigned char nd; /* max # of Domains - 1 */ + unsigned char _reserved0[10]; unsigned int apm[8]; /* AP ID mask */ unsigned int aqm[8]; /* AP (usage) queue mask */ unsigned int adm[8]; /* AP (control) domain mask */ - unsigned char _reserved4[16]; + unsigned char _reserved1[16]; } __aligned(8); /** @@ -172,18 +246,20 @@ struct ap_config_info { */ static inline int ap_qci(struct ap_config_info *config) { - register unsigned long reg0 asm ("0") = 4UL << 24; - register unsigned long reg1 asm ("1") = -EOPNOTSUPP; - register struct ap_config_info *reg2 asm ("2") = config; + unsigned long reg0 = 4UL << 24; /* fc 4UL is QCI */ + unsigned long reg1 = -EOPNOTSUPP; + struct ap_config_info *reg2 = config; asm volatile( - ".long 0xb2af0000\n" /* PQAP(QCI) */ - "0: la %0,0\n" + " lgr 0,%[reg0]\n" /* QCI fc into gr0 
*/ + " lgr 2,%[reg2]\n" /* ptr to config into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(QCI) */ + "0: la %[reg1],0\n" /* good case, QCI fc available */ "1:\n" EX_TABLE(0b, 1b) - : "+d" (reg1) - : "d" (reg0), "d" (reg2) - : "cc", "memory"); + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "memory", "0", "2"); return reg1; } @@ -194,47 +270,50 @@ static inline int ap_qci(struct ap_config_info *config) * parameter to the PQAP(AQIC) instruction. For details please * see the AR documentation. */ -struct ap_qirq_ctrl { - unsigned int _res1 : 8; - unsigned int zone : 8; /* zone info */ - unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ - unsigned int _res2 : 4; - unsigned int gisc : 3; /* guest isc field */ - unsigned int _res3 : 6; - unsigned int gf : 2; /* gisa format */ - unsigned int _res4 : 1; - unsigned int gisa : 27; /* gisa origin */ - unsigned int _res5 : 1; - unsigned int isc : 3; /* irq sub class */ +union ap_qirq_ctrl { + unsigned long value; + struct { + unsigned int : 8; + unsigned int zone : 8; /* zone info */ + unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ + unsigned int : 4; + unsigned int gisc : 3; /* guest isc field */ + unsigned int : 6; + unsigned int gf : 2; /* gisa format */ + unsigned int : 1; + unsigned int gisa : 27; /* gisa origin */ + unsigned int : 1; + unsigned int isc : 3; /* irq sub class */ + }; }; /** * ap_aqic(): Control interruption for a specific AP. * @qid: The AP queue number * @qirqctrl: struct ap_qirq_ctrl (64 bit value) - * @ind: The notification indicator byte + * @pa_ind: Physical address of the notification indicator byte * * Returns AP queue status. */ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, - struct ap_qirq_ctrl qirqctrl, - void *ind) + union ap_qirq_ctrl qirqctrl, + phys_addr_t pa_ind) { - register unsigned long reg0 asm ("0") = qid | (3UL << 24); - register union { - unsigned long value; - struct ap_qirq_ctrl qirqctrl; - struct ap_queue_status status; - } reg1 asm ("1"); - register void *reg2 asm ("2") = ind; + unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */ + union ap_queue_status_reg reg1; + unsigned long reg2 = pa_ind; - reg1.qirqctrl = qirqctrl; + reg1.value = qirqctrl.value; asm volatile( - ".long 0xb2af0000" /* PQAP(AQIC) */ - : "+d" (reg1) - : "d" (reg0), "d" (reg2) - : "cc"); + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* irq ctrl into gr1 */ + " lgr 2,%[reg2]\n" /* ni addr into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(AQIC) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "+&d" (reg1.value) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "memory", "0", "1", "2"); return reg1.status; } @@ -257,7 +336,7 @@ union ap_qact_ap_info { }; /** - * ap_qact(): Query AP combatibility type. + * ap_qact(): Query AP compatibility type. * @qid: The AP queue number * @apinfo: On input the info about the AP queue. 
On output the * alternate AP queue info provided by the qact function @@ -268,25 +347,78 @@ static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, union ap_qact_ap_info *apinfo) { - register unsigned long reg0 asm ("0") = qid | (5UL << 24) - | ((ifbit & 0x01) << 22); - register union { - unsigned long value; - struct ap_queue_status status; - } reg1 asm ("1"); - register unsigned long reg2 asm ("2"); + unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22); + union ap_queue_status_reg reg1; + unsigned long reg2; reg1.value = apinfo->val; asm volatile( - ".long 0xb2af0000" /* PQAP(QACT) */ - : "+d" (reg1), "=d" (reg2) - : "d" (reg0) - : "cc"); + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* qact in info into gr1 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(QACT) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* qact out info into reg2 */ + : [reg1] "+&d" (reg1.value), [reg2] "=&d" (reg2) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); apinfo->val = reg2; return reg1.status; } +/* + * ap_bapq(): SE bind AP queue. + * @qid: The AP queue number + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may cause a specification exception. + */ +static inline struct ap_queue_status ap_bapq(ap_qid_t qid) +{ + unsigned long reg0 = qid | (7UL << 24); /* fc 7 is BAPQ */ + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(BAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + + return reg1.status; +} + +/* + * ap_aapq(): SE associate AP queue. + * @qid: The AP queue number + * @sec_idx: The secret index + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may cause a specification exception. + */ +static inline struct ap_queue_status ap_aapq(ap_qid_t qid, unsigned int sec_idx) +{ + unsigned long reg0 = qid | (8UL << 24); /* fc 8 is AAPQ */ + unsigned long reg2 = sec_idx; + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " lgr 2,%[reg2]\n" /* secret index into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(AAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "0", "1", "2"); + + return reg1.status; +} + /** * ap_nqap(): Send message to adjunct processor queue. * @qid: The AP queue number @@ -303,28 +435,36 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, unsigned long long psmid, void *msg, size_t length) { - register unsigned long reg0 asm ("0") = qid | 0x40000000UL; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm ("2") = (unsigned long) msg; - register unsigned long reg3 asm ("3") = (unsigned long) length; - register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32); - register unsigned long reg5 asm ("5") = psmid & 0xffffffff; + unsigned long reg0 = qid | 0x40000000UL; /* 0x4... 
is last msg part */ + union register_pair nqap_r1, nqap_r2; + union ap_queue_status_reg reg1; + + nqap_r1.even = (unsigned int)(psmid >> 32); + nqap_r1.odd = psmid & 0xffffffff; + nqap_r2.even = (unsigned long)msg; + nqap_r2.odd = (unsigned long)length; asm volatile ( - "0: .long 0xb2ad0042\n" /* NQAP */ - " brc 2,0b" - : "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3) - : "d" (reg4), "d" (reg5) - : "cc", "memory"); - return reg1; + " lgr 0,%[reg0]\n" /* qid param in gr0 */ + "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n" + " brc 2,0b\n" /* handle partial completion */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), + [nqap_r2] "+&d" (nqap_r2.pair) + : [nqap_r1] "d" (nqap_r1.pair) + : "cc", "memory", "0", "1"); + return reg1.status; } /** * ap_dqap(): Receive message from adjunct processor queue. * @qid: The AP queue number * @psmid: Pointer to program supplied message identifier - * @msg: The message text - * @length: The message length + * @msg: Pointer to message buffer + * @msglen: Message buffer size + * @length: Pointer to length of actually written bytes + * @reslength: Residual length on return + * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content * * Returns AP queue status structure. * Condition code 1 on DQAP means the receive has taken place @@ -336,28 +476,72 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, * Note that gpr2 is used by the DQAP instruction to keep track of * any 'residual' length, in case the instruction gets interrupted. * Hence it gets zeroed before the instruction. + * If the message does not fit into the buffer, this function will + * return with a truncated message and the reply in the firmware queue + * is not removed. This is indicated to the caller with an + * ap_queue_status response_code value of all bits on (0xFF) and (if + * the reslength ptr is given) the remaining length is stored in + * *reslength and (if the resgr0 ptr is given) the updated gr0 value + * for further processing of this msg entry is stored in *resgr0. The + * caller needs to detect this situation and should invoke ap_dqap + * with a valid resgr0 ptr and a value in there != 0 to indicate that + * *resgr0 is to be used instead of qid to further process this entry. */ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, - unsigned long long *psmid, - void *msg, size_t length) + unsigned long *psmid, + void *msg, size_t msglen, + size_t *length, + size_t *reslength, + unsigned long *resgr0) { - register unsigned long reg0 asm("0") = qid | 0x80000000UL; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm("2") = 0UL; - register unsigned long reg4 asm("4") = (unsigned long) msg; - register unsigned long reg5 asm("5") = (unsigned long) length; - register unsigned long reg6 asm("6") = 0UL; - register unsigned long reg7 asm("7") = 0UL; + unsigned long reg0 = resgr0 && *resgr0 ? 
*resgr0 : qid | 0x80000000UL; + union ap_queue_status_reg reg1; + unsigned long reg2; + union register_pair rp1, rp2; + rp1.even = 0UL; + rp1.odd = 0UL; + rp2.even = (unsigned long)msg; + rp2.odd = (unsigned long)msglen; asm volatile( - "0: .long 0xb2ae0064\n" /* DQAP */ - " brc 6,0b\n" - : "+d" (reg0), "=d" (reg1), "+d" (reg2), - "+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7) - : : "cc", "memory"); - *psmid = (((unsigned long long) reg6) << 32) + reg7; - return reg1; + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lghi 2,0\n" /* 0 into gr2 (res length) */ + "0: ltgr %N[rp2],%N[rp2]\n" /* check buf len */ + " jz 2f\n" /* go out if buf len is 0 */ + "1: .insn rre,0xb2ae0000,%[rp1],%[rp2]\n" + " brc 6,0b\n" /* handle partial complete */ + "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), + [reg2] "=&d" (reg2), [rp1] "+&d" (rp1.pair), + [rp2] "+&d" (rp2.pair) + : + : "cc", "memory", "0", "1", "2"); + + if (reslength) + *reslength = reg2; + if (reg2 != 0 && rp2.odd == 0) { + /* + * Partially complete, status in gr1 is not set. + * Signal the caller that this dqap is only partially received + * with a special status response code 0xFF and *resgr0 updated + */ + reg1.status.response_code = 0xFF; + if (resgr0) + *resgr0 = reg0; + } else { + *psmid = (rp1.even << 32) + rp1.odd; + if (resgr0) + *resgr0 = 0; + } + + /* update *length with the nr of bytes stored into the msg buffer */ + if (length) + *length = msglen - rp2.odd; + + return reg1.status; } /* @@ -368,7 +552,7 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, #if IS_ENABLED(CONFIG_ZCRYPT) void ap_bus_cfg_chg(void); #else -static inline void ap_bus_cfg_chg(void){}; +static inline void ap_bus_cfg_chg(void){} #endif #endif /* _ASM_S390_AP_H_ */ diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h index c5bd9f4437e5..f2240392c708 100644 --- a/arch/s390/include/asm/appldata.h +++ b/arch/s390/include/asm/appldata.h @@ -8,8 +8,8 @@ #ifndef _ASM_S390_APPLDATA_H #define _ASM_S390_APPLDATA_H +#include <linux/io.h> #include <asm/diag.h> -#include <asm/io.h> #define APPLDATA_START_INTERVAL_REC 0x80 #define APPLDATA_STOP_REC 0x81 diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index c67b82dfa558..1594049893e0 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -2,7 +2,7 @@ /* * Kernel interface for the s390 arch_random_* functions * - * Copyright IBM Corp. 2017 + * Copyright IBM Corp. 
2017, 2022 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -11,53 +11,28 @@ #ifndef _ASM_S390_ARCHRANDOM_H #define _ASM_S390_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include <linux/static_key.h> +#include <linux/preempt.h> #include <linux/atomic.h> +#include <asm/cpacf.h> DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); extern atomic64_t s390_arch_random_counter; -bool s390_arch_random_generate(u8 *buf, unsigned int nbytes); - -static inline bool arch_has_random(void) -{ - return false; -} - -static inline bool arch_has_random_seed(void) -{ - if (static_branch_likely(&s390_arch_random_available)) - return true; - return false; -} - -static inline bool arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; -} - -static inline bool arch_get_random_int(unsigned int *v) -{ - return false; -} - -static inline bool arch_get_random_seed_long(unsigned long *v) -{ - if (static_branch_likely(&s390_arch_random_available)) { - return s390_arch_random_generate((u8 *)v, sizeof(*v)); - } - return false; + return 0; } -static inline bool arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - if (static_branch_likely(&s390_arch_random_available)) { - return s390_arch_random_generate((u8 *)v, sizeof(*v)); + if (static_branch_likely(&s390_arch_random_available) && + in_task()) { + cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v)); + atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter); + return max_longs; } - return false; + return 0; } -#endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h new file mode 100644 index 000000000000..11f615eb0066 --- /dev/null +++ b/arch/s390/include/asm/asm-const.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ASM_CONST_H +#define _ASM_S390_ASM_CONST_H + +#ifdef __ASSEMBLY__ +# define stringify_in_c(...) __VA_ARGS__ +#else +/* This version of stringify will deal with commas... */ +# define __stringify_in_c(...) #__VA_ARGS__ +# define stringify_in_c(...) 
__stringify_in_c(__VA_ARGS__) " " +#endif +#endif /* _ASM_S390_ASM_CONST_H */ diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h new file mode 100644 index 000000000000..4a6b0a8b6412 --- /dev/null +++ b/arch/s390/include/asm/asm-extable.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_EXTABLE_H +#define __ASM_EXTABLE_H + +#include <linux/stringify.h> +#include <linux/bits.h> +#include <asm/asm-const.h> + +#define EX_TYPE_NONE 0 +#define EX_TYPE_FIXUP 1 +#define EX_TYPE_BPF 2 +#define EX_TYPE_UA_STORE 3 +#define EX_TYPE_UA_LOAD_MEM 4 +#define EX_TYPE_UA_LOAD_REG 5 +#define EX_TYPE_UA_LOAD_REGPAIR 6 +#define EX_TYPE_ZEROPAD 7 + +#define EX_DATA_REG_ERR_SHIFT 0 +#define EX_DATA_REG_ERR GENMASK(3, 0) + +#define EX_DATA_REG_ADDR_SHIFT 4 +#define EX_DATA_REG_ADDR GENMASK(7, 4) + +#define EX_DATA_LEN_SHIFT 8 +#define EX_DATA_LEN GENMASK(11, 8) + +#define __EX_TABLE(_section, _fault, _target, _type, _regerr, _regaddr, _len) \ + stringify_in_c(.section _section,"a";) \ + stringify_in_c(.balign 4;) \ + stringify_in_c(.long (_fault) - .;) \ + stringify_in_c(.long (_target) - .;) \ + stringify_in_c(.short (_type);) \ + stringify_in_c(.macro extable_reg regerr, regaddr;) \ + stringify_in_c(.set .Lfound, 0;) \ + stringify_in_c(.set .Lcurr, 0;) \ + stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \ + stringify_in_c( .ifc "\regerr", "%%r\rs";) \ + stringify_in_c( .set .Lfound, 1;) \ + stringify_in_c( .set .Lregerr, .Lcurr;) \ + stringify_in_c( .endif;) \ + stringify_in_c( .set .Lcurr, .Lcurr+1;) \ + stringify_in_c(.endr;) \ + stringify_in_c(.ifne (.Lfound != 1);) \ + stringify_in_c( .error "extable_reg: bad register argument1";) \ + stringify_in_c(.endif;) \ + stringify_in_c(.set .Lfound, 0;) \ + stringify_in_c(.set .Lcurr, 0;) \ + stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \ + stringify_in_c( .ifc "\regaddr", "%%r\rs";) \ + stringify_in_c( .set .Lfound, 1;) \ + stringify_in_c( .set .Lregaddr, .Lcurr;) \ + stringify_in_c( .endif;) \ + stringify_in_c( .set .Lcurr, .Lcurr+1;) \ + stringify_in_c(.endr;) \ + stringify_in_c(.ifne (.Lfound != 1);) \ + stringify_in_c( .error "extable_reg: bad register argument2";) \ + stringify_in_c(.endif;) \ + stringify_in_c(.short .Lregerr << EX_DATA_REG_ERR_SHIFT | \ + .Lregaddr << EX_DATA_REG_ADDR_SHIFT | \ + _len << EX_DATA_LEN_SHIFT;) \ + stringify_in_c(.endm;) \ + stringify_in_c(extable_reg _regerr,_regaddr;) \ + stringify_in_c(.purgem extable_reg;) \ + stringify_in_c(.previous) + +#define EX_TABLE(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0) + +#define EX_TABLE_AMODE31(_fault, _target) \ + __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0) + +#define EX_TABLE_UA_STORE(_fault, _target, _regerr) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_STORE, _regerr, _regerr, 0) + +#define EX_TABLE_UA_LOAD_MEM(_fault, _target, _regerr, _regmem, _len) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_MEM, _regerr, _regmem, _len) + +#define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0) + +#define EX_TABLE_UA_LOAD_REGPAIR(_fault, _target, _regerr, _regzero) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REGPAIR, _regerr, _regzero, 0) + +#define EX_TABLE_ZEROPAD(_fault, _target, _regdata, _regaddr) \ + __EX_TABLE(__ex_table, _fault, _target, 
EX_TYPE_ZEROPAD, _regdata, _regaddr, 0) + +#endif /* __ASM_EXTABLE_H */ diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h index c37eb921bfbf..a873e873e1ee 100644 --- a/arch/s390/include/asm/asm-prototypes.h +++ b/arch/s390/include/asm/asm-prototypes.h @@ -6,4 +6,8 @@ #include <asm/fpu/api.h> #include <asm-generic/asm-prototypes.h> +__int128_t __ashlti3(__int128_t a, int b); +__int128_t __ashrti3(__int128_t a, int b); +__int128_t __lshrti3(__int128_t a, int b); + #endif /* _ASM_S390_PROTOTYPES_H */ diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 491ad53a0d4e..7138d189cc42 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,56 +15,46 @@ #include <asm/barrier.h> #include <asm/cmpxchg.h> -#define ATOMIC_INIT(i) { (i) } - -static inline int atomic_read(const atomic_t *v) +static inline int arch_atomic_read(const atomic_t *v) { - int c; - - asm volatile( - " l %0,%1\n" - : "=d" (c) : "Q" (v->counter)); - return c; + return __atomic_read(v); } +#define arch_atomic_read arch_atomic_read -static inline void atomic_set(atomic_t *v, int i) +static inline void arch_atomic_set(atomic_t *v, int i) { - asm volatile( - " st %1,%0\n" - : "=Q" (v->counter) : "d" (i)); + __atomic_set(v, i); } +#define arch_atomic_set arch_atomic_set -static inline int atomic_add_return(int i, atomic_t *v) +static inline int arch_atomic_add_return(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter) + i; } +#define arch_atomic_add_return arch_atomic_add_return -static inline int atomic_fetch_add(int i, atomic_t *v) +static inline int arch_atomic_fetch_add(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter); } +#define arch_atomic_fetch_add arch_atomic_fetch_add -static inline void atomic_add(int i, atomic_t *v) +static inline void arch_atomic_add(int i, atomic_t *v) { -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - __atomic_add_const(i, &v->counter); - return; - } -#endif __atomic_add(i, &v->counter); } +#define arch_atomic_add arch_atomic_add -#define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v) -#define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v) -#define atomic_fetch_sub(_i, _v) atomic_fetch_add(-(int)(_i), _v) +#define arch_atomic_sub(_i, _v) arch_atomic_add(-(int)(_i), _v) +#define arch_atomic_sub_return(_i, _v) arch_atomic_add_return(-(int)(_i), _v) +#define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v) #define ATOMIC_OPS(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ +static inline void arch_atomic_##op(int i, atomic_t *v) \ { \ __atomic_##op(i, &v->counter); \ } \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ +static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ { \ return __atomic_##op##_barrier(i, &v->counter); \ } @@ -75,66 +65,67 @@ ATOMIC_OPS(xor) #undef ATOMIC_OPS -#define atomic_xchg(v, new) (xchg(&((v)->counter), new)) +#define arch_atomic_and arch_atomic_and +#define arch_atomic_or arch_atomic_or +#define arch_atomic_xor arch_atomic_xor +#define arch_atomic_fetch_and arch_atomic_fetch_and +#define arch_atomic_fetch_or arch_atomic_fetch_or +#define arch_atomic_fetch_xor arch_atomic_fetch_xor + +#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return __atomic_cmpxchg(&v->counter, old, new); } 
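The arch_atomic_cmpxchg() conversion above compiles down to a single CS (compare and swap) on the counter word; the retry loops live in generic code layered on top of these arch_ primitives. A minimal sketch of that usual pattern, using the generic atomic_read()/atomic_cmpxchg() wrappers from <linux/atomic.h> — the helper name atomic_set_max is illustrative, not part of this patch:

/* assumes <linux/atomic.h>; sketch only, not in-tree code */
static inline void atomic_set_max(atomic_t *max, int new)
{
	int old = atomic_read(max);

	while (old < new) {
		int prev = atomic_cmpxchg(max, old, new);

		if (prev == old)
			break;		/* our CS stored the new maximum */
		old = prev;		/* raced with another CPU, retry */
	}
}

Each iteration costs one cs; the loop ends as soon as the CS succeeds or the observed value is already large enough.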
+#define arch_atomic_cmpxchg arch_atomic_cmpxchg #define ATOMIC64_INIT(i) { (i) } -static inline s64 atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { - s64 c; - - asm volatile( - " lg %0,%1\n" - : "=d" (c) : "Q" (v->counter)); - return c; + return __atomic64_read(v); } +#define arch_atomic64_read arch_atomic64_read -static inline void atomic64_set(atomic64_t *v, s64 i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { - asm volatile( - " stg %1,%0\n" - : "=Q" (v->counter) : "d" (i)); + __atomic64_set(v, i); } +#define arch_atomic64_set arch_atomic64_set -static inline s64 atomic64_add_return(s64 i, atomic64_t *v) +static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter) + i; } +#define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter); } +#define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline void atomic64_add(s64 i, atomic64_t *v) +static inline void arch_atomic64_add(s64 i, atomic64_t *v) { -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - __atomic64_add_const(i, (long *)&v->counter); - return; - } -#endif __atomic64_add(i, (long *)&v->counter); } +#define arch_atomic64_add arch_atomic64_add -#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) +#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return __atomic64_cmpxchg((long *)&v->counter, old, new); } +#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg #define ATOMIC64_OPS(op) \ -static inline void atomic64_##op(s64 i, atomic64_t *v) \ +static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ { \ __atomic64_##op(i, (long *)&v->counter); \ } \ -static inline long atomic64_fetch_##op(s64 i, atomic64_t *v) \ +static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ { \ return __atomic64_##op##_barrier(i, (long *)&v->counter); \ } @@ -145,8 +136,15 @@ ATOMIC64_OPS(xor) #undef ATOMIC64_OPS -#define atomic64_sub_return(_i, _v) atomic64_add_return(-(s64)(_i), _v) -#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(s64)(_i), _v) -#define atomic64_sub(_i, _v) atomic64_add(-(s64)(_i), _v) +#define arch_atomic64_and arch_atomic64_and +#define arch_atomic64_or arch_atomic64_or +#define arch_atomic64_xor arch_atomic64_xor +#define arch_atomic64_fetch_and arch_atomic64_fetch_and +#define arch_atomic64_fetch_or arch_atomic64_fetch_or +#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor + +#define arch_atomic64_sub_return(_i, _v) arch_atomic64_add_return(-(s64)(_i), _v) +#define arch_atomic64_fetch_sub(_i, _v) arch_atomic64_fetch_add(-(s64)(_i), _v) +#define arch_atomic64_sub(_i, _v) arch_atomic64_add(-(s64)(_i), _v) #endif /* __ARCH_S390_ATOMIC__ */ diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index 61467b9eecc7..50510e08b893 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -8,6 +8,40 @@ #ifndef __ARCH_S390_ATOMIC_OPS__ #define __ARCH_S390_ATOMIC_OPS__ +static inline int __atomic_read(const atomic_t *v) +{ + int c; + + asm volatile( + " l %0,%1\n" + : "=d" (c) : "R" (v->counter)); + return c; +} + +static inline 
void __atomic_set(atomic_t *v, int i) +{ + asm volatile( + " st %1,%0\n" + : "=R" (v->counter) : "d" (i)); +} + +static inline s64 __atomic64_read(const atomic64_t *v) +{ + s64 c; + + asm volatile( + " lg %0,%1\n" + : "=d" (c) : "RT" (v->counter)); + return c; +} + +static inline void __atomic64_set(atomic64_t *v, s64 i) +{ + asm volatile( + " stg %1,%0\n" + : "=RT" (v->counter) : "d" (i)); +} + #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES #define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ @@ -18,7 +52,7 @@ static inline op_type op_name(op_type val, op_type *ptr) \ asm volatile( \ op_string " %[old],%[val],%[ptr]\n" \ op_barrier \ - : [old] "=d" (old), [ptr] "+Q" (*ptr) \ + : [old] "=d" (old), [ptr] "+QS" (*ptr) \ : [val] "d" (val) : "cc", "memory"); \ return old; \ } \ @@ -46,7 +80,7 @@ static __always_inline void op_name(op_type val, op_type *ptr) \ asm volatile( \ op_string " %[ptr],%[val]\n" \ op_barrier \ - : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc", "memory");\ + : [ptr] "+QS" (*ptr) : [val] "i" (val) : "cc", "memory");\ } #define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \ @@ -97,7 +131,7 @@ static inline long op_name(long val, long *ptr) \ op_string " %[new],%[val]\n" \ " csg %[old],%[new],%[ptr]\n" \ " jl 0b" \ - : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+QS" (*ptr)\ : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ return old; \ } @@ -122,22 +156,46 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") static inline int __atomic_cmpxchg(int *ptr, int old, int new) { - return __sync_val_compare_and_swap(ptr, old, new); + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old; } -static inline int __atomic_cmpxchg_bool(int *ptr, int old, int new) +static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) { - return __sync_bool_compare_and_swap(ptr, old, new); + int old_expected = old; + + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old == old_expected; } static inline long __atomic64_cmpxchg(long *ptr, long old, long new) { - return __sync_val_compare_and_swap(ptr, old, new); + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old; } -static inline long __atomic64_cmpxchg_bool(long *ptr, long old, long new) +static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) { - return __sync_bool_compare_and_swap(ptr, old, new); + long old_expected = old; + + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old == old_expected; } #endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index f9eddbca79d2..82de2a7c4160 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -16,20 +16,24 @@ #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES /* Fast-BCR without checkpoint synchronization */ -#define __ASM_BARRIER "bcr 14,0\n" +#define __ASM_BCR_SERIALIZE "bcr 14,0\n" #else -#define __ASM_BARRIER "bcr 15,0\n" +#define __ASM_BCR_SERIALIZE "bcr 15,0\n" #endif -#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0) +static __always_inline void bcr_serialize(void) +{ + asm volatile(__ASM_BCR_SERIALIZE : : : "memory"); +} -#define rmb() barrier() -#define wmb() barrier() 
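The barrier.h rework in progress here keeps smp_rmb()/smp_wmb() as pure compiler barriers — s390 is strongly ordered — while mb() and the dma_* flavours still emit the serializing BCR. A short sketch of the message-passing pairing these hooks serve, assuming the usual WRITE_ONCE()/READ_ONCE()/cpu_relax() kernel helpers; producer/consumer are illustrative names:

/* sketch only; assumes <linux/compiler.h> and <asm/barrier.h> */
static int payload;
static int ready;

static void producer(void)
{
	payload = 42;
	smp_wmb();		/* compiler barrier only on s390 */
	WRITE_ONCE(ready, 1);
}

static int consumer(void)
{
	while (!READ_ONCE(ready))
		cpu_relax();
	smp_rmb();		/* pairs with the smp_wmb() above */
	return payload;		/* guaranteed to observe 42 */
}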
-#define dma_rmb() mb() -#define dma_wmb() mb() -#define __smp_mb() mb() -#define __smp_rmb() rmb() -#define __smp_wmb() wmb() +#define __mb() bcr_serialize() +#define __rmb() barrier() +#define __wmb() barrier() +#define __dma_rmb() __mb() +#define __dma_wmb() __mb() +#define __smp_mb() __mb() +#define __smp_rmb() __rmb() +#define __smp_wmb() __wmb() #define __smp_store_release(p, v) \ do { \ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 431e208a5ea4..c467dffa8c12 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -42,7 +42,7 @@ #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) static inline unsigned long * -__bitops_word(unsigned long nr, volatile unsigned long *ptr) +__bitops_word(unsigned long nr, const volatile unsigned long *ptr) { unsigned long addr; @@ -50,73 +50,33 @@ __bitops_word(unsigned long nr, volatile unsigned long *ptr) return (unsigned long *)addr; } -static inline unsigned char * -__bitops_byte(unsigned long nr, volatile unsigned long *ptr) +static inline unsigned long __bitops_mask(unsigned long nr) { - return ((unsigned char *)ptr) + ((nr ^ (BITS_PER_LONG - 8)) >> 3); + return 1UL << (nr & (BITS_PER_LONG - 1)); } static __always_inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask; + unsigned long mask = __bitops_mask(nr); -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - if (__builtin_constant_p(nr)) { - unsigned char *caddr = __bitops_byte(nr, ptr); - - asm volatile( - "oi %0,%b1\n" - : "+Q" (*caddr) - : "i" (1 << (nr & 7)) - : "cc", "memory"); - return; - } -#endif - mask = 1UL << (nr & (BITS_PER_LONG - 1)); __atomic64_or(mask, (long *)addr); } static __always_inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask; + unsigned long mask = __bitops_mask(nr); -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - if (__builtin_constant_p(nr)) { - unsigned char *caddr = __bitops_byte(nr, ptr); - - asm volatile( - "ni %0,%b1\n" - : "+Q" (*caddr) - : "i" (~(1 << (nr & 7))) - : "cc", "memory"); - return; - } -#endif - mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __atomic64_and(mask, (long *)addr); + __atomic64_and(~mask, (long *)addr); } static __always_inline void arch_change_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long mask; - -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - if (__builtin_constant_p(nr)) { - unsigned char *caddr = __bitops_byte(nr, ptr); + unsigned long mask = __bitops_mask(nr); - asm volatile( - "xi %0,%b1\n" - : "+Q" (*caddr) - : "i" (1 << (nr & 7)) - : "cc", "memory"); - return; - } -#endif - mask = 1UL << (nr & (BITS_PER_LONG - 1)); __atomic64_xor(mask, (long *)addr); } @@ -124,106 +84,106 @@ static inline bool arch_test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long old, mask; + unsigned long mask = __bitops_mask(nr); + unsigned long old; - mask = 1UL << (nr & (BITS_PER_LONG - 1)); old = __atomic64_or_barrier(mask, (long *)addr); - return (old & mask) != 0; + return old & mask; } static inline bool arch_test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long old, mask; + unsigned long mask = __bitops_mask(nr); + unsigned long old; - mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = 
__atomic64_and_barrier(mask, (long *)addr); - return (old & ~mask) != 0; + old = __atomic64_and_barrier(~mask, (long *)addr); + return old & mask; } static inline bool arch_test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); - unsigned long old, mask; + unsigned long mask = __bitops_mask(nr); + unsigned long old; - mask = 1UL << (nr & (BITS_PER_LONG - 1)); old = __atomic64_xor_barrier(mask, (long *)addr); - return (old & mask) != 0; + return old & mask; } -static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr) +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); - *addr |= 1 << (nr & 7); + *p |= mask; } -static inline void arch___clear_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); - *addr &= ~(1 << (nr & 7)); + *p &= ~mask; } -static inline void arch___change_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); - *addr ^= 1 << (nr & 7); + *p ^= mask; } -static inline bool arch___test_and_set_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); - unsigned char ch; + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; - ch = *addr; - *addr |= 1 << (nr & 7); - return (ch >> (nr & 7)) & 1; + old = *p; + *p |= mask; + return old & mask; } -static inline bool arch___test_and_clear_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); - unsigned char ch; + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; - ch = *addr; - *addr &= ~(1 << (nr & 7)); - return (ch >> (nr & 7)) & 1; + old = *p; + *p &= ~mask; + return old & mask; } -static inline bool arch___test_and_change_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned char *addr = __bitops_byte(nr, ptr); - unsigned char ch; + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; - ch = *addr; - *addr ^= 1 << (nr & 7); - return (ch >> (nr & 7)) & 1; + old = *p; + *p ^= mask; + return old & mask; } -static inline bool arch_test_bit(unsigned long nr, - const volatile unsigned long *ptr) -{ - const volatile unsigned char *addr; - - addr = ((const volatile unsigned char *)ptr); - addr += (nr ^ (BITS_PER_LONG - 8)) >> 3; - return (*addr >> (nr & 7)) & 1; -} +#define arch_test_bit generic_test_bit +#define arch_test_bit_acquire generic_test_bit_acquire static inline bool arch_test_and_set_bit_lock(unsigned long nr, volatile unsigned long *ptr) { 
if (arch_test_bit(nr, ptr)) - return 1; + return true; return arch_test_and_set_bit(nr, ptr); } @@ -241,6 +201,16 @@ static inline void arch___clear_bit_unlock(unsigned long nr, arch___clear_bit(nr, ptr); } +static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *ptr) +{ + unsigned long old; + + old = __atomic64_xor_barrier(mask, (long *)ptr); + return old & BIT(7); +} +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte + #include <asm-generic/bitops/instrumented-atomic.h> #include <asm-generic/bitops/instrumented-non-atomic.h> #include <asm-generic/bitops/instrumented-lock.h> @@ -291,8 +261,6 @@ static inline bool test_bit_inv(unsigned long nr, return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); } -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - /** * __flogr - find leftmost one * @word - The word to search @@ -334,13 +302,13 @@ static inline unsigned char __flogr(unsigned long word) } return bit; } else { - register unsigned long bit asm("4") = word; - register unsigned long out asm("5"); + union register_pair rp; + rp.even = word; asm volatile( - " flogr %[bit],%[bit]\n" - : [bit] "+d" (bit), [out] "=d" (out) : : "cc"); - return bit; + " flogr %[rp],%[rp]\n" + : [rp] "+d" (rp.pair) : : "cc"); + return rp.even; } } @@ -411,18 +379,7 @@ static inline int fls(unsigned int word) return fls64(word); } -#else /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ - -#include <asm-generic/bitops/__ffs.h> -#include <asm-generic/bitops/ffs.h> -#include <asm-generic/bitops/__fls.h> -#include <asm-generic/bitops/fls.h> -#include <asm-generic/bitops/fls64.h> - -#endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ - #include <asm-generic/bitops/ffz.h> -#include <asm-generic/bitops/find.h> #include <asm-generic/bitops/hweight.h> #include <asm-generic/bitops/sched.h> #include <asm-generic/bitops/le.h> diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index a2b11ac00f60..aebe1e22c7be 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -2,7 +2,7 @@ #ifndef _ASM_S390_BUG_H #define _ASM_S390_BUG_H -#include <linux/kernel.h> +#include <linux/compiler.h> #ifdef CONFIG_BUG @@ -10,15 +10,15 @@ #define __EMIT_BUG(x) do { \ asm_inline volatile( \ - "0: j 0b+2\n" \ - "1:\n" \ + "0: mc 0,0\n" \ ".section .rodata.str,\"aMS\",@progbits,1\n" \ - "2: .asciz \""__FILE__"\"\n" \ + "1: .asciz \""__FILE__"\"\n" \ ".previous\n" \ ".section __bug_table,\"awM\",@progbits,%2\n" \ - "3: .long 1b-3b,2b-3b\n" \ + "2: .long 0b-.\n" \ + " .long 1b-.\n" \ " .short %0,%1\n" \ - " .org 3b+%2\n" \ + " .org 2b+%2\n" \ ".previous\n" \ : : "i" (__LINE__), \ "i" (x), \ @@ -29,12 +29,11 @@ #define __EMIT_BUG(x) do { \ asm_inline volatile( \ - "0: j 0b+2\n" \ - "1:\n" \ + "0: mc 0,0\n" \ ".section __bug_table,\"awM\",@progbits,%1\n" \ - "2: .long 1b-2b\n" \ + "1: .long 0b-.\n" \ " .short %0\n" \ - " .org 2b+%1\n" \ + " .org 1b+%1\n" \ ".previous\n" \ : : "i" (x), \ "i" (sizeof(struct bug_entry))); \ diff --git a/arch/s390/include/asm/bugs.h b/arch/s390/include/asm/bugs.h deleted file mode 100644 index aa42a179be33..000000000000 --- a/arch/s390/include/asm/bugs.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * Copyright IBM Corp. 1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * Derived from "include/asm-i386/bugs.h" - * Copyright (C) 1994 Linus Torvalds - */ - -/* - * This is included by init/main.c to check for architecture-dependent bugs. 
- * - * Needs: - * void check_bugs(void); - */ - -static inline void check_bugs(void) -{ - /* s390 has no bugs ... */ -} diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h index d5e22e837416..00128174c025 100644 --- a/arch/s390/include/asm/cache.h +++ b/arch/s390/include/asm/cache.h @@ -14,6 +14,6 @@ #define L1_CACHE_SHIFT 8 #define NET_SKB_PAD 32 -#define __read_mostly __section(.data..read_mostly) +#define __read_mostly __section(".data..read_mostly") #endif diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index 865ce1cb86d5..91d261751d25 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -11,9 +11,11 @@ #include <linux/device.h> #include <linux/mod_devicetable.h> +#include <asm/chsc.h> #include <asm/fcx.h> #include <asm/irq.h> #include <asm/schid.h> +#include <linux/mutex.h> /* structs from asm/cio.h */ struct irb; @@ -86,6 +88,7 @@ struct ccw_device { spinlock_t *ccwlock; /* private: */ struct ccw_device_private *private; /* cio private information */ + struct mutex reg_mutex; /* public: */ struct ccw_device_id id; struct ccw_driver *drv; @@ -103,6 +106,8 @@ struct ccw_device { was successfully verified. */ #define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had to be established again. */ +#define PE_PATH_FCES_EVENT 0x8 /* The FCES Status of a path has + * changed. */ /* * Possible CIO actions triggered by the unit check handler. @@ -114,7 +119,7 @@ enum uc_todo { }; /** - * struct ccw driver - device driver for channel attached devices + * struct ccw_driver - device driver for channel attached devices * @ids: ids supported by this driver * @probe: function called on probe * @remove: function called on remove @@ -123,11 +128,6 @@ enum uc_todo { * @notify: notify driver of device state changes * @path_event: notify driver of channel path events * @shutdown: called at device shutdown - * @prepare: prepare for pm state transition - * @complete: undo work done in @prepare - * @freeze: callback for freezing during hibernation snapshotting - * @thaw: undo work done in @freeze - * @restore: callback for restoring after hibernation * @uc_handler: callback for unit check handler * @driver: embedded device driver structure * @int_class: interruption class to use for accounting interrupts @@ -141,11 +141,6 @@ struct ccw_driver { int (*notify) (struct ccw_device *, int); void (*path_event) (struct ccw_device *, int *); void (*shutdown) (struct ccw_device *); - int (*prepare) (struct ccw_device *); - void (*complete) (struct ccw_device *); - int (*freeze)(struct ccw_device *); - int (*thaw) (struct ccw_device *); - int (*restore)(struct ccw_device *); enum uc_todo (*uc_handler) (struct ccw_device *, struct irb *); struct device_driver driver; enum interruption_class int_class; @@ -159,9 +154,6 @@ extern struct ccw_device *get_ccwdev_by_busid(struct ccw_driver *cdrv, * when new devices for its type pop up */ extern int ccw_driver_register (struct ccw_driver *driver); extern void ccw_driver_unregister (struct ccw_driver *driver); - -struct ccw1; - extern int ccw_device_set_options_mask(struct ccw_device *, unsigned long); extern int ccw_device_set_options(struct ccw_device *, unsigned long); extern void ccw_device_clear_options(struct ccw_device *, unsigned long); @@ -224,7 +216,6 @@ extern struct ccw_device *ccw_device_create_console(struct ccw_driver *); extern void ccw_device_destroy_console(struct ccw_device *); extern int ccw_device_enable_console(struct ccw_device *); extern void 
ccw_device_wait_idle(struct ccw_device *); -extern int ccw_device_force_console(struct ccw_device *); extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size); extern void ccw_device_dma_free(struct ccw_device *cdev, @@ -236,4 +227,11 @@ extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); struct channel_path_desc_fmt0 *ccw_device_get_chp_desc(struct ccw_device *, int); u8 *ccw_device_get_util_str(struct ccw_device *cdev, int chp_idx); +int ccw_device_pnso(struct ccw_device *cdev, + struct chsc_pnso_area *pnso_area, u8 oc, + struct chsc_pnso_resume_token resume_token, int cnc); +int ccw_device_get_cssid(struct ccw_device *cdev, u8 *cssid); +int ccw_device_get_iid(struct ccw_device *cdev, u8 *iid); +int ccw_device_get_chpid(struct ccw_device *cdev, int chp_idx, u8 *chpid); +int ccw_device_get_chid(struct ccw_device *cdev, int chp_idx, u16 *chid); #endif /* _S390_CCWDEV_H_ */ diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h index 7293c139dd79..11d2fb3de4f5 100644 --- a/arch/s390/include/asm/ccwgroup.h +++ b/arch/s390/include/asm/ccwgroup.h @@ -11,8 +11,7 @@ struct ccw_driver; * @count: number of attached slave devices * @dev: embedded device structure * @cdev: variable number of slave devices, allocated as needed - * @ungroup_work: work to be done when a ccwgroup notifier has action - * type %BUS_NOTIFY_UNBIND_DRIVER + * @ungroup_work: used to ungroup the ccwgroup device */ struct ccwgroup_device { enum { @@ -26,7 +25,7 @@ struct ccwgroup_device { unsigned int count; struct device dev; struct work_struct ungroup_work; - struct ccw_device *cdev[0]; + struct ccw_device *cdev[]; }; /** @@ -36,11 +35,6 @@ struct ccwgroup_device { * @set_online: function called when device is set online * @set_offline: function called when device is set offline * @shutdown: function called when device is shut down - * @prepare: prepare for pm state transition - * @complete: undo work done in @prepare - * @freeze: callback for freezing during hibernation snapshotting - * @thaw: undo work done in @freeze - * @restore: callback for restoring after hibernation * @driver: embedded driver structure * @ccw_driver: supported ccw_driver (optional) */ @@ -50,11 +44,6 @@ struct ccwgroup_driver { int (*set_online) (struct ccwgroup_device *); int (*set_offline) (struct ccwgroup_device *); void (*shutdown)(struct ccwgroup_device *); - int (*prepare) (struct ccwgroup_device *); - void (*complete) (struct ccwgroup_device *); - int (*freeze)(struct ccwgroup_device *); - int (*thaw) (struct ccwgroup_device *); - int (*restore)(struct ccwgroup_device *); struct device_driver driver; struct ccw_driver *ccw_driver; @@ -64,11 +53,9 @@ extern int ccwgroup_driver_register (struct ccwgroup_driver *cdriver); extern void ccwgroup_driver_unregister (struct ccwgroup_driver *cdriver); int ccwgroup_create_dev(struct device *root, struct ccwgroup_driver *gdrv, int num_devices, const char *buf); -struct ccwgroup_device *get_ccwgroupdev_by_busid(struct ccwgroup_driver *gdrv, - char *bus_id); extern int ccwgroup_set_online(struct ccwgroup_device *gdev); -extern int ccwgroup_set_offline(struct ccwgroup_device *gdev); +int ccwgroup_set_offline(struct ccwgroup_device *gdev, bool call_gdrv); extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev); extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev); diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index 91e376b0d28c..69837eec2ff5 100644 --- a/arch/s390/include/asm/checksum.h +++ 
b/arch/s390/include/asm/checksum.h @@ -12,128 +12,122 @@ #ifndef _S390_CHECKSUM_H #define _S390_CHECKSUM_H -#include <linux/uaccess.h> +#include <linux/kasan-checks.h> +#include <linux/in6.h> /* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic + * Returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic. * - * this function must be called with even lengths, except - * for the last fragment, which may be odd + * This function must be called with even lengths, except + * for the last fragment, which may be odd. * - * it's best to have buff aligned on a 32-bit boundary + * It's best to have buff aligned on a 32-bit boundary. */ -static inline __wsum -csum_partial(const void *buff, int len, __wsum sum) +static inline __wsum csum_partial(const void *buff, int len, __wsum sum) { - register unsigned long reg2 asm("2") = (unsigned long) buff; - register unsigned long reg3 asm("3") = (unsigned long) len; + union register_pair rp = { + .even = (unsigned long) buff, + .odd = (unsigned long) len, + }; + kasan_check_read(buff, len); asm volatile( - "0: cksm %0,%1\n" /* do checksum on longs */ + "0: cksm %[sum],%[rp]\n" " jo 0b\n" - : "+d" (sum), "+d" (reg2), "+d" (reg3) : : "cc", "memory"); + : [sum] "+&d" (sum), [rp] "+&d" (rp.pair) : : "cc", "memory"); return sum; } /* - * the same as csum_partial_copy, but copies from user space. - * - * here even more important to align src and dst on a 32-bit (or even - * better 64-bit) boundary - * - * Copy from userspace and compute checksum. - */ -static inline __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, - int *err_ptr) -{ - if (unlikely(copy_from_user(dst, src, len))) - *err_ptr = -EFAULT; - return csum_partial(dst, len, sum); -} - - -static inline __wsum -csum_partial_copy_nocheck (const void *src, void *dst, int len, __wsum sum) -{ - memcpy(dst,src,len); - return csum_partial(dst, len, sum); -} - -/* - * Fold a partial checksum without adding pseudo headers + * Fold a partial checksum without adding pseudo headers. */ static inline __sum16 csum_fold(__wsum sum) { u32 csum = (__force u32) sum; - csum += (csum >> 16) + (csum << 16); + csum += (csum >> 16) | (csum << 16); csum >>= 16; return (__force __sum16) ~csum; } /* - * This is a version of ip_compute_csum() optimized for IP headers, - * which always checksum on 4 octet boundaries. - * + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksums on 4 octet boundaries. */ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { - return csum_fold(csum_partial(iph, ihl*4, 0)); + __u64 csum = 0; + __u32 *ptr = (u32 *)iph; + + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + ihl -= 4; + while (ihl--) + csum += *ptr++; + csum += (csum >> 32) | (csum << 32); + return csum_fold((__force __wsum)(csum >> 32)); } /* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 32-bit checksum + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 32-bit checksum. 
*/ -static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, - __wsum sum) +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { - __u32 csum = (__force __u32)sum; + __u64 csum = (__force __u64)sum; csum += (__force __u32)saddr; - if (csum < (__force __u32)saddr) - csum++; - csum += (__force __u32)daddr; - if (csum < (__force __u32)daddr) - csum++; - - csum += len + proto; - if (csum < len + proto) - csum++; - - return (__force __wsum)csum; + csum += len; + csum += proto; + csum += (csum >> 32) | (csum << 32); + return (__force __wsum)(csum >> 32); } /* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 16-bit checksum, already complemented + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 16-bit checksum, already complemented. */ - -static inline __sum16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, - __wsum sum) +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { - return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } /* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c + * Used for miscellaneous IP-like checksums, mainly icmp. */ - static inline __sum16 ip_compute_csum(const void *buff, int len) { return csum_fold(csum_partial(buff, len, 0)); } -#endif /* _S390_CHECKSUM_H */ - +#define _HAVE_ARCH_IPV6_CSUM +static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum csum) +{ + __u64 sum = (__force __u64)csum; + + sum += (__force __u32)saddr->s6_addr32[0]; + sum += (__force __u32)saddr->s6_addr32[1]; + sum += (__force __u32)saddr->s6_addr32[2]; + sum += (__force __u32)saddr->s6_addr32[3]; + sum += (__force __u32)daddr->s6_addr32[0]; + sum += (__force __u32)daddr->s6_addr32[1]; + sum += (__force __u32)daddr->s6_addr32[2]; + sum += (__force __u32)daddr->s6_addr32[3]; + sum += len; + sum += proto; + sum += (sum >> 32) | (sum << 32); + return csum_fold((__force __wsum)(sum >> 32)); +} +#endif /* _S390_CHECKSUM_H */ diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h new file mode 100644 index 000000000000..bb48ea380c0d --- /dev/null +++ b/arch/s390/include/asm/chsc.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
2020 + * + * Author(s): Alexandra Winter <wintera@linux.ibm.com> + * + * Interface for Channel Subsystem Call + */ +#ifndef _ASM_S390_CHSC_H +#define _ASM_S390_CHSC_H + +#include <uapi/asm/chsc.h> + +/** + * Operation codes for CHSC PNSO: + * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport + * PNSO_OC_NET_ADDR_INFO - all addresses + */ +#define PNSO_OC_NET_BRIDGE_INFO 0 +#define PNSO_OC_NET_ADDR_INFO 3 +/** + * struct chsc_pnso_naid_l2 - network address information descriptor + * @nit: Network interface token + * @addr_lnid: network address and logical network id (VLAN ID) + */ +struct chsc_pnso_naid_l2 { + u64 nit; + struct { u8 mac[6]; u16 lnid; } addr_lnid; +} __packed; + +struct chsc_pnso_resume_token { + u64 t1; + u64 t2; +} __packed; + +struct chsc_pnso_naihdr { + struct chsc_pnso_resume_token resume_token; + u32:32; + u32 instance; + u32:24; + u8 naids; + u32 reserved[3]; +} __packed; + +struct chsc_pnso_area { + struct chsc_header request; + u8:2; + u8 m:1; + u8:5; + u8:2; + u8 ssid:2; + u8 fmt:4; + u16 sch; + u8:8; + u8 cssid; + u16:16; + u8 oc; + u32:24; + struct chsc_pnso_resume_token resume_token; + u32 n:1; + u32:31; + u32 reserved[3]; + struct chsc_header response; + u32:32; + struct chsc_pnso_naihdr naihdr; + struct chsc_pnso_naid_l2 entries[]; +} __packed __aligned(PAGE_SIZE); + +#endif /* _ASM_S390_CHSC_H */ diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index b5bfb3123cb1..1c4f585dd39b 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -5,10 +5,10 @@ #ifndef _ASM_S390_CIO_H_ #define _ASM_S390_CIO_H_ -#include <linux/spinlock.h> #include <linux/bitops.h> #include <linux/genalloc.h> #include <asm/types.h> +#include <asm/tpi.h> #define LPM_ANYPATH 0xff #define __MAX_CSSID 0 @@ -329,7 +329,7 @@ struct ccw_dev_id { }; /** - * ccw_device_id_is_equal() - compare two ccw_dev_ids + * ccw_dev_id_is_equal() - compare two ccw_dev_ids * @dev_id1: a ccw_dev_id * @dev_id2: another ccw_dev_id * Returns: @@ -356,7 +356,6 @@ static inline u8 pathmask_to_pos(u8 mask) return 8 - ffs(mask); } -void channel_subsystem_reinit(void); extern void css_schedule_reprobe(void); extern void *cio_dma_zalloc(size_t size); @@ -370,8 +369,10 @@ void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev); struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); /* Function from drivers/s390/cio/chsc.c */ -int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); +int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); +int chsc_stzi(void *page, void *result, size_t size); int chsc_sgib(u32 origin); +int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid); #endif diff --git a/arch/s390/include/asm/clocksource.h b/arch/s390/include/asm/clocksource.h new file mode 100644 index 000000000000..03434369fce4 --- /dev/null +++ b/arch/s390/include/asm/clocksource.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* s390-specific clocksource additions */ + +#ifndef _ASM_S390_CLOCKSOURCE_H +#define _ASM_S390_CLOCKSOURCE_H + +#endif /* _ASM_S390_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/clp.h b/arch/s390/include/asm/clp.h index 3925b0f085b7..10919eeb7533 100644 --- a/arch/s390/include/asm/clp.h +++ b/arch/s390/include/asm/clp.h @@ -5,6 +5,9 @@ /* CLP common request & response block size */ #define CLP_BLK_SIZE PAGE_SIZE +/* Call Logical Processor - Command Code */ +#define CLP_SLPC 0x0001 + #define CLP_LPS_BASE 0 
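The cmpxchg.h rewrite just below drops the __sync builtins in favour of open-coded CS/CSG loops, and emulates 1- and 2-byte exchanges on the containing aligned 32-bit word: derive the big-endian byte lane from the low address bits, mask that lane out of the old word, merge the new value in, and retry a word-sized compare-and-swap until it succeeds. The same idea rendered portably with GCC's __atomic builtins — a sketch for illustration only, with a hypothetical function name; the inline asm below is the real implementation:

/* sketch: sub-word xchg via word-sized CAS, big-endian lane math */
static unsigned char xchg_u8(unsigned char *p, unsigned char new)
{
	unsigned int *word = (unsigned int *)((unsigned long)p & ~3UL);
	int shift = (3 ^ ((unsigned long)p & 3)) << 3;	/* bit offset of the byte lane */
	unsigned int mask = 0xffU << shift;
	unsigned int old, repl;

	old = __atomic_load_n(word, __ATOMIC_RELAXED);
	do {
		repl = (old & ~mask) | ((unsigned int)new << shift);
	} while (!__atomic_compare_exchange_n(word, &old, repl, false,
					      __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
	return (old & mask) >> shift;
}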
#define CLP_LPS_PCI 2 diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h index af99c1f66f12..aae0315374de 100644 --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -12,55 +12,196 @@ #include <linux/types.h> #include <linux/bug.h> -#define cmpxchg(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __o = (o); \ - __typeof__(*(ptr)) __n = (n); \ - (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\ -}) +void __xchg_called_with_bad_pointer(void); -#define cmpxchg64 cmpxchg -#define cmpxchg_local cmpxchg -#define cmpxchg64_local cmpxchg +static __always_inline unsigned long +__arch_xchg(unsigned long x, unsigned long address, int size) +{ + unsigned long old; + int shift; -#define xchg(ptr, x) \ -({ \ - __typeof__(ptr) __ptr = (ptr); \ - __typeof__(*(ptr)) __old; \ - do { \ - __old = *__ptr; \ - } while (!__sync_bool_compare_and_swap(__ptr, __old, x)); \ - __old; \ -}) + switch (size) { + case 1: + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + asm volatile( + " l %0,%1\n" + "0: lr 0,%0\n" + " nr 0,%3\n" + " or 0,%2\n" + " cs %0,0,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" ((x & 0xff) << shift), "d" (~(0xff << shift)) + : "memory", "cc", "0"); + return old >> shift; + case 2: + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + asm volatile( + " l %0,%1\n" + "0: lr 0,%0\n" + " nr 0,%3\n" + " or 0,%2\n" + " cs %0,0,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" ((x & 0xffff) << shift), "d" (~(0xffff << shift)) + : "memory", "cc", "0"); + return old >> shift; + case 4: + asm volatile( + " l %0,%1\n" + "0: cs %0,%2,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" (x) + : "memory", "cc"); + return old; + case 8: + asm volatile( + " lg %0,%1\n" + "0: csg %0,%2,%1\n" + " jl 0b\n" + : "=&d" (old), "+QS" (*(long *) address) + : "d" (x) + : "memory", "cc"); + return old; + } + __xchg_called_with_bad_pointer(); + return x; +} -#define __cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +#define arch_xchg(ptr, x) \ ({ \ - register __typeof__(*(p1)) __old1 asm("2") = (o1); \ - register __typeof__(*(p2)) __old2 asm("3") = (o2); \ - register __typeof__(*(p1)) __new1 asm("4") = (n1); \ - register __typeof__(*(p2)) __new2 asm("5") = (n2); \ - int cc; \ - asm volatile( \ - " cdsg %[old],%[new],%[ptr]\n" \ - " ipm %[cc]\n" \ - " srl %[cc],28" \ - : [cc] "=d" (cc), [old] "+d" (__old1), "+d" (__old2) \ - : [new] "d" (__new1), "d" (__new2), \ - [ptr] "Q" (*(p1)), "Q" (*(p2)) \ - : "memory", "cc"); \ - !cc; \ + __typeof__(*(ptr)) __ret; \ + \ + __ret = (__typeof__(*(ptr))) \ + __arch_xchg((unsigned long)(x), (unsigned long)(ptr), \ + sizeof(*(ptr))); \ + __ret; \ }) -#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +void __cmpxchg_called_with_bad_pointer(void); + +static __always_inline unsigned long __cmpxchg(unsigned long address, + unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: { + unsigned int prev, shift, mask; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + old = (old & 0xff) << shift; + new = (new & 0xff) << shift; + mask = ~(0xff << shift); + asm volatile( + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" + "1:" + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + 
[tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); + return prev >> shift; + } + case 2: { + unsigned int prev, shift, mask; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + old = (old & 0xffff) << shift; + new = (new & 0xffff) << shift; + mask = ~(0xffff << shift); + asm volatile( + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" + "1:" + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); + return prev >> shift; + } + case 4: { + unsigned int prev = old; + + asm volatile( + " cs %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" (new) + : "memory", "cc"); + return prev; + } + case 8: { + unsigned long prev = old; + + asm volatile( + " csg %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" (new) + : "memory", "cc"); + return prev; + } + } + __cmpxchg_called_with_bad_pointer(); + return old; +} + +#define arch_cmpxchg(ptr, o, n) \ ({ \ - __typeof__(p1) __p1 = (p1); \ - __typeof__(p2) __p2 = (p2); \ - BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ - BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ - VM_BUG_ON((unsigned long)((__p1) + 1) != (unsigned long)(__p2));\ - __cmpxchg_double(__p1, __p2, o1, o2, n1, n2); \ + __typeof__(*(ptr)) __ret; \ + \ + __ret = (__typeof__(*(ptr))) \ + __cmpxchg((unsigned long)(ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ }) -#define system_has_cmpxchg_double() 1 +#define arch_cmpxchg64 arch_cmpxchg +#define arch_cmpxchg_local arch_cmpxchg +#define arch_cmpxchg64_local arch_cmpxchg + +#define system_has_cmpxchg128() 1 + +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) +{ + asm volatile( + " cdsg %[old],%[new],%[ptr]\n" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "memory", "cc"); + return old; +} + +#define arch_cmpxchg128 arch_cmpxchg128 #endif /* __ASM_CMPXCHG_H */ diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 63b46e30b2c3..3cb9d813f022 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -8,6 +8,22 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> +#include <asm/ptrace.h> + +#define compat_mode_t compat_mode_t +typedef u16 compat_mode_t; + +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs #include <asm-generic/compat.h> @@ -19,52 +35,16 @@ (__force t)(__TYPE_IS_PTR(t) ? 
((v) & 0x7fffffff) : (v)); \ }) -#define PSW32_MASK_PER 0x40000000UL -#define PSW32_MASK_DAT 0x04000000UL -#define PSW32_MASK_IO 0x02000000UL -#define PSW32_MASK_EXT 0x01000000UL -#define PSW32_MASK_KEY 0x00F00000UL -#define PSW32_MASK_BASE 0x00080000UL /* Always one */ -#define PSW32_MASK_MCHECK 0x00040000UL -#define PSW32_MASK_WAIT 0x00020000UL -#define PSW32_MASK_PSTATE 0x00010000UL -#define PSW32_MASK_ASC 0x0000C000UL -#define PSW32_MASK_CC 0x00003000UL -#define PSW32_MASK_PM 0x00000f00UL -#define PSW32_MASK_RI 0x00000080UL - #define PSW32_MASK_USER 0x0000FF00UL -#define PSW32_ADDR_AMODE 0x80000000UL -#define PSW32_ADDR_INSN 0x7FFFFFFFUL - -#define PSW32_DEFAULT_KEY (((u32) PAGE_DEFAULT_ACC) << 20) - -#define PSW32_ASC_PRIMARY 0x00000000UL -#define PSW32_ASC_ACCREG 0x00004000UL -#define PSW32_ASC_SECONDARY 0x00008000UL -#define PSW32_ASC_HOME 0x0000C000UL - #define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \ PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \ PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ PSW32_ASC_PRIMARY) -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "s390\0\0\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u32 __compat_uid32_t; -typedef u32 __compat_gid32_t; -typedef u16 compat_mode_t; -typedef u16 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef u32 compat_caddr_t; -typedef __kernel_fsid_t compat_fsid_t; -typedef s64 compat_s64; -typedef u64 compat_u64; typedef struct { u32 mask; @@ -105,26 +85,6 @@ struct compat_stat { u32 __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -#define F_GETLK64 12 -#define F_SETLK64 13 -#define F_SETLKW64 14 - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { u32 f_type; u32 f_bsize; @@ -152,20 +112,9 @@ struct compat_statfs64 { u32 f_namelen; u32 f_frsize; u32 f_flags; - u32 f_spare[4]; + u32 f_spare[5]; }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -typedef u32 compat_old_sigset_t; /* at least 32 bits */ - -#define _COMPAT_NSIG 64 -#define _COMPAT_NSIG_BPW 32 - -typedef u32 compat_sigset_word; - -#define COMPAT_OFF_T_MAX 0x7fffffff - /* * A pointer passed in from user mode. 
This should not * be used for syscall parameters, just declare them @@ -177,11 +126,7 @@ static inline void __user *compat_ptr(compat_uptr_t uptr) { return (void __user *)(unsigned long)(uptr & 0x7fffffffUL); } - -static inline compat_uptr_t ptr_to_compat(void __user *uptr) -{ - return (u32)(unsigned long)uptr; -} +#define compat_ptr(uptr) compat_ptr(uptr) #ifdef CONFIG_COMPAT @@ -190,73 +135,6 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_31BIT); } -static inline void __user *arch_compat_alloc_user_space(long len) -{ - unsigned long stack; - - stack = KSTK_ESP(current); - if (is_compat_task()) - stack &= 0x7fffffffUL; - return (void __user *) (stack - len); -} - #endif -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - compat_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned int __unused1; - unsigned int __unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; #endif /* _ASM_S390X_COMPAT_H */ diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index c0f3bfeddcbe..b378e2b57ad8 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -2,7 +2,7 @@ /* * CP Assist for Cryptographic Functions (CPACF) * - * Copyright IBM Corp. 2003, 2017 + * Copyright IBM Corp. 
2003, 2023 * Author(s): Thomas Spatzier * Jan Glauber * Harald Freudenberger (freude@de.ibm.com) @@ -132,6 +132,11 @@ #define CPACF_PCKMO_ENC_AES_128_KEY 0x12 #define CPACF_PCKMO_ENC_AES_192_KEY 0x13 #define CPACF_PCKMO_ENC_AES_256_KEY 0x14 +#define CPACF_PCKMO_ENC_ECC_P256_KEY 0x20 +#define CPACF_PCKMO_ENC_ECC_P384_KEY 0x21 +#define CPACF_PCKMO_ENC_ECC_P521_KEY 0x22 +#define CPACF_PCKMO_ENC_ECC_ED25519_KEY 0x28 +#define CPACF_PCKMO_ENC_ECC_ED448_KEY 0x29 /* * Function codes for the PRNO (PERFORM RANDOM NUMBER OPERATION) @@ -173,17 +178,16 @@ typedef struct { unsigned char bytes[16]; } cpacf_mask_t; */ static __always_inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) { - register unsigned long r0 asm("0") = 0; /* query function */ - register unsigned long r1 asm("1") = (unsigned long) mask; - asm volatile( - " spm 0\n" /* pckmo doesn't change the cc */ + " lghi 0,0\n" /* query function */ + " lgr 1,%[mask]\n" + " spm 0\n" /* pckmo doesn't change the cc */ /* Parameter regs are ignored, but must be nonzero and unique */ "0: .insn rrf,%[opc] << 16,2,4,6,0\n" " brc 1,0b\n" /* handle partial completion */ : "=m" (*mask) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode) - : "cc"); + : [mask] "d" ((unsigned long)mask), [opc] "i" (opcode) + : "cc", "0", "1"); } static __always_inline int __cpacf_check_opcode(unsigned int opcode) @@ -249,20 +253,22 @@ static __always_inline int cpacf_query_func(unsigned int opcode, unsigned int fu static inline int cpacf_km(unsigned long func, void *param, u8 *dest, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) dest; + union register_pair d, s; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KM) - : "cc", "memory"); + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KM) + : "cc", "memory", "0", "1"); - return src_len - r3; + return src_len - s.odd; } /** @@ -279,20 +285,22 @@ static inline int cpacf_km(unsigned long func, void *param, static inline int cpacf_kmc(unsigned long func, void *param, u8 *dest, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) dest; + union register_pair d, s; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMC) - : "cc", "memory"); + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMC) + : "cc", "memory", "0", "1"); - return src_len - 
r3; + return src_len - s.odd; } /** @@ -306,17 +314,19 @@ static inline int cpacf_kmc(unsigned long func, void *param, static inline void cpacf_kimd(unsigned long func, void *param, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; + union register_pair s; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KIMD) - : "cc", "memory"); + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), + [opc] "i" (CPACF_KIMD) + : "cc", "memory", "0", "1"); } /** @@ -329,17 +339,19 @@ static inline void cpacf_kimd(unsigned long func, void *param, static inline void cpacf_klmd(unsigned long func, void *param, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; + union register_pair s; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KLMD) - : "cc", "memory"); + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KLMD) + : "cc", "memory", "0", "1"); } /** @@ -355,19 +367,21 @@ static inline void cpacf_klmd(unsigned long func, void *param, static inline int cpacf_kmac(unsigned long func, void *param, const u8 *src, long src_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; + union register_pair s; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMAC) - : "cc", "memory"); + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMAC) + : "cc", "memory", "0", "1"); - return src_len - r3; + return src_len - s.odd; } /** @@ -385,22 +399,24 @@ static inline int cpacf_kmac(unsigned long func, void *param, static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest, const u8 *src, long src_len, u8 *counter) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) dest; - register unsigned long r6 asm("6") = (unsigned long) counter; + union register_pair d, s, c; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + c.even = (unsigned 
long)counter; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[ctr],0\n" " brc 1,0b\n" /* handle partial completion */ - : [src] "+a" (r2), [len] "+d" (r3), - [dst] "+a" (r4), [ctr] "+a" (r6) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMCTR) - : "cc", "memory"); + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair), + [ctr] "+&d" (c.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMCTR) + : "cc", "memory", "0", "1"); - return src_len - r3; + return src_len - s.odd; } /** @@ -417,20 +433,21 @@ static inline void cpacf_prno(unsigned long func, void *param, u8 *dest, unsigned long dest_len, const u8 *seed, unsigned long seed_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) dest; - register unsigned long r3 asm("3") = (unsigned long) dest_len; - register unsigned long r4 asm("4") = (unsigned long) seed; - register unsigned long r5 asm("5") = (unsigned long) seed_len; + union register_pair d, s; + d.even = (unsigned long)dest; + d.odd = (unsigned long)dest_len; + s.even = (unsigned long)seed; + s.odd = (unsigned long)seed_len; asm volatile ( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,%[dst],%[seed]\n" " brc 1,0b\n" /* handle partial completion */ - : [dst] "+a" (r2), [dlen] "+d" (r3) - : [fc] "d" (r0), [pba] "a" (r1), - [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PRNO) - : "cc", "memory"); + : [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [seed] "d" (s.pair), [opc] "i" (CPACF_PRNO) + : "cc", "memory", "0", "1"); } /** @@ -443,19 +460,19 @@ static inline void cpacf_prno(unsigned long func, void *param, static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, u8 *cbuf, unsigned long cbuf_len) { - register unsigned long r0 asm("0") = (unsigned long) CPACF_PRNO_TRNG; - register unsigned long r2 asm("2") = (unsigned long) ucbuf; - register unsigned long r3 asm("3") = (unsigned long) ucbuf_len; - register unsigned long r4 asm("4") = (unsigned long) cbuf; - register unsigned long r5 asm("5") = (unsigned long) cbuf_len; + union register_pair u, c; + u.even = (unsigned long)ucbuf; + u.odd = (unsigned long)ucbuf_len; + c.even = (unsigned long)cbuf; + c.odd = (unsigned long)cbuf_len; asm volatile ( + " lghi 0,%[fc]\n" "0: .insn rre,%[opc] << 16,%[ucbuf],%[cbuf]\n" " brc 1,0b\n" /* handle partial completion */ - : [ucbuf] "+a" (r2), [ucbuflen] "+d" (r3), - [cbuf] "+a" (r4), [cbuflen] "+d" (r5) - : [fc] "d" (r0), [opc] "i" (CPACF_PRNO) - : "cc", "memory"); + : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair) + : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO) + : "cc", "memory", "0"); } /** @@ -466,15 +483,15 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, */ static inline void cpacf_pcc(unsigned long func, void *param) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */ " brc 1,0b\n" /* handle partial completion */ : - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCC) - : "cc", "memory"); + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_PCC) + : "cc", "memory", "0", "1"); } /** @@ -487,14 +504,14 @@ static inline void cpacf_pcc(unsigned long func, void *param) */ static inline void cpacf_pckmo(long 
func, void *param) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" " .insn rre,%[opc] << 16,0,0\n" /* PCKMO opcode */ : - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCKMO) - : "cc", "memory"); + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_PCKMO) + : "cc", "memory", "0", "1"); } /** @@ -512,21 +529,23 @@ static inline void cpacf_kma(unsigned long func, void *param, u8 *dest, const u8 *src, unsigned long src_len, const u8 *aad, unsigned long aad_len) { - register unsigned long r0 asm("0") = (unsigned long) func; - register unsigned long r1 asm("1") = (unsigned long) param; - register unsigned long r2 asm("2") = (unsigned long) src; - register unsigned long r3 asm("3") = (unsigned long) src_len; - register unsigned long r4 asm("4") = (unsigned long) aad; - register unsigned long r5 asm("5") = (unsigned long) aad_len; - register unsigned long r6 asm("6") = (unsigned long) dest; + union register_pair d, s, a; + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + a.even = (unsigned long)aad; + a.odd = (unsigned long)aad_len; asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[aad],0\n" " brc 1,0b\n" /* handle partial completion */ - : [dst] "+a" (r6), [src] "+a" (r2), [slen] "+d" (r3), - [aad] "+a" (r4), [alen] "+d" (r5) - : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMA) - : "cc", "memory"); + : [dst] "+&d" (d.pair), [src] "+&d" (s.pair), + [aad] "+&d" (a.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMA) + : "cc", "memory", "0", "1"); } #endif /* _ASM_S390_CPACF_H */ diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h index 62228a884e06..26c710cd3485 100644 --- a/arch/s390/include/asm/cpu.h +++ b/arch/s390/include/asm/cpu.h @@ -12,6 +12,7 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> +#include <linux/jump_label.h> struct cpuid { @@ -21,5 +22,7 @@ struct cpuid unsigned int unused : 16; } __attribute__ ((packed, aligned(8))); +DECLARE_STATIC_KEY_FALSE(cpu_has_bear); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_CPU_H */ diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h deleted file mode 100644 index 649b9fc60685..000000000000 --- a/arch/s390/include/asm/cpu_mcf.h +++ /dev/null @@ -1,126 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Counter facility support definitions for the Linux perf - * - * Copyright IBM Corp. 
2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#ifndef _ASM_S390_CPU_MCF_H -#define _ASM_S390_CPU_MCF_H - -#include <linux/perf_event.h> -#include <asm/cpu_mf.h> - -enum cpumf_ctr_set { - CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ - CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ - CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ - CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ - CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ - - /* Maximum number of counter sets */ - CPUMF_CTR_SET_MAX, -}; - -#define CPUMF_LCCTL_ENABLE_SHIFT 16 -#define CPUMF_LCCTL_ACTCTL_SHIFT 0 -static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { - [CPUMF_CTR_SET_BASIC] = 0x02, - [CPUMF_CTR_SET_USER] = 0x04, - [CPUMF_CTR_SET_CRYPTO] = 0x08, - [CPUMF_CTR_SET_EXT] = 0x01, - [CPUMF_CTR_SET_MT_DIAG] = 0x20, -}; - -static inline void ctr_set_enable(u64 *state, int ctr_set) -{ - *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT; -} -static inline void ctr_set_disable(u64 *state, int ctr_set) -{ - *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT); -} -static inline void ctr_set_start(u64 *state, int ctr_set) -{ - *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT; -} -static inline void ctr_set_stop(u64 *state, int ctr_set) -{ - *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT); -} - -static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; -} - -static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); -} - -static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; -} - -static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); -} - -static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) -{ - switch (set) { - case CPUMF_CTR_SET_BASIC: - return stcctm(BASIC, range, dest); - case CPUMF_CTR_SET_USER: - return stcctm(PROBLEM_STATE, range, dest); - case CPUMF_CTR_SET_CRYPTO: - return stcctm(CRYPTO_ACTIVITY, range, dest); - case CPUMF_CTR_SET_EXT: - return stcctm(EXTENDED, range, dest); - case CPUMF_CTR_SET_MT_DIAG: - return stcctm(MT_DIAG_CLEARING, range, dest); - case CPUMF_CTR_SET_MAX: - return 3; - } - return 3; -} - -struct cpu_cf_events { - struct cpumf_ctr_info info; - atomic_t ctr_set[CPUMF_CTR_SET_MAX]; - atomic64_t alert; - u64 state, tx_state; - unsigned int flags; - unsigned int txn_flags; -}; -DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events); - -bool kernel_cpumcf_avail(void); -int __kernel_cpumcf_begin(void); -unsigned long kernel_cpumcf_alert(int clear); -void __kernel_cpumcf_end(void); - -static inline int kernel_cpumcf_begin(void) -{ - if (!cpum_cf_avail()) - return -ENODEV; - - preempt_disable(); - return __kernel_cpumcf_begin(); -} -static inline void kernel_cpumcf_end(void) -{ - __kernel_cpumcf_end(); - preempt_enable(); -} - -/* Return true if store counter set multiple instruction is available */ -static inline int stccm_avail(void) -{ - return test_facility(142); -} - -#endif /* _ASM_S390_CPU_MCF_H */ diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 0d90cbeb89b4..a0de5b9b02ea 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -10,6 +10,7 @@ #define _ASM_S390_CPU_MF_H #include <linux/errno.h> +#include <asm/asm-extable.h> #include <asm/facility.h> asm(".include 
\"asm/cpu_mf-insn.h\"\n"); @@ -41,7 +42,6 @@ static inline int cpum_sf_avail(void) return test_facility(40) && test_facility(68); } - struct cpumf_ctr_info { u16 cfvn; u16 auth_ctl; @@ -109,7 +109,9 @@ struct hws_basic_entry { unsigned int AS:2; /* 29-30 PSW address-space control */ unsigned int I:1; /* 31 entry valid or invalid */ unsigned int CL:2; /* 32-33 Configuration Level */ - unsigned int:14; + unsigned int H:1; /* 34 Host Indicator */ + unsigned int LS:1; /* 35 Limited Sampling */ + unsigned int:12; unsigned int prim_asn:16; /* primary ASN */ unsigned long long ia; /* Instruction Address */ unsigned long long gpp; /* Guest Program Parameter */ @@ -128,19 +130,21 @@ struct hws_combined_entry { struct hws_diag_entry diag; /* Diagnostic-sampling data entry */ } __packed; -struct hws_trailer_entry { - union { - struct { - unsigned int f:1; /* 0 - Block Full Indicator */ - unsigned int a:1; /* 1 - Alert request control */ - unsigned int t:1; /* 2 - Timestamp format */ - unsigned int :29; /* 3 - 31: Reserved */ - unsigned int bsdes:16; /* 32-47: size of basic SDE */ - unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */ - }; - unsigned long long flags; /* 0 - 63: All indicators */ +union hws_trailer_header { + struct { + unsigned int f:1; /* 0 - Block Full Indicator */ + unsigned int a:1; /* 1 - Alert request control */ + unsigned int t:1; /* 2 - Timestamp format */ + unsigned int :29; /* 3 - 31: Reserved */ + unsigned int bsdes:16; /* 32-47: size of basic SDE */ + unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */ + unsigned long long overflow; /* 64 - Overflow Count */ }; - unsigned long long overflow; /* 64 - sample Overflow count */ + u128 val; +}; + +struct hws_trailer_entry { + union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count */ unsigned char timestamp[16]; /* 16 - 31 timestamp */ unsigned long long reserved1; /* 32 -Reserved */ unsigned long long reserved2; /* */ @@ -157,7 +161,7 @@ struct hws_trailer_entry { /* Load program parameter */ static inline void lpp(void *pp) { - asm volatile(".insn s,0xb2800000,0(%0)\n":: "a" (pp) : "memory"); + asm volatile("lpp 0(%0)\n" :: "a" (pp) : "memory"); } /* Query counter information */ @@ -166,7 +170,7 @@ static inline int qctri(struct cpumf_ctr_info *info) int rc = -EINVAL; asm volatile ( - "0: .insn s,0xb28e0000,%1\n" + "0: qctri %1\n" "1: lhi %0,0\n" "2:\n" EX_TABLE(1b, 2b) @@ -180,7 +184,7 @@ static inline int lcctl(u64 ctl) int cc; asm volatile ( - " .insn s,0xb2840000,%1\n" + " lcctl %1\n" " ipm %0\n" " srl %0,28\n" : "=d" (cc) : "Q" (ctl) : "cc"); @@ -194,7 +198,7 @@ static inline int __ecctr(u64 ctr, u64 *content) int cc; asm volatile ( - " .insn rre,0xb2e40000,%0,%2\n" + " ecctr %0,%2\n" " ipm %1\n" " srl %1,28\n" : "=d" (_content), "=d" (cc) : "d" (ctr) : "cc"); @@ -244,7 +248,7 @@ static inline int qsi(struct hws_qsi_info_block *info) int cc = 1; asm volatile( - "0: .insn s,0xb2860000,%1\n" + "0: qsi %1\n" "1: lhi %0,0\n" "2:\n" EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) @@ -259,7 +263,7 @@ static inline int lsctl(struct hws_lsctl_request_block *req) cc = 1; asm volatile( - "0: .insn s,0xb2870000,0(%1)\n" + "0: lsctl 0(%1)\n" "1: ipm %0\n" " srl %0,28\n" "2:\n" @@ -270,59 +274,4 @@ static inline int lsctl(struct hws_lsctl_request_block *req) return cc ? 
-EINVAL : 0; } - -/* Sampling control helper functions */ - -#include <linux/time.h> - -static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, - unsigned long freq) -{ - return (USEC_PER_SEC / freq) * qsi->cpu_speed; -} - -static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, - unsigned long rate) -{ - return USEC_PER_SEC * qsi->cpu_speed / rate; -} - -#define SDB_TE_ALERT_REQ_MASK 0x4000000000000000UL -#define SDB_TE_BUFFER_FULL_MASK 0x8000000000000000UL - -/* Return TOD timestamp contained in an trailer entry */ -static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) -{ - /* TOD in STCKE format */ - if (te->t) - return *((unsigned long long *) &te->timestamp[1]); - - /* TOD in STCK format */ - return *((unsigned long long *) &te->timestamp[0]); -} - -/* Return pointer to trailer entry of an sample data block */ -static inline unsigned long *trailer_entry_ptr(unsigned long v) -{ - void *ret; - - ret = (void *) v; - ret += PAGE_SIZE; - ret -= sizeof(struct hws_trailer_entry); - - return (unsigned long *) ret; -} - -/* Return true if the entry in the sample data block table (sdbt) - * is a link to the next sdbt */ -static inline int is_link_entry(unsigned long *s) -{ - return *s & 0x1ul ? 1 : 0; -} - -/* Return pointer to the linked sdbt */ -static inline unsigned long *get_next_sdbt(unsigned long *s) -{ - return (unsigned long *) (*s & ~0x1ul); -} #endif /* _ASM_S390_CPU_MF_H */ diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h index 1d007c6ede95..931204613753 100644 --- a/arch/s390/include/asm/cpufeature.h +++ b/arch/s390/include/asm/cpufeature.h @@ -2,28 +2,21 @@ /* * Module interface for CPU features * - * Copyright IBM Corp. 2015 + * Copyright IBM Corp. 2015, 2022 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ #ifndef __ASM_S390_CPUFEATURE_H #define __ASM_S390_CPUFEATURE_H -#include <asm/elf.h> +enum { + S390_CPU_FEATURE_MSA, + S390_CPU_FEATURE_VXRS, + S390_CPU_FEATURE_UV, + MAX_CPU_FEATURES +}; -/* Hardware features on Linux on z Systems are indicated by facility bits that - * are mapped to the so-called machine flags. Particular machine flags are - * then used to define ELF hardware capabilities; most notably hardware flags - * that are essential for user space / glibc. - * - * Restrict the set of exposed CPU features to ELF hardware capabilities for - * now. Additional machine flags can be indicated by values larger than - * MAX_ELF_HWCAP_FEATURES. - */ -#define MAX_ELF_HWCAP_FEATURES (8 * sizeof(elf_hwcap)) -#define MAX_CPU_FEATURES MAX_ELF_HWCAP_FEATURES - -#define cpu_feature(feat) ilog2(HWCAP_S390_ ## feat) +#define cpu_feature(feature) (feature) int cpu_have_feature(unsigned int nr); diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index cb729d111e20..30bb3ec4e5fc 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -11,28 +11,11 @@ #include <linux/types.h> #include <asm/timex.h> -#define CPUTIME_PER_USEC 4096ULL -#define CPUTIME_PER_SEC (CPUTIME_PER_USEC * USEC_PER_SEC) - -/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ - -#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) - -/* - * Convert cputime to microseconds. - */ -static inline u64 cputime_to_usecs(const u64 cputime) -{ - return cputime >> 12; -} - /* * Convert cputime to nanoseconds. 
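 *
 * One CPU timer unit is 2**-12 microseconds, i.e. 125/512 of a nanosecond,
 * which is why the removed cputime_to_usecs() above was a plain right shift
 * by 12. A sketch of the multiply-and-shift that tod_to_ns() boils down to
 * (an illustration inferred from the unit size, not part of this patch):
 *
 *	static inline u64 cputime_to_ns_sketch(u64 t)
 *	{
 *		return ((t >> 9) * 125) + (((t & 0x1ff) * 125) >> 9);
 *	}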
*/ #define cputime_to_nsecs(cputime) tod_to_ns(cputime) -u64 arch_cpu_idle_time(int cpu); - -#define arch_idle_time(cpu) arch_cpu_idle_time(cpu) +void account_idle_time_irq(void); #endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/include/asm/crw.h b/arch/s390/include/asm/crw.h index c6ebfd31f1db..97456d98fe76 100644 --- a/arch/s390/include/asm/crw.h +++ b/arch/s390/include/asm/crw.h @@ -5,7 +5,6 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #ifndef _ASM_S390_CRW_H diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h index 480bb02ccacd..638137d46c85 100644 --- a/arch/s390/include/asm/css_chars.h +++ b/arch/s390/include/asm/css_chars.h @@ -36,7 +36,9 @@ struct css_general_char { u64 alt_ssi : 1; /* bit 108 */ u64 : 1; u64 narf : 1; /* bit 110 */ - u64 : 12; + u64 : 5; + u64 enarf: 1; /* bit 116 */ + u64 : 6; u64 util_str : 1;/* bit 123 */ } __packed; diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h deleted file mode 100644 index ed5efbb531c4..000000000000 --- a/arch/s390/include/asm/ctl_reg.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright IBM Corp. 1999, 2009 - * - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#ifndef __ASM_CTL_REG_H -#define __ASM_CTL_REG_H - -#include <linux/bits.h> - -#define CR0_CLOCK_COMPARATOR_SIGN BIT(63 - 10) -#define CR0_LOW_ADDRESS_PROTECTION BIT(63 - 35) -#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(63 - 49) -#define CR0_EXTERNAL_CALL_SUBMASK BIT(63 - 50) -#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(63 - 52) -#define CR0_CPU_TIMER_SUBMASK BIT(63 - 53) -#define CR0_SERVICE_SIGNAL_SUBMASK BIT(63 - 54) -#define CR0_UNUSED_56 BIT(63 - 56) -#define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57) -#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58) - -#define CR2_GUARDED_STORAGE BIT(63 - 59) - -#define CR14_UNUSED_32 BIT(63 - 32) -#define CR14_UNUSED_33 BIT(63 - 33) -#define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35) -#define CR14_RECOVERY_SUBMASK BIT(63 - 36) -#define CR14_DEGRADATION_SUBMASK BIT(63 - 37) -#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(63 - 38) -#define CR14_WARNING_SUBMASK BIT(63 - 39) - -#ifndef __ASSEMBLY__ - -#include <linux/bug.h> - -#define __ctl_load(array, low, high) do { \ - typedef struct { char _[sizeof(array)]; } addrtype; \ - \ - BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ - asm volatile( \ - " lctlg %1,%2,%0\n" \ - : \ - : "Q" (*(addrtype *)(&array)), "i" (low), "i" (high) \ - : "memory"); \ -} while (0) - -#define __ctl_store(array, low, high) do { \ - typedef struct { char _[sizeof(array)]; } addrtype; \ - \ - BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ - asm volatile( \ - " stctg %1,%2,%0\n" \ - : "=Q" (*(addrtype *)(&array)) \ - : "i" (low), "i" (high)); \ -} while (0) - -static __always_inline void __ctl_set_bit(unsigned int cr, unsigned int bit) -{ - unsigned long reg; - - __ctl_store(reg, cr, cr); - reg |= 1UL << bit; - __ctl_load(reg, cr, cr); -} - -static __always_inline void __ctl_clear_bit(unsigned int cr, unsigned int bit) -{ - unsigned long reg; - - __ctl_store(reg, cr, cr); - reg &= ~(1UL << bit); - __ctl_load(reg, cr, cr); -} - -void smp_ctl_set_bit(int cr, int bit); -void smp_ctl_clear_bit(int cr, int bit); - -union ctlreg0 { - unsigned long val; - struct { - unsigned long : 8; - unsigned long tcx : 1; /* Transactional-Execution control 
*/ - unsigned long pifo : 1; /* Transactional-Execution Program- - Interruption-Filtering Override */ - unsigned long : 22; - unsigned long : 3; - unsigned long lap : 1; /* Low-address-protection control */ - unsigned long : 4; - unsigned long edat : 1; /* Enhanced-DAT-enablement control */ - unsigned long : 2; - unsigned long iep : 1; /* Instruction-Execution-Protection */ - unsigned long : 1; - unsigned long afp : 1; /* AFP-register control */ - unsigned long vx : 1; /* Vector enablement control */ - unsigned long : 7; - unsigned long sssm : 1; /* Service signal subclass mask */ - unsigned long : 9; - }; -}; - -union ctlreg2 { - unsigned long val; - struct { - unsigned long : 33; - unsigned long ducto : 25; - unsigned long : 1; - unsigned long gse : 1; - unsigned long : 1; - unsigned long tds : 1; - unsigned long tdc : 2; - }; -}; - -#define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) -#define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit) - -#endif /* __ASSEMBLY__ */ -#endif /* __ASM_CTL_REG_H */ diff --git a/arch/s390/include/asm/ctlreg.h b/arch/s390/include/asm/ctlreg.h new file mode 100644 index 000000000000..72a9556d04f3 --- /dev/null +++ b/arch/s390/include/asm/ctlreg.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#ifndef __ASM_S390_CTLREG_H +#define __ASM_S390_CTLREG_H + +#include <linux/bits.h> + +#define CR0_TRANSACTIONAL_EXECUTION_BIT (63 - 8) +#define CR0_CLOCK_COMPARATOR_SIGN_BIT (63 - 10) +#define CR0_CRYPTOGRAPHY_COUNTER_BIT (63 - 13) +#define CR0_PAI_EXTENSION_BIT (63 - 14) +#define CR0_CPUMF_EXTRACTION_AUTH_BIT (63 - 15) +#define CR0_WARNING_TRACK_BIT (63 - 30) +#define CR0_LOW_ADDRESS_PROTECTION_BIT (63 - 35) +#define CR0_FETCH_PROTECTION_OVERRIDE_BIT (63 - 38) +#define CR0_STORAGE_PROTECTION_OVERRIDE_BIT (63 - 39) +#define CR0_EDAT_BIT (63 - 40) +#define CR0_INSTRUCTION_EXEC_PROTECTION_BIT (63 - 43) +#define CR0_VECTOR_BIT (63 - 46) +#define CR0_MALFUNCTION_ALERT_SUBMASK_BIT (63 - 48) +#define CR0_EMERGENCY_SIGNAL_SUBMASK_BIT (63 - 49) +#define CR0_EXTERNAL_CALL_SUBMASK_BIT (63 - 50) +#define CR0_CLOCK_COMPARATOR_SUBMASK_BIT (63 - 52) +#define CR0_CPU_TIMER_SUBMASK_BIT (63 - 53) +#define CR0_SERVICE_SIGNAL_SUBMASK_BIT (63 - 54) +#define CR0_UNUSED_56_BIT (63 - 56) +#define CR0_INTERRUPT_KEY_SUBMASK_BIT (63 - 57) +#define CR0_MEASUREMENT_ALERT_SUBMASK_BIT (63 - 58) +#define CR0_ETR_SUBMASK_BIT (63 - 59) +#define CR0_IUCV_BIT (63 - 62) + +#define CR0_TRANSACTIONAL_EXECUTION BIT(CR0_TRANSACTIONAL_EXECUTION_BIT) +#define CR0_CLOCK_COMPARATOR_SIGN BIT(CR0_CLOCK_COMPARATOR_SIGN_BIT) +#define CR0_CRYPTOGRAPHY_COUNTER BIT(CR0_CRYPTOGRAPHY_COUNTER_BIT) +#define CR0_PAI_EXTENSION BIT(CR0_PAI_EXTENSION_BIT) +#define CR0_CPUMF_EXTRACTION_AUTH BIT(CR0_CPUMF_EXTRACTION_AUTH_BIT) +#define CR0_WARNING_TRACK BIT(CR0_WARNING_TRACK_BIT) +#define CR0_LOW_ADDRESS_PROTECTION BIT(CR0_LOW_ADDRESS_PROTECTION_BIT) +#define CR0_FETCH_PROTECTION_OVERRIDE BIT(CR0_FETCH_PROTECTION_OVERRIDE_BIT) +#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(CR0_STORAGE_PROTECTION_OVERRIDE_BIT) +#define CR0_EDAT BIT(CR0_EDAT_BIT) +#define CR0_INSTRUCTION_EXEC_PROTECTION BIT(CR0_INSTRUCTION_EXEC_PROTECTION_BIT) +#define CR0_VECTOR BIT(CR0_VECTOR_BIT) +#define CR0_MALFUNCTION_ALERT_SUBMASK BIT(CR0_MALFUNCTION_ALERT_SUBMASK_BIT) +#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(CR0_EMERGENCY_SIGNAL_SUBMASK_BIT) +#define CR0_EXTERNAL_CALL_SUBMASK BIT(CR0_EXTERNAL_CALL_SUBMASK_BIT) +#define 
CR0_CLOCK_COMPARATOR_SUBMASK BIT(CR0_CLOCK_COMPARATOR_SUBMASK_BIT) +#define CR0_CPU_TIMER_SUBMASK BIT(CR0_CPU_TIMER_SUBMASK_BIT) +#define CR0_SERVICE_SIGNAL_SUBMASK BIT(CR0_SERVICE_SIGNAL_SUBMASK_BIT) +#define CR0_UNUSED_56 BIT(CR0_UNUSED_56_BIT) +#define CR0_INTERRUPT_KEY_SUBMASK BIT(CR0_INTERRUPT_KEY_SUBMASK_BIT) +#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(CR0_MEASUREMENT_ALERT_SUBMASK_BIT) +#define CR0_ETR_SUBMASK BIT(CR0_ETR_SUBMASK_BIT) +#define CR0_IUCV BIT(CR0_IUCV_BIT) + +#define CR2_MIO_ADDRESSING_BIT (63 - 58) +#define CR2_GUARDED_STORAGE_BIT (63 - 59) + +#define CR2_MIO_ADDRESSING BIT(CR2_MIO_ADDRESSING_BIT) +#define CR2_GUARDED_STORAGE BIT(CR2_GUARDED_STORAGE_BIT) + +#define CR14_UNUSED_32_BIT (63 - 32) +#define CR14_UNUSED_33_BIT (63 - 33) +#define CR14_CHANNEL_REPORT_SUBMASK_BIT (63 - 35) +#define CR14_RECOVERY_SUBMASK_BIT (63 - 36) +#define CR14_DEGRADATION_SUBMASK_BIT (63 - 37) +#define CR14_EXTERNAL_DAMAGE_SUBMASK_BIT (63 - 38) +#define CR14_WARNING_SUBMASK_BIT (63 - 39) + +#define CR14_UNUSED_32 BIT(CR14_UNUSED_32_BIT) +#define CR14_UNUSED_33 BIT(CR14_UNUSED_33_BIT) +#define CR14_CHANNEL_REPORT_SUBMASK BIT(CR14_CHANNEL_REPORT_SUBMASK_BIT) +#define CR14_RECOVERY_SUBMASK BIT(CR14_RECOVERY_SUBMASK_BIT) +#define CR14_DEGRADATION_SUBMASK BIT(CR14_DEGRADATION_SUBMASK_BIT) +#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(CR14_EXTERNAL_DAMAGE_SUBMASK_BIT) +#define CR14_WARNING_SUBMASK BIT(CR14_WARNING_SUBMASK_BIT) + +#ifndef __ASSEMBLY__ + +#include <linux/bug.h> + +struct ctlreg { + unsigned long val; +}; + +#define __local_ctl_load(low, high, array) do { \ + struct addrtype { \ + char _[sizeof(array)]; \ + }; \ + int _high = high; \ + int _low = low; \ + int _esize; \ + \ + _esize = (_high - _low + 1) * sizeof(struct ctlreg); \ + BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ + typecheck(struct ctlreg, array[0]); \ + asm volatile( \ + " lctlg %[_low],%[_high],%[_arr]\n" \ + : \ + : [_arr] "Q" (*(struct addrtype *)(&array)), \ + [_low] "i" (low), [_high] "i" (high) \ + : "memory"); \ +} while (0) + +#define __local_ctl_store(low, high, array) do { \ + struct addrtype { \ + char _[sizeof(array)]; \ + }; \ + int _high = high; \ + int _low = low; \ + int _esize; \ + \ + _esize = (_high - _low + 1) * sizeof(struct ctlreg); \ + BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \ + typecheck(struct ctlreg, array[0]); \ + asm volatile( \ + " stctg %[_low],%[_high],%[_arr]\n" \ + : [_arr] "=Q" (*(struct addrtype *)(&array)) \ + : [_low] "i" (low), [_high] "i" (high)); \ +} while (0) + +static __always_inline void local_ctl_load(unsigned int cr, struct ctlreg *reg) +{ + asm volatile( + " lctlg %[cr],%[cr],%[reg]\n" + : + : [reg] "Q" (*reg), [cr] "i" (cr) + : "memory"); +} + +static __always_inline void local_ctl_store(unsigned int cr, struct ctlreg *reg) +{ + asm volatile( + " stctg %[cr],%[cr],%[reg]\n" + : [reg] "=Q" (*reg) + : [cr] "i" (cr)); +} + +static __always_inline struct ctlreg local_ctl_set_bit(unsigned int cr, unsigned int bit) +{ + struct ctlreg new, old; + + local_ctl_store(cr, &old); + new = old; + new.val |= 1UL << bit; + local_ctl_load(cr, &new); + return old; +} + +static __always_inline struct ctlreg local_ctl_clear_bit(unsigned int cr, unsigned int bit) +{ + struct ctlreg new, old; + + local_ctl_store(cr, &old); + new = old; + new.val &= ~(1UL << bit); + local_ctl_load(cr, &new); + return old; +} + +struct lowcore; + +void system_ctlreg_lock(void); +void system_ctlreg_unlock(void); +void system_ctlreg_init_save_area(struct lowcore *lc); +void system_ctlreg_modify(unsigned 
int cr, unsigned long data, int request); + +enum { + CTLREG_SET_BIT, + CTLREG_CLEAR_BIT, + CTLREG_LOAD, +}; + +static inline void system_ctl_set_bit(unsigned int cr, unsigned int bit) +{ + system_ctlreg_modify(cr, bit, CTLREG_SET_BIT); +} + +static inline void system_ctl_clear_bit(unsigned int cr, unsigned int bit) +{ + system_ctlreg_modify(cr, bit, CTLREG_CLEAR_BIT); +} + +static inline void system_ctl_load(unsigned int cr, struct ctlreg *reg) +{ + system_ctlreg_modify(cr, reg->val, CTLREG_LOAD); +} + +union ctlreg0 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 8; + unsigned long tcx : 1; /* Transactional-Execution control */ + unsigned long pifo : 1; /* Transactional-Execution Program- + Interruption-Filtering Override */ + unsigned long : 3; + unsigned long ccc : 1; /* Cryptography counter control */ + unsigned long pec : 1; /* PAI extension control */ + unsigned long : 17; + unsigned long : 3; + unsigned long lap : 1; /* Low-address-protection control */ + unsigned long : 4; + unsigned long edat : 1; /* Enhanced-DAT-enablement control */ + unsigned long : 2; + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 1; + unsigned long afp : 1; /* AFP-register control */ + unsigned long vx : 1; /* Vector enablement control */ + unsigned long : 7; + unsigned long sssm : 1; /* Service signal subclass mask */ + unsigned long : 9; + }; +}; + +union ctlreg2 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 33; + unsigned long ducto : 25; + unsigned long : 1; + unsigned long gse : 1; + unsigned long : 1; + unsigned long tds : 1; + unsigned long tdc : 2; + }; +}; + +union ctlreg5 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long : 33; + unsigned long pasteo: 25; + unsigned long : 6; + }; +}; + +union ctlreg15 { + unsigned long val; + struct ctlreg reg; + struct { + unsigned long lsea : 61; + unsigned long : 3; + }; +}; + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_CTLREG_H */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 310134015541..ccd4e148b5ed 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -2,17 +2,18 @@ /* * S/390 debug facility * - * Copyright IBM Corp. 1999, 2000 + * Copyright IBM Corp. 1999, 2020 */ -#ifndef DEBUG_H -#define DEBUG_H +#ifndef _ASM_S390_DEBUG_H +#define _ASM_S390_DEBUG_H #include <linux/string.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/time.h> #include <linux/refcount.h> -#include <uapi/asm/debug.h> +#include <linux/fs.h> +#include <linux/init.h> #define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ #define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */ @@ -26,6 +27,16 @@ #define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */ /* the entry information */ +#define __DEBUG_FEATURE_VERSION 3 /* version of debug feature */ + +struct __debug_entry { + unsigned long clock : 60; + unsigned long exception : 1; + unsigned long level : 3; + void *caller; + unsigned short cpu; +} __packed; + typedef struct __debug_entry debug_entry_t; struct debug_view; @@ -82,7 +93,6 @@ struct debug_view { }; extern struct debug_view debug_hex_ascii_view; -extern struct debug_view debug_raw_view; extern struct debug_view debug_sprintf_view; /* do NOT use the _common functions */ @@ -212,7 +222,7 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! 
Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) @@ -340,7 +350,7 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) @@ -382,38 +392,99 @@ int debug_register_view(debug_info_t *id, struct debug_view *view); int debug_unregister_view(debug_info_t *id, struct debug_view *view); +#ifndef MODULE + /* - define the debug levels: - - 0 No debugging output to console or syslog - - 1 Log internal errors to syslog, ignore check conditions - - 2 Log internal errors and check conditions to syslog - - 3 Log internal errors to console, log check conditions to syslog - - 4 Log internal errors and check conditions to console - - 5 panic on internal errors, log check conditions to console - - 6 panic on both, internal errors and check conditions + * Note: Initial page and area numbers must be fixed to allow static + * initialization. This enables very early tracing. Changes to these values + * must be reflected in __DEFINE_STATIC_AREA. + */ +#define EARLY_PAGES 8 +#define EARLY_AREAS 1 + +#define VNAME(var, suffix) __##var##_##suffix + +/* + * Define static areas for early trace data. During boot debug_register_static() + * will replace these with dynamically allocated areas to allow custom page and + * area sizes, and dynamic resizing. 
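+ *
+ * A typical user declares one of these at file scope through the
+ * DEFINE_STATIC_DEBUG_INFO() macro documented below, for example (sketch;
+ * the log name and sizes are illustrative only):
+ *
+ *	DEFINE_STATIC_DEBUG_INFO(early_dbf, "early_trace", 8, 1, 16,
+ *				 &debug_hex_ascii_view);
+ *
+ * and can then trace with debug_event(&early_dbf, level, data, 16) long
+ * before debugfs is available.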
+ */ +#define __DEFINE_STATIC_AREA(var) \ +static char VNAME(var, data)[EARLY_PAGES][PAGE_SIZE] __initdata; \ +static debug_entry_t *VNAME(var, pages)[EARLY_PAGES] __initdata = { \ + (debug_entry_t *)VNAME(var, data)[0], \ + (debug_entry_t *)VNAME(var, data)[1], \ + (debug_entry_t *)VNAME(var, data)[2], \ + (debug_entry_t *)VNAME(var, data)[3], \ + (debug_entry_t *)VNAME(var, data)[4], \ + (debug_entry_t *)VNAME(var, data)[5], \ + (debug_entry_t *)VNAME(var, data)[6], \ + (debug_entry_t *)VNAME(var, data)[7], \ +}; \ +static debug_entry_t **VNAME(var, areas)[EARLY_AREAS] __initdata = { \ + (debug_entry_t **)VNAME(var, pages), \ +}; \ +static int VNAME(var, active_pages)[EARLY_AREAS] __initdata; \ +static int VNAME(var, active_entries)[EARLY_AREAS] __initdata + +#define __DEBUG_INFO_INIT(var, _name, _buf_size) { \ + .next = NULL, \ + .prev = NULL, \ + .ref_count = REFCOUNT_INIT(1), \ + .lock = __SPIN_LOCK_UNLOCKED(var.lock), \ + .level = DEBUG_DEFAULT_LEVEL, \ + .nr_areas = EARLY_AREAS, \ + .pages_per_area = EARLY_PAGES, \ + .buf_size = (_buf_size), \ + .entry_size = sizeof(debug_entry_t) + (_buf_size), \ + .areas = VNAME(var, areas), \ + .active_area = 0, \ + .active_pages = VNAME(var, active_pages), \ + .active_entries = VNAME(var, active_entries), \ + .debugfs_root_entry = NULL, \ + .debugfs_entries = { NULL }, \ + .views = { NULL }, \ + .name = (_name), \ + .mode = 0600, \ +} + +#define __REGISTER_STATIC_DEBUG_INFO(var, name, pages, areas, view) \ +static int __init VNAME(var, reg)(void) \ +{ \ + debug_register_static(&var, (pages), (areas)); \ + debug_register_view(&var, (view)); \ + return 0; \ +} \ +arch_initcall(VNAME(var, reg)) + +/** + * DEFINE_STATIC_DEBUG_INFO - Define static debug_info_t + * + * @var: Name of debug_info_t variable + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages: Number of pages per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * @view: Pointer to debug view struct + * + * Define a static debug_info_t for early tracing. The associated debugfs log + * is automatically registered with the specified debug view. + * + * Important: Users of this macro must not call any of the + * debug_register/_unregister() functions for this debug_info_t! + * + * Note: Tracing will start with a fixed number of initial pages and areas. + * The debug area will be changed to use the specified numbers during + * arch_initcall. */ +#define DEFINE_STATIC_DEBUG_INFO(var, name, pages, nr_areas, buf_size, view) \ +__DEFINE_STATIC_AREA(var); \ +static debug_info_t __refdata var = \ + __DEBUG_INFO_INIT(var, (name), (buf_size)); \ +__REGISTER_STATIC_DEBUG_INFO(var, name, pages, nr_areas, view) + +void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas); + +#endif /* MODULE */ -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 4 -#endif - -#define INTERNAL_ERRMSG(x,y...) "E" __FILE__ "%d: " x, __LINE__, y -#define INTERNAL_WRNMSG(x,y...) "W" __FILE__ "%d: " x, __LINE__, y -#define INTERNAL_INFMSG(x,y...) "I" __FILE__ "%d: " x, __LINE__, y -#define INTERNAL_DEBMSG(x,y...) "D" __FILE__ "%d: " x, __LINE__, y - -#if DEBUG_LEVEL > 0 -#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_INFO(x...) printk(KERN_INFO PRINTK_HEADER x) -#define PRINT_WARN(x...) printk(KERN_WARNING PRINTK_HEADER x) -#define PRINT_ERR(x...) printk(KERN_ERR PRINTK_HEADER x) -#define PRINT_FATAL(x...) panic(PRINTK_HEADER x) -#else -#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_INFO(x...) 
printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_WARN(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_ERR(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#define PRINT_FATAL(x...) printk(KERN_DEBUG PRINTK_HEADER x) -#endif /* DASD_DEBUG */ - -#endif /* DEBUG_H */ +#endif /* _ASM_S390_DEBUG_H */ diff --git a/arch/s390/include/asm/delay.h b/arch/s390/include/asm/delay.h index 898323fd93d2..21a8fe18fe66 100644 --- a/arch/s390/include/asm/delay.h +++ b/arch/s390/include/asm/delay.h @@ -13,13 +13,12 @@ #ifndef _S390_DELAY_H #define _S390_DELAY_H -void __ndelay(unsigned long long nsecs); -void __udelay(unsigned long long usecs); -void udelay_simple(unsigned long long usecs); +void __ndelay(unsigned long nsecs); +void __udelay(unsigned long usecs); void __delay(unsigned long loops); -#define ndelay(n) __ndelay((unsigned long long) (n)) -#define udelay(n) __udelay((unsigned long long) (n)) -#define mdelay(n) __udelay((unsigned long long) (n) * 1000) +#define ndelay(n) __ndelay((unsigned long)(n)) +#define udelay(n) __udelay((unsigned long)(n)) +#define mdelay(n) __udelay((unsigned long)(n) * 1000) #endif /* defined(_S390_DELAY_H) */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 0036eab14391..bed804137537 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -11,6 +11,8 @@ #include <linux/if_ether.h> #include <linux/percpu.h> +#include <asm/asm-extable.h> +#include <asm/cio.h> enum diag_stat_enum { DIAG_STAT_X008, @@ -19,6 +21,7 @@ enum diag_stat_enum { DIAG_STAT_X014, DIAG_STAT_X044, DIAG_STAT_X064, + DIAG_STAT_X08C, DIAG_STAT_X09C, DIAG_STAT_X0DC, DIAG_STAT_X204, @@ -33,6 +36,7 @@ enum diag_stat_enum { DIAG_STAT_X304, DIAG_STAT_X308, DIAG_STAT_X318, + DIAG_STAT_X320, DIAG_STAT_X500, NR_DIAG_STAT }; @@ -47,8 +51,8 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) { unsigned long start_addr, end_addr; - start_addr = start_pfn << PAGE_SHIFT; - end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT; + start_addr = pfn_to_phys(start_pfn); + end_addr = pfn_to_phys(start_pfn + num_pfn - 1); diag_stat_inc(DIAG_STAT_X010); asm volatile( @@ -78,10 +82,20 @@ struct diag210 { u8 vrdccrty; /* real device type (output) */ u8 vrdccrmd; /* real device model (output) */ u8 vrdccrft; /* real device feature (output) */ -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); extern int diag210(struct diag210 *addr); +struct diag8c { + u8 flags; + u8 num_partitions; + u16 width; + u16 height; + u8 data[]; +} __packed __aligned(4); + +extern int diag8c(struct diag8c *out, struct ccw_dev_id *devno); + /* bit is set in flags, when physical cpu info is included in diag 204 data */ #define DIAG204_LPAR_PHYS_FLG 0x80 #define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ @@ -95,6 +109,8 @@ enum diag204_sc { DIAG204_SUBC_STIB7 = 7 }; +#define DIAG204_SUBCODE_MASK 0xffff + /* The two available diag 204 data formats */ enum diag204_format { DIAG204_INFO_SIMPLE = 0, @@ -298,10 +314,8 @@ struct diag26c_mac_resp { union diag318_info { unsigned long val; struct { - unsigned int cpnc : 8; - unsigned int cpvc_linux : 24; - unsigned char cpvc_distro[3]; - unsigned char zero; + unsigned long cpnc : 8; + unsigned long cpvc : 56; }; }; @@ -311,14 +325,27 @@ int diag26c(void *req, void *resp, enum diag26c_sc subcode); struct hypfs_diag0c_entry; +/* + * This structure must contain only pointers/references into + * the AMODE31 text section. 
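+ *
+ * The code behind these function pointers must remain usable with 31-bit
+ * addressing, which is why callers dispatch through this ops table rather
+ * than calling the _amode31 helpers directly. A sketch (assuming the usual
+ * condition-code-0-on-success convention and a request block prepared by
+ * the caller):
+ *
+ *	struct diag210 info;
+ *
+ *	if (diag_amode31_ops.diag210(&info) == 0)
+ *		pr_debug("real device type 0x%x\n", info.vrdccrty);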
+ */ struct diag_ops { int (*diag210)(struct diag210 *addr); int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); + int (*diag8c)(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); void (*diag0c)(struct hypfs_diag0c_entry *entry); void (*diag308_reset)(void); }; -extern struct diag_ops diag_dma_ops; -extern struct diag210 *__diag210_tmp_dma; +extern struct diag_ops diag_amode31_ops; +extern struct diag210 *__diag210_tmp_amode31; + +int _diag210_amode31(struct diag210 *addr); +int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode); +int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode); +void _diag0c_amode31(struct hypfs_diag0c_entry *entry); +void _diag308_reset_amode31(void); +int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); + #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h index 6f26f35d4a71..7fe3e31956d7 100644 --- a/arch/s390/include/asm/dma.h +++ b/arch/s390/include/asm/dma.h @@ -2,19 +2,13 @@ #ifndef _ASM_S390_DMA_H #define _ASM_S390_DMA_H -#include <asm/io.h> +#include <linux/io.h> /* * MAX_DMA_ADDRESS is ambiguous because on s390 its completely unrelated * to DMA. It _is_ used for the s390 memory zone split at 2GB caused * by the 31 bit heritage. */ -#define MAX_DMA_ADDRESS 0x80000000 - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif +#define MAX_DMA_ADDRESS __va(0x80000000) #endif /* _ASM_S390_DMA_H */ diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index bb63b2afdf6f..06f795855af7 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -78,7 +78,7 @@ struct aob { struct aob_rq_header { struct scm_device *scmdev; - char data[0]; + char data[]; }; struct scm_device { @@ -105,7 +105,7 @@ enum scm_event {SCM_CHANGE, SCM_AVAIL}; struct scm_driver { struct device_driver drv; int (*probe) (struct scm_device *scmdev); - int (*remove) (struct scm_device *scmdev); + void (*remove) (struct scm_device *scmdev); void (*notify) (struct scm_device *scmdev, enum scm_event event); void (*handler) (struct scm_device *scmdev, void *data, blk_status_t error); diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 5775fc22f410..70a30ae258b7 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -91,29 +91,57 @@ /* Keep this the last entry. */ #define R_390_NUM 61 -/* Bits present in AT_HWCAP. 
*/ -#define HWCAP_S390_ESAN3 1 -#define HWCAP_S390_ZARCH 2 -#define HWCAP_S390_STFLE 4 -#define HWCAP_S390_MSA 8 -#define HWCAP_S390_LDISP 16 -#define HWCAP_S390_EIMM 32 -#define HWCAP_S390_DFP 64 -#define HWCAP_S390_HPAGE 128 -#define HWCAP_S390_ETF3EH 256 -#define HWCAP_S390_HIGH_GPRS 512 -#define HWCAP_S390_TE 1024 -#define HWCAP_S390_VXRS 2048 -#define HWCAP_S390_VXRS_BCD 4096 -#define HWCAP_S390_VXRS_EXT 8192 -#define HWCAP_S390_GS 16384 -#define HWCAP_S390_VXRS_EXT2 32768 -#define HWCAP_S390_VXRS_PDE 65536 -#define HWCAP_S390_SORT 131072 -#define HWCAP_S390_DFLT 262144 +enum { + HWCAP_NR_ESAN3 = 0, + HWCAP_NR_ZARCH = 1, + HWCAP_NR_STFLE = 2, + HWCAP_NR_MSA = 3, + HWCAP_NR_LDISP = 4, + HWCAP_NR_EIMM = 5, + HWCAP_NR_DFP = 6, + HWCAP_NR_HPAGE = 7, + HWCAP_NR_ETF3EH = 8, + HWCAP_NR_HIGH_GPRS = 9, + HWCAP_NR_TE = 10, + HWCAP_NR_VXRS = 11, + HWCAP_NR_VXRS_BCD = 12, + HWCAP_NR_VXRS_EXT = 13, + HWCAP_NR_GS = 14, + HWCAP_NR_VXRS_EXT2 = 15, + HWCAP_NR_VXRS_PDE = 16, + HWCAP_NR_SORT = 17, + HWCAP_NR_DFLT = 18, + HWCAP_NR_VXRS_PDE2 = 19, + HWCAP_NR_NNPA = 20, + HWCAP_NR_PCI_MIO = 21, + HWCAP_NR_SIE = 22, + HWCAP_NR_MAX +}; -/* Internal bits, not exposed via elf */ -#define HWCAP_INT_SIE 1UL +/* Bits present in AT_HWCAP. */ +#define HWCAP_ESAN3 BIT(HWCAP_NR_ESAN3) +#define HWCAP_ZARCH BIT(HWCAP_NR_ZARCH) +#define HWCAP_STFLE BIT(HWCAP_NR_STFLE) +#define HWCAP_MSA BIT(HWCAP_NR_MSA) +#define HWCAP_LDISP BIT(HWCAP_NR_LDISP) +#define HWCAP_EIMM BIT(HWCAP_NR_EIMM) +#define HWCAP_DFP BIT(HWCAP_NR_DFP) +#define HWCAP_HPAGE BIT(HWCAP_NR_HPAGE) +#define HWCAP_ETF3EH BIT(HWCAP_NR_ETF3EH) +#define HWCAP_HIGH_GPRS BIT(HWCAP_NR_HIGH_GPRS) +#define HWCAP_TE BIT(HWCAP_NR_TE) +#define HWCAP_VXRS BIT(HWCAP_NR_VXRS) +#define HWCAP_VXRS_BCD BIT(HWCAP_NR_VXRS_BCD) +#define HWCAP_VXRS_EXT BIT(HWCAP_NR_VXRS_EXT) +#define HWCAP_GS BIT(HWCAP_NR_GS) +#define HWCAP_VXRS_EXT2 BIT(HWCAP_NR_VXRS_EXT2) +#define HWCAP_VXRS_PDE BIT(HWCAP_NR_VXRS_PDE) +#define HWCAP_SORT BIT(HWCAP_NR_SORT) +#define HWCAP_DFLT BIT(HWCAP_NR_DFLT) +#define HWCAP_VXRS_PDE2 BIT(HWCAP_NR_VXRS_PDE2) +#define HWCAP_NNPA BIT(HWCAP_NR_NNPA) +#define HWCAP_PCI_MIO BIT(HWCAP_NR_PCI_MIO) +#define HWCAP_SIE BIT(HWCAP_NR_SIE) /* * These are used to set parameters in the core dumps. @@ -144,10 +172,6 @@ typedef s390_compat_regs compat_elf_gregset_t; #include <linux/sched/mm.h> /* for task_struct */ #include <asm/mmu_context.h> -#include <asm/vdso.h> - -extern unsigned int vdso_enabled; - /* * This is used to ensure we don't load something for the wrong architecture. */ @@ -176,7 +200,7 @@ struct arch_elf_state { !current->mm->context.alloc_pgste) { \ set_thread_flag(TIF_PGSTE); \ set_pt_regs_flag(task_pt_regs(current), \ - PIF_SYSCALL_RESTART); \ + PIF_EXECVE_PGSTE_RESTART); \ _state->rc = -EAGAIN; \ } \ _state->rc; \ @@ -213,10 +237,6 @@ struct arch_elf_state { extern unsigned long elf_hwcap; #define ELF_HWCAP (elf_hwcap) -/* Internal hardware capabilities, not exposed via elf */ - -extern unsigned long int_hwcap; - /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in intent than poking at uname or /proc/cpuinfo. 
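The HWCAP rework above pairs every capability with both a bit number (HWCAP_NR_*) and a mask (HWCAP_*), replacing the old hard-coded powers of two. Userspace sees the same bits through AT_HWCAP; a minimal sketch of a consumer follows (the program is illustrative, only the bit number 11 for VXRS is taken from the enum above):

	#include <stdio.h>
	#include <sys/auxv.h>

	int main(void)
	{
		unsigned long hwcap = getauxval(AT_HWCAP);

		/* HWCAP_NR_VXRS == 11 in the enum above */
		if (hwcap & (1UL << 11))
			puts("vector facility (VXRS) advertised");
		return 0;
	}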
@@ -233,8 +253,7 @@ extern char elf_platform[]; do { \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ - current->thread.sys_call_table = \ - (unsigned long) &sys_call_table; \ + current->thread.sys_call_table = sys_call_table; \ } while (0) #else /* CONFIG_COMPAT */ #define SET_PERSONALITY(ex) \ @@ -245,11 +264,11 @@ do { \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \ set_thread_flag(TIF_31BIT); \ current->thread.sys_call_table = \ - (unsigned long) &sys_call_table_emu; \ + sys_call_table_emu; \ } else { \ clear_thread_flag(TIF_31BIT); \ current->thread.sys_call_table = \ - (unsigned long) &sys_call_table; \ + sys_call_table; \ } \ } while (0) #endif /* CONFIG_COMPAT */ @@ -269,11 +288,10 @@ do { \ #define STACK_RND_MASK MMAP_RND_MASK /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO \ -do { \ - if (vdso_enabled) \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, \ - (unsigned long)current->mm->context.vdso_base); \ +#define ARCH_DLINFO \ +do { \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso_base); \ } while (0) struct linux_binprm; diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h new file mode 100644 index 000000000000..fdd319a622b0 --- /dev/null +++ b/arch/s390/include/asm/entry-common.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_S390_ENTRY_COMMON_H +#define ARCH_S390_ENTRY_COMMON_H + +#include <linux/sched.h> +#include <linux/audit.h> +#include <linux/randomize_kstack.h> +#include <linux/processor.h> +#include <linux/uaccess.h> +#include <asm/timex.h> +#include <asm/fpu/api.h> +#include <asm/pai.h> + +#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) + +void do_per_trap(struct pt_regs *regs); + +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + debug_user_asce(0); + + pai_kernel_enter(regs); +} + +#define arch_enter_from_user_mode arch_enter_from_user_mode + +static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_PER_TRAP) { + clear_thread_flag(TIF_PER_TRAP); + do_per_trap(regs); + } + + if (ti_work & _TIF_GUARDED_STORAGE) + gs_load_bc_cb(regs); +} + +#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work + +static __always_inline void arch_exit_to_user_mode(void) +{ + if (test_cpu_flag(CIF_FPU)) + __load_fpu_regs(); + + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + debug_user_asce(1); + + pai_kernel_exit(current_pt_regs()); +} + +#define arch_exit_to_user_mode arch_exit_to_user_mode + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + choose_random_kstack_offset(get_tod_clock_fast() & 0xff); +} + +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare + +#endif diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index ae27f756b409..af6ba52743e9 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -1,12 +1,20 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __S390_EXTABLE_H #define __S390_EXTABLE_H + +#include <asm/ptrace.h> +#include <linux/compiler.h> + /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. 
No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of three addresses: + * + * - Address of an instruction that is allowed to fault. + * - Address at which the program should continue. + * - Optional address of handler that takes pt_regs * argument and runs in + * interrupt context. + * + * No registers are modified, so it is entirely up to the continuation code + * to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -17,10 +25,11 @@ struct exception_table_entry { int insn, fixup; + short type, data; }; -extern struct exception_table_entry *__start_dma_ex_table; -extern struct exception_table_entry *__stop_dma_ex_table; +extern struct exception_table_entry *__start_amode31_ex_table; +extern struct exception_table_entry *__stop_amode31_ex_table; const struct exception_table_entry *s390_search_extables(unsigned long addr); @@ -31,4 +40,33 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) #define ARCH_HAS_RELATIVE_EXTABLE +static inline void swap_ex_entry_fixup(struct exception_table_entry *a, + struct exception_table_entry *b, + struct exception_table_entry tmp, + int delta) +{ + a->fixup = b->fixup + delta; + b->fixup = tmp.fixup - delta; + a->type = b->type; + b->type = tmp.type; + a->data = b->data; + b->data = tmp.data; +} +#define swap_ex_entry_fixup swap_ex_entry_fixup + +#ifdef CONFIG_BPF_JIT + +bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs); + +#else /* !CONFIG_BPF_JIT */ + +static inline bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + return false; +} + +#endif /* CONFIG_BPF_JIT */ + +bool fixup_exception(struct pt_regs *regs); + #endif diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index 68c476b20b57..796007125dff 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -9,11 +9,18 @@ #define __ASM_FACILITY_H #include <asm/facility-defs.h> + +#include <linux/minmax.h> #include <linux/string.h> +#include <linux/types.h> #include <linux/preempt.h> + #include <asm/lowcore.h> -#define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8) +#define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8) + +extern u64 stfle_fac_list[16]; +extern u64 alt_stfle_fac_list[16]; static inline void __set_facility(unsigned long nr, void *facilities) { @@ -44,7 +51,7 @@ static inline int __test_facility(unsigned long nr, void *facilities) } /* - * The test_facility function uses the bit odering where the MSB is bit 0. + * The test_facility function uses the bit ordering where the MSB is bit 0. * That makes it easier to query facility bits with the bit number as * documented in the Principles of Operation. 
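Because the MSB-0 numbering noted above is the opposite of the LSB-0 convention used by most Linux bitops, a self-contained user-space illustration of the byte/mask arithmetic behind __test_facility may help:

#include <stdio.h>

/* MSB-0: bit 0 is the most significant bit of byte 0, so facility
 * number nr lives in byte nr / 8 under mask 0x80 >> (nr % 8) */
static int test_facility_bit(unsigned long nr, const unsigned char *list)
{
	return (list[nr >> 3] & (0x80U >> (nr & 7))) != 0;
}

int main(void)
{
	unsigned char fac[4] = { 0x40, 0x00, 0x00, 0x00 }; /* only bit 1 set */

	printf("facility 0: %d\n", test_facility_bit(0, fac)); /* prints 0 */
	printf("facility 1: %d\n", test_facility_bit(1, fac)); /* prints 1 */
	return 0;
}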
*/ @@ -56,18 +63,20 @@ static inline int test_facility(unsigned long nr) if (__test_facility(nr, &facilities_als)) return 1; } - return __test_facility(nr, &S390_lowcore.stfle_fac_list); + return __test_facility(nr, &stfle_fac_list); } static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) { - register unsigned long reg0 asm("0") = size - 1; + unsigned long reg0 = size - 1; asm volatile( - ".insn s,0xb2b00000,0(%1)" /* stfle */ - : "+d" (reg0) - : "a" (stfle_fac_list) - : "memory", "cc"); + " lgr 0,%[reg0]\n" + " .insn s,0xb2b00000,%[list]\n" /* stfle */ + " lgr %[reg0],0\n" + : [reg0] "+&d" (reg0), [list] "+Q" (*stfle_fac_list) + : + : "memory", "cc", "0"); return reg0; } @@ -79,13 +88,15 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) static inline void __stfle(u64 *stfle_fac_list, int size) { unsigned long nr; + u32 stfl_fac_list; asm volatile( " stfl 0(0)\n" : "=m" (S390_lowcore.stfl_fac_list)); + stfl_fac_list = S390_lowcore.stfl_fac_list; + memcpy(stfle_fac_list, &stfl_fac_list, 4); nr = 4; /* bytes stored by stfl */ - memcpy(stfle_fac_list, &S390_lowcore.stfl_fac_list, 4); - if (S390_lowcore.stfl_fac_list & 0x01000000) { + if (stfl_fac_list & 0x01000000) { /* More facility bits available with stfle */ nr = __stfle_asm(stfle_fac_list, size); nr = min_t(unsigned long, (nr + 1) * 8, size * 8); @@ -100,4 +111,10 @@ static inline void stfle(u64 *stfle_fac_list, int size) preempt_enable(); } +/** + * stfle_size - Actual size of the facility list as specified by stfle + * (number of double words) + */ +unsigned int stfle_size(void); + #endif /* __ASM_FACILITY_H */ diff --git a/arch/s390/include/asm/fault.h b/arch/s390/include/asm/fault.h new file mode 100644 index 000000000000..d326f56603d6 --- /dev/null +++ b/arch/s390/include/asm/fault.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
1999, 2023 + */ +#ifndef _ASM_S390_FAULT_H +#define _ASM_S390_FAULT_H + +union teid { + unsigned long val; + struct { + unsigned long addr : 52; /* Translation-exception Address */ + unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ + unsigned long : 2; + unsigned long b56 : 1; + unsigned long : 3; + unsigned long b60 : 1; + unsigned long b61 : 1; + unsigned long as : 2; /* ASCE Identifier */ + }; +}; + +enum { + TEID_FSI_UNKNOWN = 0, /* Unknown whether fetch or store */ + TEID_FSI_STORE = 1, /* Exception was due to store operation */ + TEID_FSI_FETCH = 2 /* Exception was due to fetch operation */ +}; + +#endif /* _ASM_S390_FAULT_H */ diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h index cff0749e9657..29784b4b44f6 100644 --- a/arch/s390/include/asm/fcx.h +++ b/arch/s390/include/asm/fcx.h @@ -214,7 +214,7 @@ struct dcw_intrg_data { u32 :32; u64 time; u64 prog_id; - u8 prog_data[0]; + u8 prog_data[]; } __attribute__ ((packed)); #define DCW_FLAGS_CC (1 << (7 - 1)) @@ -241,7 +241,7 @@ struct dcw { u32 :8; u32 cd_count:8; u32 count; - u8 cd[0]; + u8 cd[]; } __attribute__ ((packed)); #define TCCB_FORMAT_DEFAULT 0x7f @@ -286,7 +286,7 @@ struct tccb_tcat { */ struct tccb { struct tccb_tcah tcah; - u8 tca[0]; + u8 tca[]; } __attribute__ ((packed, aligned(8))); struct tcw *tcw_get_intrg(struct tcw *tcw); diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h index 34a7ae68485c..d6ca8bc6ca68 100644 --- a/arch/s390/include/asm/fpu/api.h +++ b/arch/s390/include/asm/fpu/api.h @@ -45,24 +45,34 @@ #define _ASM_S390_FPU_API_H #include <linux/preempt.h> +#include <asm/asm-extable.h> +#include <asm/fpu/internal.h> void save_fpu_regs(void); +void load_fpu_regs(void); +void __load_fpu_regs(void); -static inline int test_fp_ctl(u32 fpc) +/** + * sfpc_safe - Set floating point control register safely. + * @fpc: new value for floating point control register + * + * Set floating point control register. This may lead to an exception, + * since a saved value may have been modified by user space (ptrace, + * signal return, kvm registers) to an invalid value. In such a case + * set the floating point control register to zero. 
+ */ +static inline void sfpc_safe(u32 fpc) { - u32 orig_fpc; - int rc; - - asm volatile( - " efpc %1\n" - " sfpc %2\n" - "0: sfpc %1\n" - " la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc), "=&d" (orig_fpc) - : "d" (fpc), "0" (-EINVAL)); - return rc; + asm volatile("\n" + "0: sfpc %[fpc]\n" + "1: nopr %%r7\n" + ".pushsection .fixup, \"ax\"\n" + "2: lghi %[fpc],0\n" + " jg 0b\n" + ".popsection\n" + EX_TABLE(1b, 2b) + : [fpc] "+d" (fpc) + : : "memory"); } #define KERNEL_FPC 1 @@ -76,7 +86,7 @@ static inline int test_fp_ctl(u32 fpc) #define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31) #define KERNEL_VXR (KERNEL_VXR_LOW|KERNEL_VXR_HIGH) -#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_V0V7) +#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_LOW) struct kernel_fpu; diff --git a/arch/s390/include/asm/fpu/internal.h b/arch/s390/include/asm/fpu/internal.h index 4a71dbbf76fb..d511c4cf5afb 100644 --- a/arch/s390/include/asm/fpu/internal.h +++ b/arch/s390/include/asm/fpu/internal.h @@ -10,9 +10,14 @@ #define _ASM_S390_FPU_INTERNAL_H #include <linux/string.h> -#include <asm/ctl_reg.h> +#include <asm/facility.h> #include <asm/fpu/types.h> +static inline bool cpu_has_vx(void) +{ + return likely(test_facility(129)); +} + static inline void save_vx_regs(__vector128 *vxrs) { asm volatile( @@ -27,7 +32,7 @@ static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs) int i; for (i = 0; i < __NUM_FPRS; i++) - fprs[i] = *(freg_t *)(vxrs + i); + fprs[i].ui = vxrs[i].high; } static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) @@ -35,14 +40,14 @@ static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) int i; for (i = 0; i < __NUM_FPRS; i++) - *(freg_t *)(vxrs + i) = fprs[i]; + vxrs[i].high = fprs[i].ui; } static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) { fpregs->pad = 0; fpregs->fpc = fpu->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs); else memcpy((freg_t *)&fpregs->fprs, fpu->fprs, @@ -52,7 +57,7 @@ static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu) { fpu->fpc = fpregs->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs); else memcpy(fpu->fprs, (freg_t *)&fpregs->fprs, diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 68d362f8d6c1..5a82b08f03cd 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -2,16 +2,9 @@ #ifndef _ASM_S390_FTRACE_H #define _ASM_S390_FTRACE_H +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR #define ARCH_SUPPORTS_FTRACE_OPS 1 - -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) #define MCOUNT_INSN_SIZE 6 -#else -#define MCOUNT_INSN_SIZE 24 -#define MCOUNT_RETURN_FIXUP 18 -#endif - -#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR #ifndef __ASSEMBLY__ @@ -22,72 +15,103 @@ #define ftrace_return_address(n) __builtin_return_address(n) #endif -void _mcount(void); void ftrace_caller(void); -extern char ftrace_graph_caller_end; -extern unsigned long ftrace_plt; +extern void *ftrace_func; struct dyn_arch_ftrace { }; -#define MCOUNT_ADDR ((unsigned long)_mcount) +#define MCOUNT_ADDR 0 #define FTRACE_ADDR ((unsigned long)ftrace_caller) #define KPROBE_ON_FTRACE_NOP 0 #define KPROBE_ON_FTRACE_CALL 1 +struct module; +struct dyn_ftrace; + +bool ftrace_need_init_nop(void); +#define ftrace_need_init_nop ftrace_need_init_nop + +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); 
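The ftrace_regs wrapper defined just below changes the contract for tracing callbacks: a full pt_regs snapshot is only available when explicitly requested. A hypothetical module-side sketch (the callback and init names are invented; the ftrace_ops API itself is generic kernel infrastructure):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/ftrace.h>

static void notrace trace_cb(unsigned long ip, unsigned long parent_ip,
			     struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	/* ftrace_get_regs() wraps arch_ftrace_get_regs() and returns NULL
	 * unless a full register snapshot was saved for this ops */
	struct pt_regs *regs = ftrace_get_regs(fregs);

	if (regs)
		pr_debug("traced %pS\n", (void *)regs->psw.addr);
}

static struct ftrace_ops trace_ops = {
	.func	= trace_cb,
	.flags	= FTRACE_OPS_FL_SAVE_REGS,
};

static int __init trace_cb_init(void)
{
	return register_ftrace_function(&trace_ops);
}
module_init(trace_cb_init);
MODULE_LICENSE("GPL");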
+#define ftrace_init_nop ftrace_init_nop + static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } -struct ftrace_insn { - u16 opc; - s32 disp; -} __packed; +struct ftrace_regs { + struct pt_regs regs; +}; -static inline void ftrace_generate_nop_insn(struct ftrace_insn *insn) +static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { -#ifdef CONFIG_FUNCTION_TRACER -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) - /* brcl 0,0 */ - insn->opc = 0xc004; - insn->disp = 0; -#else - /* jg .+24 */ - insn->opc = 0xc0f4; - insn->disp = MCOUNT_INSN_SIZE / 2; -#endif -#endif + struct pt_regs *regs = &fregs->regs; + + if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS)) + return regs; + return NULL; } -static inline int is_ftrace_nop(struct ftrace_insn *insn) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + unsigned long gpr2; + unsigned long fp; +}; + +static __always_inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) { -#ifdef CONFIG_FUNCTION_TRACER -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) - if (insn->disp == 0) - return 1; -#else - if (insn->disp == MCOUNT_INSN_SIZE / 2) - return 1; -#endif -#endif - return 0; + return ret_regs->gpr2; } -static inline void ftrace_generate_call_insn(struct ftrace_insn *insn, - unsigned long ip) +static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) { -#ifdef CONFIG_FUNCTION_TRACER - unsigned long target; + return ret_regs->fp; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - /* brasl r0,ftrace_caller */ - target = is_module_addr((void *) ip) ? ftrace_plt : FTRACE_ADDR; - insn->opc = 0xc005; - insn->disp = (target - ip) / 2; -#endif +static __always_inline unsigned long +ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) +{ + return fregs->regs.psw.addr; +} + +static __always_inline void +ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, + unsigned long ip) +{ + fregs->regs.psw.addr = ip; } +#define ftrace_regs_get_argument(fregs, n) \ + regs_get_kernel_argument(&(fregs)->regs, n) +#define ftrace_regs_get_stack_pointer(fregs) \ + kernel_stack_pointer(&(fregs)->regs) +#define ftrace_regs_return_value(fregs) \ + regs_return_value(&(fregs)->regs) +#define ftrace_regs_set_return_value(fregs, ret) \ + regs_set_return_value(&(fregs)->regs, ret) +#define ftrace_override_function_with_return(fregs) \ + override_function_with_return(&(fregs)->regs) +#define ftrace_regs_query_register_offset(name) \ + regs_query_register_offset(name) + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When an ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, + * place the direct caller in the ORIG_GPR2 part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) +{ + struct pt_regs *regs = &fregs->regs; + regs->orig_gpr2 = addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /* * Even though the system call numbers are identical for s390/s390x a * different system call table is used for compat tasks. 
This may lead @@ -114,4 +138,32 @@ static inline bool arch_syscall_match_sym_name(const char *sym, } #endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */ + +#ifndef CC_USING_HOTPATCH + +#define FTRACE_GEN_MCOUNT_RECORD(name) \ + .section __mcount_loc, "a", @progbits; \ + .quad name; \ + .previous; + +#else /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_MCOUNT_RECORD(name) + +#endif /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_NOP_ASM(name) \ + FTRACE_GEN_MCOUNT_RECORD(name) \ + FTRACE_NOP_INSN + +#else /* CONFIG_FUNCTION_TRACER */ + +#define FTRACE_GEN_NOP_ASM(name) + +#endif /* CONFIG_FUNCTION_TRACER */ + #endif /* _ASM_S390_FTRACE_H */ diff --git a/arch/s390/include/asm/ftrace.lds.h b/arch/s390/include/asm/ftrace.lds.h new file mode 100644 index 000000000000..968adfd41240 --- /dev/null +++ b/arch/s390/include/asm/ftrace.lds.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#define SIZEOF_MCOUNT_LOC_ENTRY 8 +#define SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE 24 +#define FTRACE_HOTPATCH_TRAMPOLINES_SIZE(n) \ + DIV_ROUND_UP(SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE * (n), \ + SIZEOF_MCOUNT_LOC_ENTRY) + +#ifdef CONFIG_FUNCTION_TRACER +#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT \ + . = ALIGN(8); \ + __ftrace_hotpatch_trampolines_start = .; \ + . = . + FTRACE_HOTPATCH_TRAMPOLINES_SIZE(__stop_mcount_loc - \ + __start_mcount_loc); \ + __ftrace_hotpatch_trampolines_end = .; +#else +#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT +#endif diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 5e97a4353147..eaeaeb3ff0be 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -4,6 +4,7 @@ #include <linux/uaccess.h> #include <linux/futex.h> +#include <asm/asm-extable.h> #include <asm/mmu_context.h> #include <asm/errno.h> @@ -16,7 +17,8 @@ "3: jl 1b\n" \ " lhi %0,0\n" \ "4: sacf 768\n" \ - EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ + EX_TABLE(0b,4b) EX_TABLE(1b,4b) \ + EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ "=m" (*uaddr) \ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ @@ -26,10 +28,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { int oldval = 0, newval, ret; - mm_segment_t old_fs; - old_fs = enable_sacf_uaccess(); - pagefault_disable(); switch (op) { case FUTEX_OP_SET: __futex_atomic_op("lr %2,%5\n", @@ -54,8 +53,6 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, default: ret = -ENOSYS; } - pagefault_enable(); - disable_sacf_uaccess(old_fs); if (!ret) *oval = oldval; @@ -66,10 +63,8 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - mm_segment_t old_fs; int ret; - old_fs = enable_sacf_uaccess(); asm volatile( " sacf 256\n" "0: cs %1,%4,0(%5)\n" @@ -79,7 +74,6 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "=d" (ret), "+d" (oldval), "=m" (*uaddr) : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) : "cc", "memory"); - disable_sacf_uaccess(old_fs); *uval = oldval; return ret; } diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 37f96b6f0e61..5cc46e0dde62 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -9,6 +9,7 @@ #ifndef _ASM_S390_GMAP_H #define 
_ASM_S390_GMAP_H +#include <linux/radix-tree.h> #include <linux/refcount.h> /* Generic bits for GMAP notification on DAT table entry changes. */ @@ -31,6 +32,7 @@ * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @guest_handle: protected virtual machine handle for the ultravisor * @host_to_rmap: radix tree with gmap_rmap lists * @children: list of shadow gmap structures * @pt_list: list of all page tables used in the shadow guest address space @@ -54,6 +56,8 @@ struct gmap { unsigned long asce_end; void *private; bool pfault_enabled; + /* only set for protected virtual machines */ + unsigned long guest_handle; /* Additional data for shadow guest address spaces */ struct radix_tree_root host_to_rmap; struct list_head children; @@ -136,12 +140,49 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); -void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, - unsigned long bits); int gmap_mprotect_notify(struct gmap *, unsigned long start, unsigned long len, int prot); void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); +int gmap_mark_unmergeable(void); +void s390_unlist_old_asce(struct gmap *gmap); +int s390_replace_asce(struct gmap *gmap); +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible); + +/** + * s390_uv_destroy_range - Destroy a range of pages in the given mm. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls, but + * it will otherwise only return when it completed. + */ +static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + (void)__s390_uv_destroy_range(mm, start, end, false); +} + +/** + * s390_uv_destroy_range_interruptible - Destroy a range of pages in the + * given mm, but stop when a fatal signal is received. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls. If + * a fatal signal is received, it will return with -EINTR immediately, + * without finishing destroying the whole range. Upon successful + * completion, 0 is returned. 
+ */ +static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return __s390_uv_destroy_range(mm, start, end, true); +} #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h index dfbc3c6c0674..58668ffb5488 100644 --- a/arch/s390/include/asm/hardirq.h +++ b/arch/s390/include/asm/hardirq.h @@ -18,7 +18,6 @@ #define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x)) #define __ARCH_IRQ_STAT -#define __ARCH_HAS_DO_SOFTIRQ #define __ARCH_IRQ_EXIT_IRQS_DISABLED static inline void ack_bad_irq(unsigned int irq) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index de8f0bf5f238..deb198a61039 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -9,25 +9,20 @@ #ifndef _ASM_S390_HUGETLB_H #define _ASM_S390_HUGETLB_H +#include <linux/pgtable.h> #include <asm/page.h> -#include <asm/pgtable.h> #define hugetlb_free_pgd_range free_pgd_range #define hugepages_supported() (MACHINE_HAS_EDAT1) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline bool is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return false; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. @@ -35,9 +30,11 @@ static inline bool is_hugepage_only_range(struct mm_struct *mm, static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { - if (len & ~HPAGE_MASK) + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) return -EINVAL; - if (addr & ~HPAGE_MASK) + if (addr & ~huge_page_mask(h)) return -EINVAL; return 0; } @@ -46,20 +43,21 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_arch_1, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pte_val(*ptep) = _REGION3_ENTRY_EMPTY; + set_pte(ptep, __pte(_REGION3_ENTRY_EMPTY)); else - pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY; + set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); } -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, @@ -69,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(ptep), pte); if (changed) { huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; } @@ -78,7 +76,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); - set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); + 
__set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) @@ -91,6 +89,11 @@ static inline int huge_pte_none(pte_t pte) return pte_none(pte); } +static inline int huge_pte_none_mostly(pte_t pte) +{ + return huge_pte_none(pte); +} + static inline int huge_pte_write(pte_t pte) { return pte_write(pte); @@ -103,7 +106,7 @@ static inline int huge_pte_dirty(pte_t pte) static inline pte_t huge_pte_mkwrite(pte_t pte) { - return pte_mkwrite(pte); + return pte_mkwrite_novma(pte); } static inline pte_t huge_pte_mkdirty(pte_t pte) @@ -121,6 +124,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +static inline pte_t huge_pte_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static inline int huge_pte_uffd_wp(pte_t pte) +{ + return 0; +} + static inline bool gigantic_page_runtime_supported(void) { return true; diff --git a/arch/s390/include/asm/hw_irq.h b/arch/s390/include/asm/hw_irq.h index adae176757ae..9078b5b6b837 100644 --- a/arch/s390/include/asm/hw_irq.h +++ b/arch/s390/include/asm/hw_irq.h @@ -7,6 +7,5 @@ void __init init_airq_interrupts(void); void __init init_cio_interrupts(void); -void __init init_ext_interrupts(void); #endif diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index 6fb7aced104a..59fcc3c72edf 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -23,6 +23,9 @@ #define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */ #define IDA_BLOCK_SIZE (1L<<IDA_SIZE_LOG) +#define IDA_2K_SIZE_LOG 11 +#define IDA_2K_BLOCK_SIZE (1L << IDA_2K_SIZE_LOG) + /* * Test if an address/length pair needs an idal list. */ @@ -43,6 +46,15 @@ static inline unsigned int idal_nr_words(void *vaddr, unsigned int length) } /* + * Return the number of 2K IDA words needed for an address/length pair. + */ +static inline unsigned int idal_2k_nr_words(void *vaddr, unsigned int length) +{ + return ((__pa(vaddr) & (IDA_2K_BLOCK_SIZE - 1)) + length + + (IDA_2K_BLOCK_SIZE - 1)) >> IDA_2K_SIZE_LOG; +} + +/* * Create the list of idal words for an address/length pair. 
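To make the rounding in idal_2k_nr_words above concrete, here is a user-space replica of the formula (treating __pa() as the identity, an illustration-only assumption):

#include <stdio.h>

#define IDA_2K_SIZE_LOG		11
#define IDA_2K_BLOCK_SIZE	(1UL << IDA_2K_SIZE_LOG)

static unsigned int idal_2k_nr_words(unsigned long pa, unsigned int length)
{
	return ((pa & (IDA_2K_BLOCK_SIZE - 1)) + length +
		(IDA_2K_BLOCK_SIZE - 1)) >> IDA_2K_SIZE_LOG;
}

int main(void)
{
	/* a 4KiB buffer starting 0x300 into a 2KiB block straddles three
	 * 2KiB blocks, so three IDA words are needed */
	printf("%u\n", idal_2k_nr_words(0x10300, 0x1000)); /* prints 3 */
	return 0;
}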
*/ static inline unsigned long *idal_create_words(unsigned long *idaws, @@ -108,7 +120,7 @@ clear_normalized_cda(struct ccw1 * ccw) struct idal_buffer { size_t size; size_t page_order; - void *data[0]; + void *data[]; }; /* diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 6d4226dcf42a..09f763b9eb40 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -10,21 +10,18 @@ #include <linux/types.h> #include <linux/device.h> -#include <linux/seqlock.h> struct s390_idle_data { - seqcount_t seqcount; - unsigned long long idle_count; - unsigned long long idle_time; - unsigned long long clock_idle_enter; - unsigned long long clock_idle_exit; - unsigned long long timer_idle_enter; - unsigned long long timer_idle_exit; + unsigned long idle_count; + unsigned long idle_time; + unsigned long clock_idle_enter; + unsigned long timer_idle_enter; + unsigned long mt_cycles_enter[8]; }; extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; -void psw_idle(struct s390_idle_data *, unsigned long); +void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); #endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 5a16f500515a..4453ad7c11ac 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <asm/page.h> +#include <asm/pgtable.h> #include <asm/pci_io.h> #define xlate_dev_mem_ptr xlate_dev_mem_ptr @@ -19,15 +20,20 @@ void *xlate_dev_mem_ptr(phys_addr_t phys); #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define IO_SPACE_LIMIT 0 + /* - * Convert a virtual cached pointer to an uncached pointer + * I/O memory mapping functions. 
*/ -#define xlate_dev_kmem_ptr(p) p +#define ioremap_prot ioremap_prot +#define iounmap iounmap -#define IO_SPACE_LIMIT 0 +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) -void __iomem *ioremap(unsigned long offset, unsigned long size); -void iounmap(volatile void __iomem *addr); +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) +#define ioremap_wt(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 084e71b7272a..b0d00032479d 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -12,6 +12,7 @@ #include <asm/types.h> #include <asm/cio.h> #include <asm/setup.h> +#include <asm/page.h> #include <uapi/asm/ipl.h> struct ipl_parameter_block { @@ -21,6 +22,8 @@ struct ipl_parameter_block { struct ipl_pb0_common common; struct ipl_pb0_fcp fcp; struct ipl_pb0_ccw ccw; + struct ipl_pb0_eckd eckd; + struct ipl_pb0_nvme nvme; char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; }; } __packed __aligned(PAGE_SIZE); @@ -30,10 +33,19 @@ struct ipl_parameter_block { #define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \ sizeof(struct ipl_pb0_fcp)) #define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp)) + +#define IPL_BP_NVME_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_nvme)) +#define IPL_BP0_NVME_LEN (sizeof(struct ipl_pb0_nvme)) + #define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \ sizeof(struct ipl_pb0_ccw)) #define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) +#define IPL_BP_ECKD_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_eckd)) +#define IPL_BP0_ECKD_LEN (sizeof(struct ipl_pb0_eckd)) + #define IPL_MAX_SUPPORTED_VERSION (0) #define IPL_RB_CERT_UNKNOWN ((unsigned short)-1) @@ -59,6 +71,10 @@ enum ipl_type { IPL_TYPE_FCP = 4, IPL_TYPE_FCP_DUMP = 8, IPL_TYPE_NSS = 16, + IPL_TYPE_NVME = 32, + IPL_TYPE_NVME_DUMP = 64, + IPL_TYPE_ECKD = 128, + IPL_TYPE_ECKD_DUMP = 256, }; struct ipl_info @@ -70,10 +86,17 @@ struct ipl_info } ccw; struct { struct ccw_dev_id dev_id; + } eckd; + struct { + struct ccw_dev_id dev_id; u64 wwpn; u64 lun; } fcp; struct { + u32 fid; + u32 nsid; + } nvme; + struct { char name[NSS_NAME_SIZE + 1]; } nss; } data; @@ -83,6 +106,13 @@ extern struct ipl_info ipl_info; extern void setup_ipl(void); extern void set_os_info_reipl_block(void); +static inline bool is_ipl_type_dump(void) +{ + return (ipl_info.type == IPL_TYPE_FCP_DUMP) || + (ipl_info.type == IPL_TYPE_ECKD_DUMP) || + (ipl_info.type == IPL_TYPE_NVME_DUMP); +} + struct ipl_report { struct ipl_parameter_block *ipib; struct list_head components; @@ -114,11 +144,18 @@ int ipl_report_add_certificate(struct ipl_report *report, void *key, * DIAG 308 support */ enum diag308_subcode { + DIAG308_CLEAR_RESET = 0, + DIAG308_LOAD_NORMAL_RESET = 1, DIAG308_REL_HSA = 2, DIAG308_LOAD_CLEAR = 3, DIAG308_LOAD_NORMAL_DUMP = 4, DIAG308_SET = 5, DIAG308_STORE = 6, + DIAG308_LOAD_NORMAL = 7, +}; + +enum diag308_subcode_flags { + DIAG308_FLAG_EI = 1UL << 16, }; enum diag308_rc { diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 9f75d67b8c20..54b42817f70a 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -31,6 +31,7 @@ #include <linux/percpu.h> #include <linux/cache.h> #include <linux/types.h> +#include <asm/ctlreg.h> enum interruption_class { IRQEXT_CLK, @@ -81,8 +82,13 @@ static __always_inline void 
inc_irq_stat(enum interruption_class irq) } struct ext_code { - unsigned short subcode; - unsigned short code; + union { + struct { + unsigned short subcode; + unsigned short code; + }; + unsigned int int_code; + }; }; typedef void (*ext_int_handler_t)(struct ext_code, unsigned int, unsigned long); @@ -96,17 +102,17 @@ enum irq_subclass { }; #define CR0_IRQ_SUBCLASS_MASK \ - ((1UL << (63 - 30)) /* Warning Track */ | \ - (1UL << (63 - 48)) /* Malfunction Alert */ | \ - (1UL << (63 - 49)) /* Emergency Signal */ | \ - (1UL << (63 - 50)) /* External Call */ | \ - (1UL << (63 - 52)) /* Clock Comparator */ | \ - (1UL << (63 - 53)) /* CPU Timer */ | \ - (1UL << (63 - 54)) /* Service Signal */ | \ - (1UL << (63 - 57)) /* Interrupt Key */ | \ - (1UL << (63 - 58)) /* Measurement Alert */ | \ - (1UL << (63 - 59)) /* Timing Alert */ | \ - (1UL << (63 - 62))) /* IUCV */ + (CR0_WARNING_TRACK | \ + CR0_MALFUNCTION_ALERT_SUBMASK | \ + CR0_EMERGENCY_SIGNAL_SUBMASK | \ + CR0_EXTERNAL_CALL_SUBMASK | \ + CR0_CLOCK_COMPARATOR_SUBMASK | \ + CR0_CPU_TIMER_SUBMASK | \ + CR0_SERVICE_SIGNAL_SUBMASK | \ + CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK | \ + CR0_ETR_SUBMASK | \ + CR0_IUCV) void irq_subclass_register(enum irq_subclass subclass); void irq_subclass_unregister(enum irq_subclass subclass); diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h new file mode 100644 index 000000000000..f00c9f610d5a --- /dev/null +++ b/arch/s390/include/asm/irq_work.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_IRQ_WORK_H +#define _ASM_S390_IRQ_WORK_H + +static inline bool arch_irq_work_has_interrupt(void) +{ + return true; +} + +#endif /* _ASM_S390_IRQ_WORK_H */ diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h index 586df4c9e2f2..02427b205c11 100644 --- a/arch/s390/include/asm/irqflags.h +++ b/arch/s390/include/asm/irqflags.h @@ -32,45 +32,45 @@ }) /* set system mask. 
*/ -static inline notrace void __arch_local_irq_ssm(unsigned long flags) +static __always_inline void __arch_local_irq_ssm(unsigned long flags) { asm volatile("ssm %0" : : "Q" (flags) : "memory"); } -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return __arch_local_irq_stnsm(0xff); } -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { return __arch_local_irq_stnsm(0xfc); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { arch_local_irq_save(); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void arch_local_irq_enable(void) { __arch_local_irq_stosm(0x03); } /* This only restores external and I/O interrupt state */ -static inline notrace void arch_local_irq_restore(unsigned long flags) +static __always_inline void arch_local_irq_restore(unsigned long flags) { /* only disabled->disabled and disabled->enabled is valid */ if (flags & ARCH_IRQ_ENABLED) arch_local_irq_enable(); } -static inline notrace bool arch_irqs_disabled_flags(unsigned long flags) +static __always_inline bool arch_irqs_disabled_flags(unsigned long flags) { return !(flags & ARCH_IRQ_ENABLED); } -static inline notrace bool arch_irqs_disabled(void) +static __always_inline bool arch_irqs_disabled(void) { return arch_irqs_disabled_flags(arch_local_save_flags()); } diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 39f747d63758..895f774bbcc5 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -2,27 +2,30 @@ #ifndef _ASM_S390_JUMP_LABEL_H #define _ASM_S390_JUMP_LABEL_H +#define HAVE_JUMP_LABEL_BATCH + #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/stringify.h> #define JUMP_LABEL_NOP_SIZE 6 -#define JUMP_LABEL_NOP_OFFSET 2 -#if __GNUC__ < 9 +#ifdef CONFIG_CC_IS_CLANG +#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "i" +#elif __GNUC__ < 9 #define JUMP_LABEL_STATIC_KEY_CONSTRAINT "X" #else #define JUMP_LABEL_STATIC_KEY_CONSTRAINT "jdd" #endif /* - * We use a brcl 0,2 instruction for jump labels at compile time so it + * We use a brcl 0,<offset> instruction for jump labels so it * can be easily distinguished from a hotpatch generated instruction. 
*/ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 0,"__stringify(JUMP_LABEL_NOP_OFFSET)"\n" + asm_volatile_goto("0: brcl 0,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h index 70930fe5c496..0cffead0f2f2 100644 --- a/arch/s390/include/asm/kasan.h +++ b/arch/s390/include/asm/kasan.h @@ -2,29 +2,17 @@ #ifndef __ASM_KASAN_H #define __ASM_KASAN_H -#include <asm/pgtable.h> +#include <linux/const.h> #ifdef CONFIG_KASAN #define KASAN_SHADOW_SCALE_SHIFT 3 -#ifdef CONFIG_KASAN_S390_4_LEVEL_PAGING #define KASAN_SHADOW_SIZE \ (_AC(1, UL) << (_REGION1_SHIFT - KASAN_SHADOW_SCALE_SHIFT)) -#else -#define KASAN_SHADOW_SIZE \ - (_AC(1, UL) << (_REGION2_SHIFT - KASAN_SHADOW_SCALE_SHIFT)) -#endif #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_START KASAN_SHADOW_OFFSET #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) -extern void kasan_early_init(void); -extern void kasan_copy_shadow(pgd_t *dst); -extern void kasan_free_early_identity(void); -#else -static inline void kasan_early_init(void) { } -static inline void kasan_copy_shadow(pgd_t *dst) { } -static inline void kasan_free_early_identity(void) { } #endif #endif diff --git a/arch/s390/include/asm/kdebug.h b/arch/s390/include/asm/kdebug.h index d5327f064799..4377238e4752 100644 --- a/arch/s390/include/asm/kdebug.h +++ b/arch/s390/include/asm/kdebug.h @@ -23,6 +23,6 @@ enum die_val { DIE_NMI_IPI, }; -extern void die(struct pt_regs *, const char *); +extern void __noreturn die(struct pt_regs *, const char *); #endif diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index ea398a05f643..1bd08eb56d5f 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -9,6 +9,8 @@ #ifndef _S390_KEXEC_H #define _S390_KEXEC_H +#include <linux/module.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/setup.h> @@ -29,7 +31,7 @@ #define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31) /* Allocate control page with GFP_DMA */ -#define KEXEC_CONTROL_MEMORY_GFP GFP_DMA +#define KEXEC_CONTROL_MEMORY_GFP (GFP_DMA | __GFP_NORETRY) /* Maximum address we can use for the crash control pages */ #define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL) @@ -74,7 +76,35 @@ void *kexec_file_add_components(struct kimage *image, int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, unsigned long addr); +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + void *ipl_buf; +}; + extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; +#ifdef CONFIG_CRASH_DUMP +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); +#define crash_free_reserved_phys_range crash_free_reserved_phys_range + +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres +#endif + +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup +#endif 
#endif /*_S390_KEXEC_H */ diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h new file mode 100644 index 000000000000..e47fd8cbe701 --- /dev/null +++ b/arch/s390/include/asm/kfence.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_KFENCE_H +#define _ASM_S390_KFENCE_H + +#include <linux/mm.h> +#include <linux/kfence.h> +#include <asm/set_memory.h> +#include <asm/page.h> + +void __kernel_map_pages(struct page *page, int numpages, int enable); + +static __always_inline bool arch_kfence_init_pool(void) +{ + return true; +} + +#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK) + +/* + * Do not split kfence pool to 4k mapping with arch_kfence_init_pool(), + * but earlier where page table allocations still happen with memblock. + * Reason is that arch_kfence_init_pool() gets called when the system + * is still in a limbo state - disabling and enabling bottom halves is + * not yet allowed, but that is what our page_table_alloc() would do. + */ +static __always_inline void kfence_split_mapping(void) +{ +#ifdef CONFIG_KFENCE + unsigned long pool_pages = KFENCE_POOL_SIZE >> PAGE_SHIFT; + + set_memory_4k((unsigned long)__kfence_pool, pool_pages); +#endif +} + +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + __kernel_map_pages(virt_to_page((void *)addr), 1, !protect); + return true; +} + +#endif /* _ASM_S390_KFENCE_H */ diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index b106aa29bf55..01f1682a73b7 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -15,6 +15,7 @@ * <grundym@us.ibm.com> */ #include <linux/types.h> +#include <asm/ctlreg.h> #include <asm-generic/kprobes.h> #define BREAKPOINT_INSTRUCTION 0x0002 @@ -54,7 +55,6 @@ typedef u16 kprobe_opcode_t; struct arch_specific_insn { /* copy of original instruction */ kprobe_opcode_t *insn; - unsigned int is_ftrace_insn : 1; }; struct prev_kprobe { @@ -66,16 +66,13 @@ struct prev_kprobe { struct kprobe_ctlblk { unsigned long kprobe_status; unsigned long kprobe_saved_imask; - unsigned long kprobe_saved_ctl[3]; + struct ctlreg kprobe_saved_ctl[3]; struct prev_kprobe prev_kprobe; }; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); int kprobe_fault_handler(struct pt_regs *regs, int trapnr); -int kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data); #define flush_insn_slot(p) do { } while (0) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 02f4c21c57f6..52664105a473 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -19,6 +19,8 @@ #include <linux/kvm.h> #include <linux/seqlock.h> #include <linux/module.h> +#include <linux/pci.h> +#include <linux/mmu_notifier.h> #include <asm/debug.h> #include <asm/cpu.h> #include <asm/fpu/api.h> @@ -28,15 +30,14 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 -#define KVM_USER_MEM_SLOTS 32 /* - * These seem to be used for allocating ->chip in the routing table, - * which we don't use. 4096 is an out-of-thin-air value. If we need - * to look at ->chip later on, we'll need to revisit this. + * These seem to be used for allocating ->chip in the routing table, which we + * don't use. 1 is as small as we can get to reduce the needed memory. If we + * need to look at ->chip later on, we'll need to revisit this. 
*/ #define KVM_NR_IRQCHIPS 1 -#define KVM_IRQCHIP_NUM_PINS 4096 +#define KVM_IRQCHIP_NUM_PINS 1 #define KVM_HALT_POLL_NS_DEFAULT 50000 /* s390-specific vcpu->requests bit members */ @@ -46,6 +47,8 @@ #define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3) #define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4) #define KVM_REQ_VSIE_RESTART KVM_ARCH_REQ(5) +#define KVM_REQ_REFRESH_GUEST_PREFIX \ + KVM_ARCH_REQ_FLAGS(6, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -92,19 +95,30 @@ union ipte_control { }; }; +union sca_utility { + __u16 val; + struct { + __u16 mtcr : 1; + __u16 reserved : 15; + }; +}; + struct bsca_block { union ipte_control ipte_control; __u64 reserved[5]; __u64 mcn; - __u64 reserved2; + union sca_utility utility; + __u8 reserved2[6]; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; }; struct esca_block { union ipte_control ipte_control; - __u64 reserved1[7]; + __u64 reserved1[6]; + union sca_utility utility; + __u8 reserved2[6]; __u64 mcn[4]; - __u64 reserved2[20]; + __u64 reserved3[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; }; @@ -122,6 +136,16 @@ struct mcck_volatile_info { __u32 reserved; }; +#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK) +#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ + CR14_EXTERNAL_DAMAGE_SUBMASK) + +#define SIDAD_SIZE_MASK 0xff +#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -155,7 +179,13 @@ struct kvm_s390_sie_block { __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ - __u8 reserved10[16]; /* 0x0010 */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; #define PROG_BLOCK_SIE (1<<0) #define PROG_REQUEST (1<<1) atomic_t prog20; /* 0x0020 */ @@ -204,10 +234,23 @@ struct kvm_s390_sie_block { #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 #define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 __u8 icptcode; /* 0x0050 */ __u8 icptstatus; /* 0x0051 */ __u16 ihcpu; /* 0x0052 */ - __u8 reserved54[2]; /* 0x0054 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ __u16 ipa; /* 0x0056 */ __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ @@ -215,22 +258,29 @@ struct kvm_s390_sie_block { __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 +#define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 __u8 ecb; /* 0x0061 */ #define ECB2_CMMA 0x80 #define ECB2_IEP 0x20 #define ECB2_PFMFI 0x08 #define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 #define ECB3_DEA 0x08 #define ECB3_AES 0x04 #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU __u32 scaol; /* 0x0064 */ - __u8 reserved68; /* 0x0068 */ + __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ - __u8 reserved6a[2]; /* 0x006a */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ __u32 
todpr; /* 0x006c */ #define GISA_FORMAT1 0x00000001 __u32 gd; /* 0x0070 */ @@ -244,31 +294,58 @@ struct kvm_s390_sie_block { #define HPID_KVM 0x4 #define HPID_VSIE 0x5 __u8 hpid; /* 0x00b8 */ - __u8 reservedb9[11]; /* 0x00b9 */ - __u16 extcpuaddr; /* 0x00c4 */ - __u16 eic; /* 0x00c6 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; __u32 reservedc8; /* 0x00c8 */ - __u16 pgmilc; /* 0x00cc */ - __u16 iprcc; /* 0x00ce */ - __u32 dxc; /* 0x00d0 */ - __u16 mcn; /* 0x00d4 */ - __u8 perc; /* 0x00d6 */ - __u8 peratmid; /* 0x00d7 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; __u64 peraddr; /* 0x00d8 */ __u8 eai; /* 0x00e0 */ __u8 peraid; /* 0x00e1 */ __u8 oai; /* 0x00e2 */ __u8 armid; /* 0x00e3 */ __u8 reservede4[4]; /* 0x00e4 */ - __u64 tecmc; /* 0x00e8 */ - __u8 reservedf0[12]; /* 0x00f0 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ #define CRYCB_FORMAT_MASK 0x00000003 #define CRYCB_FORMAT0 0x00000000 #define CRYCB_FORMAT1 0x00000001 #define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ - __u64 gbea; /* 0x0180 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; __u8 reserved188[8]; /* 0x0188 */ __u64 sdnxo; /* 0x0190 */ __u8 reserved198[8]; /* 0x0198 */ @@ -287,7 +364,7 @@ struct kvm_s390_sie_block { __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ __u64 gvrd; /* 0x01f8 */ -} __attribute__((packed)); +} __packed __aligned(512); struct kvm_s390_itdb { __u8 data[256]; @@ -296,12 +373,15 @@ struct kvm_s390_itdb { struct sie_page { struct kvm_s390_sie_block sie_block; struct mcck_volatile_info mcck_info; /* 0x0200 */ - __u8 reserved218[1000]; /* 0x0218 */ + __u8 reserved218[360]; /* 0x0218 */ + __u64 pv_grregs[16]; /* 0x0380 */ + __u8 reserved400[512]; /* 0x0400 */ struct kvm_s390_itdb itdb; /* 0x0600 */ __u8 reserved700[2304]; /* 0x0700 */ }; struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; u64 exit_userspace; u64 exit_null; u64 exit_external_request; @@ -311,11 +391,7 @@ struct kvm_vcpu_stat { u64 exit_validity; u64 exit_instruction; u64 exit_pei; - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_invalid; u64 halt_no_poll_steal; - u64 halt_wakeup; u64 instruction_lctl; u64 instruction_lctlg; u64 instruction_stctl; @@ -389,14 +465,16 @@ struct kvm_vcpu_stat { u64 instruction_sigp_init_cpu_reset; u64 instruction_sigp_cpu_reset; u64 instruction_sigp_unknown; - u64 diagnose_10; - u64 diagnose_44; - u64 diagnose_9c; - u64 diagnose_9c_ignored; - u64 diagnose_258; - u64 diagnose_308; - u64 diagnose_500; - u64 diagnose_other; + u64 instruction_diagnose_10; + u64 instruction_diagnose_44; + u64 instruction_diagnose_9c; + u64 diag_9c_ignored; + u64 diag_9c_forward; + u64 instruction_diagnose_258; + u64 instruction_diagnose_308; + u64 instruction_diagnose_500; + u64 instruction_diagnose_other; + u64 pfault_sync; }; #define PGM_OPERATION 0x01 @@ -471,6 +549,7 @@ enum irq_types { IRQ_PEND_PFAULT_INIT, IRQ_PEND_EXT_HOST, IRQ_PEND_EXT_SERVICE, + 
IRQ_PEND_EXT_SERVICE_EV, IRQ_PEND_EXT_TIMING, IRQ_PEND_EXT_CPU_TIMER, IRQ_PEND_EXT_CLOCK_COMP, @@ -515,6 +594,7 @@ enum irq_types { (1UL << IRQ_PEND_EXT_TIMING) | \ (1UL << IRQ_PEND_EXT_HOST) | \ (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV) | \ (1UL << IRQ_PEND_VIRTIO) | \ (1UL << IRQ_PEND_PFAULT_INIT) | \ (1UL << IRQ_PEND_PFAULT_DONE)) @@ -531,6 +611,13 @@ enum irq_types { #define IRQ_PEND_MCHK_MASK ((1UL << IRQ_PEND_MCHK_REP) | \ (1UL << IRQ_PEND_MCHK_EX)) +#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER) | \ + (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \ + (1UL << IRQ_PEND_EXT_EMERGENCY) | \ + (1UL << IRQ_PEND_EXT_EXTERNAL) | \ + (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV)) + struct kvm_s390_interrupt_info { struct list_head list; u64 type; @@ -589,6 +676,7 @@ struct kvm_s390_local_interrupt { struct kvm_s390_float_interrupt { unsigned long pending_irqs; + unsigned long masked_irqs; spinlock_t lock; struct list_head lists[FIRQ_LIST_COUNT]; int counters[FIRQ_MAX_COUNT]; @@ -628,6 +716,10 @@ struct kvm_hw_bp_info_arch { #define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \ (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING)) +#define KVM_GUESTDBG_VALID_MASK \ + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\ + KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING) + struct kvm_guestdbg_info_arch { unsigned long cr0; unsigned long cr9; @@ -640,6 +732,11 @@ struct kvm_guestdbg_info_arch { unsigned long last_bp; }; +struct kvm_s390_pv_vcpu { + u64 handle; + unsigned long stor_base; +}; + struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; /* if vsie is active, currently executed shadow sie control block */ @@ -668,15 +765,25 @@ struct kvm_vcpu_arch { __u64 cputm_start; bool gs_enabled; bool skey_enabled; + struct kvm_s390_pv_vcpu pv; + union diag318_info diag318_info; }; struct kvm_vm_stat { + struct kvm_vm_stat_generic generic; u64 inject_io; u64 inject_float_mchk; u64 inject_pfault_done; u64 inject_service_signal; u64 inject_virtio; - u64 remote_tlb_flush; + u64 aen_forward; + u64 gmap_shadow_create; + u64 gmap_shadow_reuse; + u64 gmap_shadow_r1_entry; + u64 gmap_shadow_r2_entry; + u64 gmap_shadow_r3_entry; + u64 gmap_shadow_sg_entry; + u64 gmap_shadow_pg_entry; }; struct kvm_arch_memory_slot { @@ -696,9 +803,6 @@ struct s390_io_adapter { bool masked; bool swap; bool suppressible; - struct rw_semaphore maps_lock; - struct list_head maps; - atomic_t nr_maps; }; #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) @@ -714,22 +818,22 @@ struct s390_io_adapter { struct kvm_s390_cpu_model { /* facility mask supported by kvm & hosting machine */ - __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + __u64 fac_mask[S390_ARCH_FAC_MASK_SIZE_U64]; struct kvm_s390_vm_cpu_subfunc subfuncs; /* facility list requested by guest (in dma page) */ __u64 *fac_list; u64 cpuid; unsigned short ibc; + /* subset of available UV-features for pv-guests enabled by user space */ + struct kvm_s390_vm_cpu_uv_feat uv_feat_guest; }; -struct kvm_s390_module_hook { - int (*hook)(struct kvm_vcpu *vcpu); - struct module *owner; -}; +typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); struct kvm_s390_crypto { struct kvm_s390_crypto_cb *crycb; - struct kvm_s390_module_hook *pqap_hook; + struct rw_semaphore pqap_hook_rwsem; + crypto_hook *pqap_hook; __u32 crycbd; __u8 aes_kw; __u8 dea_kw; @@ -841,6 +945,17 @@ struct kvm_s390_gisa_interrupt { DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); }; +struct kvm_s390_pv { + u64 handle; + u64 guest_len; + unsigned long stor_base; + 
void *stor_var; + bool dumping; + void *set_aside; + struct list_head need_cleanup; + struct mmu_notifier mmu_notifier; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -855,6 +970,7 @@ struct kvm_arch{ int use_cmma; int use_pfmfi; int use_skf; + int use_zpci_interp; int user_cpu_state_ctrl; int user_sigp; int user_stsi; @@ -874,8 +990,12 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; + struct list_head kzdev_list; + spinlock_t kzdev_list_lock; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -891,33 +1011,42 @@ struct kvm_arch_async_pf { unsigned long pfault_token; }; -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +static inline void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) {} + void kvm_arch_crypto_clear_masks(struct kvm *kvm); void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm); -extern int sie64a(struct kvm_s390_sie_block *, u64 *); +int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa); + +static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa) +{ + return __sie64a(virt_to_phys(sie_block), sie_block, rsa); +} + extern char sie_exit; +bool kvm_s390_pv_is_protected(struct kvm *kvm); +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu); + extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, - struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} + struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -925,6 +1054,14 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} -void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu); +#define __KVM_HAVE_ARCH_VM_FREE +void kvm_arch_free_vm(struct kvm *kvm); + +struct zpci_kvm_hook { + int (*kvm_register)(void *opaque, struct kvm *kvm); + void (*kvm_unregister)(void *opaque); +}; + +extern struct zpci_kvm_hook zpci_kvm_hook; #endif diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index cbc7c3a68e4d..df73a052760c 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -24,162 +24,79 @@ #include <uapi/asm/kvm_para.h> #include <asm/diag.h> -static inline long __kvm_hypercall0(unsigned long nr) -{ - 
register unsigned long __nr asm("1") = nr; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr): "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall0(unsigned long nr) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall0(nr); -} - -static inline long __kvm_hypercall1(unsigned long nr, unsigned long p1) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall1(unsigned long nr, unsigned long p1) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall1(nr, p1); -} - -static inline long __kvm_hypercall2(unsigned long nr, unsigned long p1, - unsigned long p2) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2) - : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall2(unsigned long nr, unsigned long p1, - unsigned long p2) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall2(nr, p1, p2); -} - -static inline long __kvm_hypercall3(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall3(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall3(nr, p1, p2, p3); -} - -static inline long __kvm_hypercall4(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register unsigned long __p4 asm("5") = p4; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3), "d" (__p4) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall4(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall4(nr, p1, p2, p3, p4); -} - -static inline long __kvm_hypercall5(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register unsigned long __p4 asm("5") = p4; - register unsigned long __p5 asm("6") = p5; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3), "d" (__p4), "d" (__p5) : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall5(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5) -{ - diag_stat_inc(DIAG_STAT_X500); 
- return __kvm_hypercall5(nr, p1, p2, p3, p4, p5); -} - -static inline long __kvm_hypercall6(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5, - unsigned long p6) -{ - register unsigned long __nr asm("1") = nr; - register unsigned long __p1 asm("2") = p1; - register unsigned long __p2 asm("3") = p2; - register unsigned long __p3 asm("4") = p3; - register unsigned long __p4 asm("5") = p4; - register unsigned long __p5 asm("6") = p5; - register unsigned long __p6 asm("7") = p6; - register long __rc asm("2"); - - asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2), - "d" (__p3), "d" (__p4), "d" (__p5), "d" (__p6) - : "memory", "cc"); - return __rc; -} - -static inline long kvm_hypercall6(unsigned long nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5, - unsigned long p6) -{ - diag_stat_inc(DIAG_STAT_X500); - return __kvm_hypercall6(nr, p1, p2, p3, p4, p5, p6); -} +#define HYPERCALL_FMT_0 +#define HYPERCALL_FMT_1 , "0" (r2) +#define HYPERCALL_FMT_2 , "d" (r3) HYPERCALL_FMT_1 +#define HYPERCALL_FMT_3 , "d" (r4) HYPERCALL_FMT_2 +#define HYPERCALL_FMT_4 , "d" (r5) HYPERCALL_FMT_3 +#define HYPERCALL_FMT_5 , "d" (r6) HYPERCALL_FMT_4 +#define HYPERCALL_FMT_6 , "d" (r7) HYPERCALL_FMT_5 + +#define HYPERCALL_PARM_0 +#define HYPERCALL_PARM_1 , unsigned long arg1 +#define HYPERCALL_PARM_2 HYPERCALL_PARM_1, unsigned long arg2 +#define HYPERCALL_PARM_3 HYPERCALL_PARM_2, unsigned long arg3 +#define HYPERCALL_PARM_4 HYPERCALL_PARM_3, unsigned long arg4 +#define HYPERCALL_PARM_5 HYPERCALL_PARM_4, unsigned long arg5 +#define HYPERCALL_PARM_6 HYPERCALL_PARM_5, unsigned long arg6 + +#define HYPERCALL_REGS_0 +#define HYPERCALL_REGS_1 \ + register unsigned long r2 asm("2") = arg1 +#define HYPERCALL_REGS_2 \ + HYPERCALL_REGS_1; \ + register unsigned long r3 asm("3") = arg2 +#define HYPERCALL_REGS_3 \ + HYPERCALL_REGS_2; \ + register unsigned long r4 asm("4") = arg3 +#define HYPERCALL_REGS_4 \ + HYPERCALL_REGS_3; \ + register unsigned long r5 asm("5") = arg4 +#define HYPERCALL_REGS_5 \ + HYPERCALL_REGS_4; \ + register unsigned long r6 asm("6") = arg5 +#define HYPERCALL_REGS_6 \ + HYPERCALL_REGS_5; \ + register unsigned long r7 asm("7") = arg6 + +#define HYPERCALL_ARGS_0 +#define HYPERCALL_ARGS_1 , arg1 +#define HYPERCALL_ARGS_2 HYPERCALL_ARGS_1, arg2 +#define HYPERCALL_ARGS_3 HYPERCALL_ARGS_2, arg3 +#define HYPERCALL_ARGS_4 HYPERCALL_ARGS_3, arg4 +#define HYPERCALL_ARGS_5 HYPERCALL_ARGS_4, arg5 +#define HYPERCALL_ARGS_6 HYPERCALL_ARGS_5, arg6 + +#define GENERATE_KVM_HYPERCALL_FUNC(args) \ +static inline \ +long __kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \ +{ \ + register unsigned long __nr asm("1") = nr; \ + register long __rc asm("2"); \ + HYPERCALL_REGS_##args; \ + \ + asm volatile ( \ + " diag 2,4,0x500\n" \ + : "=d" (__rc) \ + : "d" (__nr) HYPERCALL_FMT_##args \ + : "memory", "cc"); \ + return __rc; \ +} \ + \ +static inline \ +long kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \ +{ \ + diag_stat_inc(DIAG_STAT_X500); \ + return __kvm_hypercall##args(nr HYPERCALL_ARGS_##args); \ +} + +GENERATE_KVM_HYPERCALL_FUNC(0) +GENERATE_KVM_HYPERCALL_FUNC(1) +GENERATE_KVM_HYPERCALL_FUNC(2) +GENERATE_KVM_HYPERCALL_FUNC(3) +GENERATE_KVM_HYPERCALL_FUNC(4) +GENERATE_KVM_HYPERCALL_FUNC(5) +GENERATE_KVM_HYPERCALL_FUNC(6) /* kvm on s390 is always paravirtualization enabled */ static inline int kvm_para_available(void) diff --git 
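For readability, a hand expansion of GENERATE_KVM_HYPERCALL_FUNC(2) under the macros above; this shows what the preprocessor emits and is not an additional definition:

	static inline long __kvm_hypercall2(unsigned long nr, unsigned long arg1,
					    unsigned long arg2)
	{
		register unsigned long __nr asm("1") = nr;
		register long __rc asm("2");
		register unsigned long r2 asm("2") = arg1;
		register unsigned long r3 asm("3") = arg2;

		asm volatile (
			"	diag	2,4,0x500\n"
			: "=d" (__rc)
			: "d" (__nr), "d" (r3), "0" (r2)
			: "memory", "cc");
		return __rc;
	}

	static inline long kvm_hypercall2(unsigned long nr, unsigned long arg1,
					  unsigned long arg2)
	{
		diag_stat_inc(DIAG_STAT_X500);
		return __kvm_hypercall2(nr, arg1, arg2);
	}

The "0" (r2) constraint ties the first argument to register 2, the same register the DIAG 0x500 convention uses for the return code.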
a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 7f22262b0e46..df3fb7d8227b 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -4,36 +4,7 @@ #include <linux/stringify.h> -#define __ALIGN .align 4, 0x07 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x07 #define __ALIGN_STR __stringify(__ALIGN) -#ifndef __ASSEMBLY__ - -/* - * Helper macro for exception table entries - */ -#define EX_TABLE(_fault, _target) \ - ".section __ex_table,\"a\"\n" \ - ".align 4\n" \ - ".long (" #_fault ") - .\n" \ - ".long (" #_target ") - .\n" \ - ".previous\n" - -#else /* __ASSEMBLY__ */ - -#define EX_TABLE(_fault, _target) \ - .section __ex_table,"a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous - -#define EX_TABLE_DMA(_fault, _target) \ - .section .dma.ex_table, "a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous - -#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h deleted file mode 100644 index 818612b784cd..000000000000 --- a/arch/s390/include/asm/livepatch.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * livepatch.h - s390-specific Kernel Live Patching Core - * - * Copyright (c) 2013-2015 SUSE - * Authors: Jiri Kosina - * Vojtech Pavlik - * Jiri Slaby - */ - -#ifndef ASM_LIVEPATCH_H -#define ASM_LIVEPATCH_H - -#include <asm/ptrace.h> - -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) -{ - regs->psw.addr = ip; -} - -#endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 237ee0c4169f..5dc1b6345006 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -11,27 +11,46 @@ #include <linux/types.h> #include <asm/ptrace.h> +#include <asm/ctlreg.h> #include <asm/cpu.h> #include <asm/types.h> #define LC_ORDER 1 #define LC_PAGES 2 +struct pgm_tdb { + u64 data[32]; +}; + struct lowcore { __u8 pad_0x0000[0x0014-0x0000]; /* 0x0000 */ __u32 ipl_parmblock_ptr; /* 0x0014 */ __u8 pad_0x0018[0x0080-0x0018]; /* 0x0018 */ __u32 ext_params; /* 0x0080 */ - __u16 ext_cpu_addr; /* 0x0084 */ - __u16 ext_int_code; /* 0x0086 */ - __u16 svc_ilc; /* 0x0088 */ - __u16 svc_code; /* 0x008a */ - __u16 pgm_ilc; /* 0x008c */ - __u16 pgm_code; /* 0x008e */ + union { + struct { + __u16 ext_cpu_addr; /* 0x0084 */ + __u16 ext_int_code; /* 0x0086 */ + }; + __u32 ext_int_code_addr; + }; + __u32 svc_int_code; /* 0x0088 */ + union { + struct { + __u16 pgm_ilc; /* 0x008c */ + __u16 pgm_code; /* 0x008e */ + }; + __u32 pgm_int_code; + }; __u32 data_exc_code; /* 0x0090 */ __u16 mon_class_num; /* 0x0094 */ - __u8 per_code; /* 0x0096 */ - __u8 per_atmid; /* 0x0097 */ + union { + struct { + __u8 per_code; /* 0x0096 */ + __u8 per_atmid; /* 0x0097 */ + }; + __u16 per_code_combined; + }; __u64 per_address; /* 0x0098 */ __u8 exc_access_id; /* 0x00a0 */ __u8 per_access_id; /* 0x00a1 */ @@ -40,10 +59,15 @@ struct lowcore { __u8 pad_0x00a4[0x00a8-0x00a4]; /* 0x00a4 */ __u64 trans_exc_code; /* 0x00a8 */ __u64 monitor_code; /* 0x00b0 */ - __u16 subchannel_id; /* 0x00b8 */ - __u16 subchannel_nr; /* 0x00ba */ - __u32 io_int_parm; /* 0x00bc */ - __u32 io_int_word; /* 0x00c0 */ + union { + struct { + __u16 subchannel_id; /* 0x00b8 */ + __u16 subchannel_nr; /* 0x00ba */ + __u32 io_int_parm; /* 0x00bc */ + __u32 io_int_word; /* 0x00c0 */ + }; + struct tpi_info tpi_info; /* 0x00b8 */ + }; __u8 pad_0x00c4[0x00c8-0x00c4]; /* 0x00c4 */ __u32 stfl_fac_list; 
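The anonymous unions added to struct lowcore above expose the interruption codes both as the traditional halfwords and as one combined word; a minimal sketch of the aliasing (the handler context is hypothetical):

	static void pgm_check_sketch(struct lowcore *lc)
	{
		u16 ilc  = lc->pgm_ilc;		/* halfword at 0x008c */
		u16 code = lc->pgm_code;	/* halfword at 0x008e */
		u32 both = lc->pgm_int_code;	/* the same word, one load */

		/* s390 is big-endian, so the two views agree like this: */
		WARN_ON(both != ((u32)ilc << 16 | code));
	}

Entry code can move the whole word with a single instruction while C consumers keep the named fields.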
/* 0x00c8 */ __u8 pad_0x00cc[0x00e8-0x00cc]; /* 0x00cc */ @@ -52,7 +76,7 @@ struct lowcore { __u32 external_damage_code; /* 0x00f4 */ __u64 failing_storage_address; /* 0x00f8 */ __u8 pad_0x0100[0x0110-0x0100]; /* 0x0100 */ - __u64 breaking_event_addr; /* 0x0110 */ + __u64 pgm_last_break; /* 0x0110 */ __u8 pad_0x0118[0x0120-0x0118]; /* 0x0118 */ psw_t restart_old_psw; /* 0x0120 */ psw_t external_old_psw; /* 0x0130 */ @@ -80,9 +104,10 @@ struct lowcore { psw_t return_psw; /* 0x0290 */ psw_t return_mcck_psw; /* 0x02a0 */ + __u64 last_break; /* 0x02b0 */ + /* CPU accounting and timing values. */ - __u64 sync_enter_timer; /* 0x02b0 */ - __u64 async_enter_timer; /* 0x02b8 */ + __u64 sys_enter_timer; /* 0x02b8 */ __u64 mcck_enter_timer; /* 0x02c0 */ __u64 exit_timer; /* 0x02c8 */ __u64 user_timer; /* 0x02d0 */ @@ -94,8 +119,8 @@ struct lowcore { __u64 avg_steal_timer; /* 0x0300 */ __u64 last_update_timer; /* 0x0308 */ __u64 last_update_clock; /* 0x0310 */ - __u64 int_clock; /* 0x0318*/ - __u64 mcck_clock; /* 0x0320 */ + __u64 int_clock; /* 0x0318 */ + __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ __u64 clock_comparator; /* 0x0328 */ __u64 boot_clock[2]; /* 0x0330 */ @@ -107,16 +132,16 @@ struct lowcore { __u64 async_stack; /* 0x0350 */ __u64 nodat_stack; /* 0x0358 */ __u64 restart_stack; /* 0x0360 */ - + __u64 mcck_stack; /* 0x0368 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0368 */ - __u64 restart_data; /* 0x0370 */ - __u64 restart_source; /* 0x0378 */ + __u64 restart_fn; /* 0x0370 */ + __u64 restart_data; /* 0x0378 */ + __u32 restart_source; /* 0x0380 */ + __u32 restart_flags; /* 0x0384 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0380 */ - __u64 user_asce; /* 0x0388 */ - __u64 vdso_asce; /* 0x0390 */ + struct ctlreg kernel_asce; /* 0x0388 */ + struct ctlreg user_asce; /* 0x0390 */ /* * The lpp and current_pid fields form a @@ -134,14 +159,14 @@ struct lowcore { __u32 spinlock_index; /* 0x03b0 */ __u32 fpu_flags; /* 0x03b4 */ __u64 percpu_offset; /* 0x03b8 */ - __u64 vdso_per_cpu_data; /* 0x03c0 */ + __u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */ __u64 machine_flags; /* 0x03c8 */ __u64 gmap; /* 0x03d0 */ __u8 pad_0x03d8[0x0400-0x03d8]; /* 0x03d8 */ - /* br %r1 trampoline */ - __u16 br_r1_trampoline; /* 0x0400 */ - __u8 pad_0x0402[0x0e00-0x0402]; /* 0x0402 */ + __u32 return_lpswe; /* 0x0400 */ + __u32 return_mcck_lpswe; /* 0x0404 */ + __u8 pad_0x040a[0x0e00-0x0408]; /* 0x0408 */ /* * 0xe00 contains the address of the IPL Parameter Information @@ -153,12 +178,7 @@ struct lowcore { __u64 vmcore_info; /* 0x0e0c */ __u8 pad_0x0e14[0x0e18-0x0e14]; /* 0x0e14 */ __u64 os_info; /* 0x0e18 */ - __u8 pad_0x0e20[0x0f00-0x0e20]; /* 0x0e20 */ - - /* Extended facility list */ - __u64 stfle_fac_list[16]; /* 0x0f00 */ - __u64 alt_stfle_fac_list[16]; /* 0x0f80 */ - __u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */ + __u8 pad_0x0e20[0x11b0-0x0e20]; /* 0x0e20 */ /* Pointer to the machine check extended save area */ __u64 mcesad; /* 0x11b0 */ @@ -178,13 +198,18 @@ struct lowcore { __u32 tod_progreg_save_area; /* 0x1324 */ __u32 cpu_timer_save_area[2]; /* 0x1328 */ __u32 clock_comp_save_area[2]; /* 0x1330 */ - __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */ + __u64 last_break_save_area; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ - __u64 cregs_save_area[16]; /* 0x1380 */ - __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */ + struct ctlreg cregs_save_area[16]; /* 0x1380 */ + __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */ + /* Cryptography-counter designation */ + __u64 ccd; /* 0x1500 */ 
+ /* AI-extension counter designation */ + __u64 aicd; /* 0x1508 */ + __u8 pad_0x1510[0x1800-0x1510]; /* 0x1510 */ /* Transaction abort diagnostic block */ - __u8 pgm_tdb[256]; /* 0x1800 */ + struct pgm_tdb pgm_tdb; /* 0x1800 */ __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ } __packed __aligned(8192); @@ -197,12 +222,4 @@ static inline void set_prefix(__u32 address) asm volatile("spx %0" : : "Q" (address) : "memory"); } -static inline __u32 store_prefix(void) -{ - __u32 address; - - asm volatile("stpx %0" : "=Q" (address)); - return address; -} - #endif /* _ASM_S390_LOWCORE_H */ diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h new file mode 100644 index 000000000000..50225940d971 --- /dev/null +++ b/arch/s390/include/asm/maccess.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_S390_MACCESS_H +#define __ASM_S390_MACCESS_H + +#include <linux/types.h> + +#define MEMCPY_REAL_SIZE PAGE_SIZE +#define MEMCPY_REAL_MASK PAGE_MASK + +struct iov_iter; + +extern unsigned long __memcpy_real_area; +extern pte_t *memcpy_real_ptep; +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count); +int memcpy_real(void *dest, unsigned long src, size_t count); +#ifdef CONFIG_CRASH_DUMP +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count); +#endif + +#endif /* __ASM_S390_MACCESS_H */ diff --git a/arch/s390/include/asm/mem_detect.h b/arch/s390/include/asm/mem_detect.h deleted file mode 100644 index a7c922a69050..000000000000 --- a/arch/s390/include/asm/mem_detect.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_MEM_DETECT_H -#define _ASM_S390_MEM_DETECT_H - -#include <linux/types.h> - -enum mem_info_source { - MEM_DETECT_NONE = 0, - MEM_DETECT_SCLP_STOR_INFO, - MEM_DETECT_DIAG260, - MEM_DETECT_SCLP_READ_INFO, - MEM_DETECT_BIN_SEARCH -}; - -struct mem_detect_block { - u64 start; - u64 end; -}; - -/* - * Storage element id is defined as 1 byte (up to 256 storage elements). - * In practise only storage element id 0 and 1 are used). - * According to architecture one storage element could have as much as - * 1020 subincrements. 255 mem_detect_blocks are embedded in mem_detect_info. - * If more mem_detect_blocks are required, a block of memory from already - * known mem_detect_block is taken (entries_extended points to it). - */ -#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */ - -struct mem_detect_info { - u32 count; - u8 info_source; - struct mem_detect_block entries[MEM_INLINED_ENTRIES]; - struct mem_detect_block *entries_extended; -}; -extern struct mem_detect_info mem_detect; - -void add_mem_detect_block(u64 start, u64 end); - -static inline int __get_mem_detect_block(u32 n, unsigned long *start, - unsigned long *end) -{ - if (n >= mem_detect.count) { - *start = 0; - *end = 0; - return -1; - } - - if (n < MEM_INLINED_ENTRIES) { - *start = (unsigned long)mem_detect.entries[n].start; - *end = (unsigned long)mem_detect.entries[n].end; - } else { - *start = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].start; - *end = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].end; - } - return 0; -} - -/** - * for_each_mem_detect_block - early online memory range iterator - * @i: an integer used as loop variable - * @p_start: ptr to unsigned long for start address of the range - * @p_end: ptr to unsigned long for end address of the range - * - * Walks over detected online memory ranges. 
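The new asm/maccess.h above is a small interface for copying from real (absolute) storage; a hedged usage sketch, with the buffer and source address chosen purely for illustration:

	/* Copy 512 bytes from real address 0x1000 into a kernel buffer;
	 * assumed to return 0 on success and a negative errno otherwise. */
	static int read_real_sketch(void *buf)
	{
		return memcpy_real(buf, 0x1000UL, 512);
	}

Larger or user-directed copies go through memcpy_real_iter(), which writes into an iov_iter destination instead of a flat buffer.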
- */ -#define for_each_mem_detect_block(i, p_start, p_end) \ - for (i = 0, __get_mem_detect_block(i, p_start, p_end); \ - i < mem_detect.count; \ - i++, __get_mem_detect_block(i, p_start, p_end)) - -static inline void get_mem_detect_reserved(unsigned long *start, - unsigned long *size) -{ - *start = (unsigned long)mem_detect.entries_extended; - if (mem_detect.count > MEM_INLINED_ENTRIES) - *size = (mem_detect.count - MEM_INLINED_ENTRIES) * sizeof(struct mem_detect_block); - else - *size = 0; -} - -static inline unsigned long get_mem_detect_end(void) -{ - unsigned long start; - unsigned long end; - - if (mem_detect.count) { - __get_mem_detect_block(mem_detect.count - 1, &start, &end); - return end; - } - return 0; -} - -#endif diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index 2542cbf7e2d1..b85e13505a0f 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -4,10 +4,8 @@ #ifndef __ASSEMBLY__ -static inline bool mem_encrypt_active(void) { return false; } - -int set_memory_encrypted(unsigned long addr, int numpages); -int set_memory_decrypted(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long vaddr, int numpages); +int set_memory_decrypted(unsigned long vaddr, int numpages); #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bcfb6371086f..bb1b4bef1878 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -4,18 +4,20 @@ #include <linux/cpumask.h> #include <linux/errno.h> +#include <asm/asm-extable.h> typedef struct { spinlock_t lock; cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; - struct list_head pgtable_list; struct list_head gmap_list; unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; + /* The mmu context belongs to a secure guest. */ + atomic_t protected_count; /* * The following bitfields need a down_write on the mm * semaphore when they are written to. As they are only @@ -32,27 +34,10 @@ typedef struct { unsigned int uses_cmm:1; /* The gmaps associated with this context are allowed to use huge pages. 
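For context on the deletion above: for_each_mem_detect_block() walked the early-detected online ranges; callers used it roughly like this (the printout is illustrative):

	unsigned long start, end;
	int i;

	for_each_mem_detect_block(i, &start, &end)
		pr_info("online: %lx-%lx\n", start, end);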
*/ unsigned int allow_gmap_hpage_1m:1; - /* The mmu context is for compat task */ - unsigned int compat_mm:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), -static inline int tprot(unsigned long addr) -{ - int rc = -EFAULT; - - asm volatile( - " tprot 0(%1),0\n" - "0: ipm %0\n" - " srl %0,28\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) : "a" (addr) : "cc"); - return rc; -} - #endif diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8d04e6f3f796..929af18b0908 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -12,20 +12,22 @@ #include <linux/uaccess.h> #include <linux/mm_types.h> #include <asm/tlbflush.h> -#include <asm/ctl_reg.h> +#include <asm/ctlreg.h> #include <asm-generic/mm_hooks.h> +#define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + unsigned long asce_type, init_entry; + spin_lock_init(&mm->context.lock); - INIT_LIST_HEAD(&mm->context.pgtable_list); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); + atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; - mm->context.compat_mm = test_thread_flag(TIF_31BIT); #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste || test_thread_flag(TIF_PGSTE) || @@ -36,73 +38,62 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { - case _REGION2_SIZE: + default: /* - * forked 3-level task, fall through to set new asce with new - * mm->pgd + * context created by exec, the value of asce_limit can + * only be zero in this case */ - case 0: - /* context created by exec, set asce limit to 4TB */ - mm->context.asce_limit = STACK_TOP_MAX; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + VM_BUG_ON(mm->context.asce_limit); + /* continue as 3-level task */ + mm->context.asce_limit = _REGION2_SIZE; + fallthrough; + case _REGION2_SIZE: + /* forked 3-level task */ + init_entry = _REGION3_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION3; break; - case -PAGE_SIZE: - /* forked 5-level task, set new asce with new_mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + case TASK_SIZE_MAX: + /* forked 5-level task */ + init_entry = _REGION1_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION1; break; case _REGION1_SIZE: - /* forked 4-level task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + /* forked 4-level task */ + init_entry = _REGION2_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION2; break; - case _REGION3_SIZE: - /* forked 2-level compat task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; } - crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | asce_type; + crst_table_init((unsigned long *) mm->pgd, init_entry); return 0; } -#define destroy_context(mm) do { } while (0) - -static inline void set_user_asce(struct mm_struct *mm) +static inline void 
switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { - S390_lowcore.user_asce = mm->context.asce; - __ctl_load(S390_lowcore.user_asce, 1, 1); - clear_cpu_flag(CIF_ASCE_PRIMARY); -} + int cpu = smp_processor_id(); -static inline void clear_user_asce(void) -{ - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); + if (next == &init_mm) + S390_lowcore.user_asce = s390_invalid_asce; + else + S390_lowcore.user_asce.val = next->context.asce; + cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); + /* Clear previous user-ASCE from CR7 */ + local_ctl_load(7, &s390_invalid_asce); + if (prev != next) + cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); } - -mm_segment_t enable_sacf_uaccess(void); -void disable_sacf_uaccess(mm_segment_t old_fs); +#define switch_mm_irqs_off switch_mm_irqs_off static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - int cpu = smp_processor_id(); + unsigned long flags; - S390_lowcore.user_asce = next->context.asce; - cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - /* Clear previous user-ASCE from CR1 and CR7 */ - if (!test_cpu_flag(CIF_ASCE_PRIMARY)) { - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); - } - if (test_cpu_flag(CIF_ASCE_SECONDARY)) { - __ctl_load(S390_lowcore.vdso_asce, 7, 7); - clear_cpu_flag(CIF_ASCE_SECONDARY); - } - if (prev != next) - cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); } #define finish_arch_post_lock_switch finish_arch_post_lock_switch @@ -119,18 +110,18 @@ static inline void finish_arch_post_lock_switch(void) __tlb_flush_mm_lazy(mm); preempt_enable(); } - set_fs(current->thread.mm_segment); + local_ctl_load(7, &S390_lowcore.user_asce); } -#define enter_lazy_tlb(mm,tsk) do { } while (0) -#define deactivate_mm(tsk,mm) do { } while (0) - +#define activate_mm activate_mm static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { switch_mm(prev, next, current); cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); - set_user_asce(next); + local_ctl_load(7, &S390_lowcore.user_asce); } +#include <asm-generic/mmu_context.h> + #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/module.h b/arch/s390/include/asm/module.h index e0a6d29846e2..9f1eea15872c 100644 --- a/arch/s390/include/asm/module.h +++ b/arch/s390/include/asm/module.h @@ -8,16 +8,14 @@ * This file contains the s390 architecture specific module code. */ -struct mod_arch_syminfo -{ +struct mod_arch_syminfo { unsigned long got_offset; unsigned long plt_offset; int got_initialized; int plt_initialized; }; -struct mod_arch_specific -{ +struct mod_arch_specific { /* Starting offset of got in the module core memory. */ unsigned long got_offset; /* Starting offset of plt in the module core memory. */ @@ -30,6 +28,14 @@ struct mod_arch_specific int nsyms; /* Additional symbol information (got and plt offsets). */ struct mod_arch_syminfo *syminfo; +#ifdef CONFIG_FUNCTION_TRACER + /* Start of memory reserved for ftrace hotpatch trampolines. */ + struct ftrace_hotpatch_trampoline *trampolines_start; + /* End of memory reserved for ftrace hotpatch trampolines. */ + struct ftrace_hotpatch_trampoline *trampolines_end; + /* Next unused ftrace hotpatch trampoline slot. 
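The reworked init_new_context() above now composes the ASCE from a single (init_entry, asce_type) pair chosen by asce_limit; the switch reduces to this mapping (summarized from the code, not an addition to it):

	/* asce_limit          task              init_entry / asce_type
	 * 0 (fresh exec)  ->  3-level           _REGION3_ENTRY_EMPTY / _ASCE_TYPE_REGION3
	 * _REGION2_SIZE   ->  forked 3-level    _REGION3_ENTRY_EMPTY / _ASCE_TYPE_REGION3
	 * _REGION1_SIZE   ->  forked 4-level    _REGION2_ENTRY_EMPTY / _ASCE_TYPE_REGION2
	 * TASK_SIZE_MAX   ->  forked 5-level    _REGION1_ENTRY_EMPTY / _ASCE_TYPE_REGION1
	 */

The 2-level compat case is gone together with the compat_mm bit removed from mm_context_t.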
*/ + struct ftrace_hotpatch_trampoline *next_trampoline; +#endif /* CONFIG_FUNCTION_TRACER */ }; #endif /* _ASM_S390_MODULE_H */ diff --git a/arch/s390/include/asm/msi.h b/arch/s390/include/asm/msi.h new file mode 100644 index 000000000000..399343ed9ffb --- /dev/null +++ b/arch/s390/include/asm/msi.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MSI_H +#define _ASM_S390_MSI_H +#include <asm-generic/msi.h> + +/* + * Work around S390 not using irq_domain at all so we can't set + * IRQ_DOMAIN_FLAG_ISOLATED_MSI. See for an explanation how it works: + * + * https://lore.kernel.org/r/31af8174-35e9-ebeb-b9ef-74c90d4bfd93@linux.ibm.com/ + * + * Note this is less isolated than the ARM/x86 versions as userspace can trigger + * MSI belonging to kernel devices within the same gisa. + */ +#define arch_is_isolated_msi() true + +#endif diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index b160da8fa14b..227466ce9e41 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -6,7 +6,6 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #ifndef _ASM_S390_NMI_H @@ -23,12 +22,16 @@ #define MCCK_CODE_SYSTEM_DAMAGE BIT(63) #define MCCK_CODE_EXT_DAMAGE BIT(63 - 5) #define MCCK_CODE_CP BIT(63 - 9) -#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) +#define MCCK_CODE_STG_ERROR BIT(63 - 16) +#define MCCK_CODE_STG_KEY_ERROR BIT(63 - 18) +#define MCCK_CODE_STG_DEGRAD BIT(63 - 19) #define MCCK_CODE_PSW_MWP_VALID BIT(63 - 20) #define MCCK_CODE_PSW_IA_VALID BIT(63 - 23) +#define MCCK_CODE_STG_FAIL_ADDR BIT(63 - 24) #define MCCK_CODE_CR_VALID BIT(63 - 29) #define MCCK_CODE_GS_VALID BIT(63 - 36) #define MCCK_CODE_FC_VALID BIT(63 - 43) +#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) #ifndef __ASSEMBLY__ @@ -94,9 +97,9 @@ struct mcesa { struct pt_regs; -void nmi_alloc_boot_cpu(struct lowcore *lc); -int nmi_alloc_per_cpu(struct lowcore *lc); -void nmi_free_per_cpu(struct lowcore *lc); +void nmi_alloc_mcesa_early(u64 *mcesad); +int nmi_alloc_mcesa(u64 *mcesad); +void nmi_free_mcesa(u64 *mcesad); void s390_handle_mcck(void); void s390_do_machine_check(struct pt_regs *regs); diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index b4bd8c41e9d3..82725cf783c7 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -12,6 +12,11 @@ void nospec_init_branches(void); void nospec_auto_detect(void); void nospec_revert(s32 *start, s32 *end); +static inline bool nospec_uses_trampoline(void) +{ + return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 0033dcd663b1..7a946c42ad13 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -2,23 +2,25 @@ #ifndef _ASM_S390_NOSPEC_ASM_H #define _ASM_S390_NOSPEC_ASM_H -#include <asm/alternative-asm.h> -#include <asm/asm-offsets.h> +#include <linux/linkage.h> #include <asm/dwarf.h> #ifdef __ASSEMBLY__ #ifdef CC_USING_EXPOLINE -_LC_BR_R1 = __LC_BR_R1 - /* * The expoline macros are used to create thunks in the same format * as gcc generates them. The 'comdat' section flag makes sure that * the various thunks are merged into a single copy. 
*/ .macro __THUNK_PROLOG_NAME name +#ifdef CONFIG_EXPOLINE_EXTERN + .pushsection .text,"ax",@progbits + __ALIGN +#else .pushsection .text.\name,"axG",@progbits,\name,comdat +#endif .globl \name .hidden \name .type \name,@function @@ -26,167 +28,101 @@ _LC_BR_R1 = __LC_BR_R1 CFI_STARTPROC .endm - .macro __THUNK_EPILOG + .macro __THUNK_EPILOG_NAME name CFI_ENDPROC +#ifdef CONFIG_EXPOLINE_EXTERN + .size \name, .-\name +#endif .popsection .endm - .macro __THUNK_PROLOG_BR r1,r2 - __THUNK_PROLOG_NAME __s390_indirect_jump_r\r2\()use_r\r1 - .endm - - .macro __THUNK_PROLOG_BC d0,r1,r2 - __THUNK_PROLOG_NAME __s390_indirect_branch_\d0\()_\r2\()use_\r1 + .macro __THUNK_PROLOG_BR r1 + __THUNK_PROLOG_NAME __s390_indirect_jump_r\r1 .endm - .macro __THUNK_BR r1,r2 - jg __s390_indirect_jump_r\r2\()use_r\r1 + .macro __THUNK_EPILOG_BR r1 + __THUNK_EPILOG_NAME __s390_indirect_jump_r\r1 .endm - .macro __THUNK_BC d0,r1,r2 - jg __s390_indirect_branch_\d0\()_\r2\()use_\r1 + .macro __THUNK_BR r1 + jg __s390_indirect_jump_r\r1 .endm - .macro __THUNK_BRASL r1,r2,r3 - brasl \r1,__s390_indirect_jump_r\r3\()use_r\r2 + .macro __THUNK_BRASL r1,r2 + brasl \r1,__s390_indirect_jump_r\r2 .endm - .macro __DECODE_RR expand,reg,ruse - .set __decode_fail,1 + .macro __DECODE_R expand,reg + .set .L__decode_fail,1 .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \reg,%r\r1 - .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \ruse,%r\r2 - \expand \r1,\r2 - .set __decode_fail,0 - .endif - .endr + \expand \r1 + .set .L__decode_fail,0 .endif .endr - .if __decode_fail == 1 - .error "__DECODE_RR failed" + .if .L__decode_fail == 1 + .error "__DECODE_R failed" .endif .endm - .macro __DECODE_RRR expand,rsave,rtarget,ruse - .set __decode_fail,1 + .macro __DECODE_RR expand,rsave,rtarget + .set .L__decode_fail,1 .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \rsave,%r\r1 .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .ifc \rtarget,%r\r2 - .irp r3,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \ruse,%r\r3 - \expand \r1,\r2,\r3 - .set __decode_fail,0 - .endif - .endr - .endif - .endr - .endif - .endr - .if __decode_fail == 1 - .error "__DECODE_RRR failed" - .endif - .endm - - .macro __DECODE_DRR expand,disp,reg,ruse - .set __decode_fail,1 - .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \reg,%r\r1 - .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 - .ifc \ruse,%r\r2 - \expand \disp,\r1,\r2 - .set __decode_fail,0 + \expand \r1,\r2 + .set .L__decode_fail,0 .endif .endr .endif .endr - .if __decode_fail == 1 - .error "__DECODE_DRR failed" + .if .L__decode_fail == 1 + .error "__DECODE_RR failed" .endif .endm - .macro __THUNK_EX_BR reg,ruse - # Be very careful when adding instructions to this macro! - # The ALTERNATIVE replacement code has a .+10 which targets - # the "br \reg" after the code has been patched. -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + .macro __THUNK_EX_BR reg exrl 0,555f j . -#else - .ifc \reg,%r1 - ALTERNATIVE "ex %r0,_LC_BR_R1", ".insn ril,0xc60000000000,0,.+10", 35 - j . - .else - larl \ruse,555f - ex 0,0(\ruse) - j . - .endif -#endif 555: br \reg .endm - .macro __THUNK_EX_BC disp,reg,ruse -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - exrl 0,556f - j . +#ifdef CONFIG_EXPOLINE_EXTERN + .macro GEN_BR_THUNK reg + .endm + .macro GEN_BR_THUNK_EXTERN reg #else - larl \ruse,556f - ex 0,0(\ruse) - j . 
+ .macro GEN_BR_THUNK reg #endif -556: b \disp(\reg) - .endm - - .macro GEN_BR_THUNK reg,ruse=%r1 - __DECODE_RR __THUNK_PROLOG_BR,\reg,\ruse - __THUNK_EX_BR \reg,\ruse - __THUNK_EPILOG - .endm - - .macro GEN_B_THUNK disp,reg,ruse=%r1 - __DECODE_DRR __THUNK_PROLOG_BC,\disp,\reg,\ruse - __THUNK_EX_BC \disp,\reg,\ruse - __THUNK_EPILOG + __DECODE_R __THUNK_PROLOG_BR,\reg + __THUNK_EX_BR \reg + __DECODE_R __THUNK_EPILOG_BR,\reg .endm - .macro BR_EX reg,ruse=%r1 -557: __DECODE_RR __THUNK_BR,\reg,\ruse + .macro BR_EX reg +557: __DECODE_R __THUNK_BR,\reg .pushsection .s390_indirect_branches,"a",@progbits .long 557b-. .popsection .endm - .macro B_EX disp,reg,ruse=%r1 -558: __DECODE_DRR __THUNK_BC,\disp,\reg,\ruse - .pushsection .s390_indirect_branches,"a",@progbits - .long 558b-. - .popsection - .endm - - .macro BASR_EX rsave,rtarget,ruse=%r1 -559: __DECODE_RRR __THUNK_BRASL,\rsave,\rtarget,\ruse + .macro BASR_EX rsave,rtarget +559: __DECODE_RR __THUNK_BRASL,\rsave,\rtarget .pushsection .s390_indirect_branches,"a",@progbits .long 559b-. .popsection .endm #else - .macro GEN_BR_THUNK reg,ruse=%r1 - .endm - - .macro GEN_B_THUNK disp,reg,ruse=%r1 + .macro GEN_BR_THUNK reg .endm - .macro BR_EX reg,ruse=%r1 + .macro BR_EX reg br \reg .endm - .macro B_EX disp,reg,ruse=%r1 - b \disp(\reg) - .endm - - .macro BASR_EX rsave,rtarget,ruse=%r1 + .macro BASR_EX rsave,rtarget basr \rsave,\rtarget .endm #endif /* CC_USING_EXPOLINE */ diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h index 35f8cbe7e5bb..23cd5d1b734b 100644 --- a/arch/s390/include/asm/numa.h +++ b/arch/s390/include/asm/numa.h @@ -13,24 +13,13 @@ #ifdef CONFIG_NUMA #include <linux/numa.h> -#include <linux/cpumask.h> void numa_setup(void); -int numa_pfn_to_nid(unsigned long pfn); -int __node_distance(int a, int b); -void numa_update_cpu_topology(void); - -extern cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -extern int numa_debug_enabled; #else static inline void numa_setup(void) { } -static inline void numa_update_cpu_topology(void) { } -static inline int numa_pfn_to_nid(unsigned long pfn) -{ - return 0; -} #endif /* CONFIG_NUMA */ + #endif /* _ASM_S390_NUMA_H */ diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index 3c89279d2a4b..a4d2e103f116 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -8,12 +8,17 @@ #ifndef _ASM_S390_OS_INFO_H #define _ASM_S390_OS_INFO_H +#include <linux/uio.h> + #define OS_INFO_VERSION_MAJOR 1 #define OS_INFO_VERSION_MINOR 1 #define OS_INFO_MAGIC 0x4f53494e464f535aULL /* OSINFOSZ */ #define OS_INFO_VMCOREINFO 0 #define OS_INFO_REIPL_BLOCK 1 +#define OS_INFO_FLAGS_ENTRY 2 + +#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) struct os_info_entry { u64 addr; @@ -28,8 +33,8 @@ struct os_info { u16 version_minor; u64 crashkernel_addr; u64 crashkernel_size; - struct os_info_entry entry[2]; - u8 reserved[4024]; + struct os_info_entry entry[3]; + u8 reserved[4004]; } __packed; void os_info_init(void); @@ -39,7 +44,6 @@ u32 os_info_csum(struct os_info *os_info); #ifdef CONFIG_CRASH_DUMP void *os_info_old_entry(int nr, unsigned long *size); -int copy_oldmem_kernel(void *dst, void *src, size_t count); #else static inline void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index c33c4deb545f..08fcbd628120 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -7,6 +7,9 @@ #ifndef PAGE_STATES_H #define PAGE_STATES_H +#include 
<asm/sections.h> +#include <asm/page.h> + #define ESSA_GET_STATE 0 #define ESSA_SET_STABLE 1 #define ESSA_SET_UNUSED 2 @@ -18,4 +21,60 @@ #define ESSA_MAX ESSA_SET_STABLE_NODAT +extern int __bootdata_preserved(cmma_flag); + +static __always_inline unsigned long essa(unsigned long paddr, unsigned char cmd) +{ + unsigned long rc; + + asm volatile( + " .insn rrf,0xb9ab0000,%[rc],%[paddr],%[cmd],0" + : [rc] "=d" (rc) + : [paddr] "d" (paddr), + [cmd] "i" (cmd)); + return rc; +} + +static __always_inline void __set_page_state(void *addr, unsigned long num_pages, unsigned char cmd) +{ + unsigned long paddr = __pa(addr) & PAGE_MASK; + + while (num_pages--) { + essa(paddr, cmd); + paddr += PAGE_SIZE; + } +} + +static inline void __set_page_unused(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_UNUSED); +} + +static inline void __set_page_stable_dat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE); +} + +static inline void __set_page_stable_nodat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE_NODAT); +} + +static inline void __arch_set_page_nodat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + if (cmma_flag < 2) + __set_page_stable_dat(addr, num_pages); + else + __set_page_stable_nodat(addr, num_pages); +} + +static inline void __arch_set_page_dat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + __set_page_stable_dat(addr, num_pages); +} + #endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index a4d38092530a..73b9c3bf377f 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -19,7 +19,9 @@ #define PAGE_SHIFT _PAGE_SHIFT #define PAGE_SIZE _PAGE_SIZE #define PAGE_MASK _PAGE_MASK -#define PAGE_DEFAULT_ACC 0 +#define PAGE_DEFAULT_ACC _AC(0, UL) +/* storage-protection override */ +#define PAGE_SPO_ACC 9 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) #define HPAGE_SHIFT 20 @@ -33,6 +35,8 @@ #define ARCH_HAS_PREPARE_HUGEPAGE #define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA + #include <asm/setup.h> #ifndef __ASSEMBLY__ @@ -40,7 +44,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end); static inline void storage_key_init_range(unsigned long start, unsigned long end) { - if (PAGE_DEFAULT_KEY) + if (PAGE_DEFAULT_KEY != 0) __storage_key_init_range(start, end); } @@ -53,22 +57,24 @@ static inline void storage_key_init_range(unsigned long start, unsigned long end */ static inline void copy_page(void *to, void *from) { - register void *reg2 asm ("2") = to; - register unsigned long reg3 asm ("3") = 0x1000; - register void *reg4 asm ("4") = from; - register unsigned long reg5 asm ("5") = 0xb0001000; + union register_pair dst, src; + + dst.even = (unsigned long) to; + dst.odd = 0x1000; + src.even = (unsigned long) from; + src.odd = 0xb0001000; + asm volatile( - " mvcl 2,4" - : "+d" (reg2), "+d" (reg3), "+d" (reg4), "+d" (reg5) + " mvcl %[dst],%[src]" + : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair) : : "memory", "cc"); } #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, 
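The inline helpers above drive the ESSA instruction one 4K frame at a time; a hedged usage sketch for marking a freshly allocated page stable (the allocation side is illustrative):

	void *addr = (void *)__get_free_page(GFP_KERNEL);

	if (addr)
		__arch_set_page_dat(addr, 1);	/* no-op unless cmma_flag is set */

When cmma_flag is 2 or higher, __arch_set_page_nodat() selects the ESSA_SET_STABLE_NODAT state instead, as the helper above shows.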
false) /* * These are used to make use of C type-checking.. @@ -85,11 +91,31 @@ typedef pte_t *pgtable_t; #define pgprot_val(x) ((x).pgprot) #define pgste_val(x) ((x).pgste) -#define pte_val(x) ((x).pte) -#define pmd_val(x) ((x).pmd) -#define pud_val(x) ((x).pud) -#define p4d_val(x) ((x).p4d) -#define pgd_val(x) ((x).pgd) + +static inline unsigned long pte_val(pte_t pte) +{ + return pte.pte; +} + +static inline unsigned long pmd_val(pmd_t pmd) +{ + return pmd.pmd; +} + +static inline unsigned long pud_val(pud_t pud) +{ + return pud.pud; +} + +static inline unsigned long p4d_val(p4d_t p4d) +{ + return p4d.p4d; +} + +static inline unsigned long pgd_val(pgd_t pgd) +{ + return pgd.pgd; +} #define __pgste(x) ((pgste_t) { (x) } ) #define __pte(x) ((pte_t) { (x) } ) @@ -138,10 +164,6 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); -void arch_set_page_dat(struct page *page, int order); -void arch_set_page_nodat(struct page *page, int order); -int arch_test_page_nodat(struct page *page); -void arch_set_page_states(int make_stable); static inline int devmem_is_allowed(unsigned long pfn) { @@ -151,7 +173,10 @@ static inline int devmem_is_allowed(unsigned long pfn) #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE -#endif /* !__ASSEMBLY__ */ +#if IS_ENABLED(CONFIG_PGSTE) +int arch_make_page_accessible(struct page *page); +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +#endif #define __PAGE_OFFSET 0x0UL #define PAGE_OFFSET 0x0UL @@ -159,23 +184,32 @@ static inline int devmem_is_allowed(unsigned long pfn) #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)(unsigned long)(x)) -#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) +#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) + +static inline void *pfn_to_virt(unsigned long pfn) +{ + return __va(pfn_to_phys(pfn)); +} + +static inline unsigned long virt_to_pfn(const void *kaddr) +{ + return phys_to_pfn(__pa(kaddr)); +} + #define pfn_to_kaddr(pfn) pfn_to_virt(pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) -#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) -#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) - -#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC -#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#endif /* !__ASSEMBLY__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h new file mode 100644 index 000000000000..7d1888e3dee6 --- /dev/null +++ b/arch/s390/include/asm/pai.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Processor Activity Instrumentation support for cryptography counters + * + * Copyright IBM Corp. 
2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#ifndef _ASM_S390_PAI_H +#define _ASM_S390_PAI_H + +#include <linux/jump_label.h> +#include <asm/lowcore.h> +#include <asm/ptrace.h> + +struct qpaci_info_block { + u64 header; + struct { + u64 : 8; + u64 num_cc : 8; /* # of supported crypto counters */ + u64 : 9; + u64 num_nnpa : 7; /* # of supported NNPA counters */ + u64 : 32; + }; +}; + +static inline int qpaci(struct qpaci_info_block *info) +{ + /* Size of info (in double words minus one) */ + size_t size = sizeof(*info) / sizeof(u64) - 1; + int cc; + + asm volatile( + " lgr 0,%[size]\n" + " .insn s,0xb28f0000,%[info]\n" + " lgr %[size],0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size) + : + : "0", "cc", "memory"); + return cc ? (size + 1) * sizeof(u64) : 0; +} + +#define PAI_CRYPTO_BASE 0x1000 /* First event number */ +#define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */ +#define PAI_CRYPTO_KERNEL_OFFSET 2048 +#define PAI_NNPA_BASE 0x1800 /* First event number */ +#define PAI_NNPA_MAXCTR 128 /* Max # of event counters */ + +DECLARE_STATIC_KEY_FALSE(pai_key); + +static __always_inline void pai_kernel_enter(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd | PAI_CRYPTO_KERNEL_OFFSET); +} + +static __always_inline void pai_kernel_exit(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd & ~PAI_CRYPTO_KERNEL_OFFSET); +} + +enum paievt_mode { + PAI_MODE_NONE, + PAI_MODE_SAMPLING, + PAI_MODE_COUNTING, +}; + +#endif diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 3a06c264ea53..e91cd6bbc330 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -5,9 +5,10 @@ #include <linux/pci.h> #include <linux/mutex.h> #include <linux/iommu.h> -#include <asm-generic/pci.h> +#include <linux/pci_hotplug.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> +#include <asm/pci_insn.h> #include <asm/sclp.h> #define PCIBIOS_MIN_IO 0x1000 @@ -21,10 +22,16 @@ int pci_domain_nr(struct pci_bus *); int pci_proc_domain(struct pci_bus *); #define ZPCI_BUS_NR 0 /* default bus number */ -#define ZPCI_DEVFN 0 /* default device number */ #define ZPCI_NR_DMA_SPACES 1 #define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS +#define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16) + +#ifdef PCI +#if (ZPCI_NR_DEVICES > ZPCI_DOMAIN_BITMAP_SIZE) +# error ZPCI_NR_DEVICES can not be bigger than ZPCI_DOMAIN_BITMAP_SIZE +#endif +#endif /* PCI */ /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 @@ -78,7 +85,6 @@ enum zpci_state { ZPCI_FN_STATE_STANDBY = 0, ZPCI_FN_STATE_CONFIGURED = 1, ZPCI_FN_STATE_RESERVED = 2, - ZPCI_FN_STATE_ONLINE = 3, }; struct zpci_bar_struct { @@ -91,20 +97,50 @@ struct zpci_bar_struct { }; struct s390_domain; +struct kvm_zdev; + +#define ZPCI_FUNCTIONS_PER_BUS 256 +struct zpci_bus { + struct kref kref; + struct pci_bus *bus; + struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS]; + struct list_head resources; + struct list_head bus_next; + struct resource bus_resource; + int pchid; + int domain_nr; + bool multifunction; + enum pci_bus_speed max_bus_speed; +}; /* Private data per function */ struct zpci_dev { - struct pci_bus 
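pai_kernel_enter()/pai_kernel_exit() above steer crypto-counter increments into the kernel half of the counter set by ORing PAI_CRYPTO_KERNEL_OFFSET (2048) into the designation word in lowcore. A sketch of the intended call sites; the generic irqentry wrappers stand in here for the actual s390 entry code:

	irqentry_state_t state = irqentry_enter(regs);

	pai_kernel_enter(regs);		/* events now attribute to the kernel */
	/* ... handle the interrupt ... */
	pai_kernel_exit(regs);

	irqentry_exit(regs, state);

Both helpers bail out cheaply unless perf is built in, the pai_key static branch is enabled, a counter area is installed (S390_lowcore.ccd), and the interrupted context was user mode.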
*bus; + struct zpci_bus *zbus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct list_head iommu_list; + struct kref kref; + struct rcu_head rcu; + struct hotplug_slot hotplug_slot; enum zpci_state state; u32 fid; /* function ID, used by sclp */ u32 fh; /* function handle, used by insn's */ + u32 gisa; /* GISA designation for passthrough */ u16 vfn; /* virtual function number */ u16 pchid; /* physical channel ID */ + u16 maxstbl; /* Maximum store block size */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ - u16 domain; + u8 port; + u8 dtsm; /* Supported DT mask */ + u8 rid_available : 1; + u8 has_hp_slot : 1; + u8 has_resources : 1; + u8 is_physfn : 1; + u8 util_str_avail : 1; + u8 irqs_registered : 1; + u8 reserved : 2; + unsigned int devfn; /* DEVFN part of the RID*/ struct mutex lock; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ @@ -121,16 +157,8 @@ struct zpci_dev { /* DMA stuff */ unsigned long *dma_table; - spinlock_t dma_table_lock; int tlb_refresh; - spinlock_t iommu_bitmap_lock; - unsigned long *iommu_bitmap; - unsigned long *lazy_bitmap; - unsigned long iommu_size; - unsigned long iommu_pages; - unsigned int next_bit; - struct iommu_device iommu_dev; /* IOMMU core handle */ char res_name[16]; @@ -145,16 +173,16 @@ struct zpci_dev { struct zpci_fmb *fmb; u16 fmb_update; /* update interval */ u16 fmb_length; - /* software counters */ - atomic64_t allocated_pages; - atomic64_t mapped_pages; - atomic64_t unmapped_pages; + u8 version; enum pci_bus_speed max_bus_speed; struct dentry *debugfs_dev; + /* IOMMU and passthrough */ struct s390_domain *s390_domain; /* s390 IOMMU domain data */ + struct kvm_zdev *kzdev; + struct mutex kzdev_lock; }; static inline bool zdev_enabled(struct zpci_dev *zdev) @@ -164,27 +192,40 @@ static inline bool zdev_enabled(struct zpci_dev *zdev) extern const struct attribute_group *zpci_attr_groups[]; extern unsigned int s390_pci_force_floating __initdata; +extern unsigned int s390_pci_no_rid; + +extern union zpci_sic_iib *zpci_aipb; +extern struct airq_iv *zpci_aif_sbv; /* ----------------------------------------------------------------------------- Prototypes ----------------------------------------------------------------------------- */ /* Base stuff */ -int zpci_create_device(struct zpci_dev *); -void zpci_remove_device(struct zpci_dev *zdev); +struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); -int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); +int zpci_deconfigure_device(struct zpci_dev *zdev); +void zpci_device_reserved(struct zpci_dev *zdev); +bool zpci_is_device_configured(struct zpci_dev *zdev); + +int zpci_hot_reset_device(struct zpci_dev *zdev); +int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *); int zpci_unregister_ioat(struct zpci_dev *, u8); void zpci_remove_reserved_devices(void); +void zpci_update_fh(struct zpci_dev *zdev, u32 fh); /* CLP */ +int clp_setup_writeback_mio(void); int clp_scan_pci_devices(void); -int clp_rescan_pci_devices(void); -int clp_rescan_pci_devices_simple(void); -int clp_add_pci_device(u32, u32, int); -int clp_enable_fh(struct zpci_dev *, u8); -int clp_disable_fh(struct zpci_dev *); +int clp_query_pci_fn(struct zpci_dev *zdev); +int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as); +int clp_disable_fh(struct zpci_dev *zdev, u32 
*fh); int clp_get_state(u32 fid, enum zpci_state *state); +int clp_refresh_fh(u32 fid, u32 *fh); + +/* UID */ +void update_uid_checking(bool new); /* IOMMU Interface */ int zpci_init_iommu(struct zpci_dev *zdev); @@ -199,12 +240,10 @@ static inline bool zpci_use_mio(struct zpci_dev *zdev) /* Error handling and recovery */ void zpci_event_error(void *); void zpci_event_availability(void *); -void zpci_rescan(void); bool zpci_is_enabled(void); #else /* CONFIG_PCI */ static inline void zpci_event_error(void *e) {} static inline void zpci_event_availability(void *e) {} -static inline void zpci_rescan(void) {} #endif /* CONFIG_PCI */ #ifdef CONFIG_HOTPLUG_PCI_S390 @@ -221,7 +260,14 @@ static inline void zpci_exit_slot(struct zpci_dev *zdev) {} /* Helpers */ static inline struct zpci_dev *to_zpci(struct pci_dev *pdev) { - return pdev->sysdata; + struct zpci_bus *zbus = pdev->sysdata; + + return zbus->function[pdev->devfn]; +} + +static inline struct zpci_dev *to_zpci_dev(struct device *dev) +{ + return to_zpci(to_pci_dev(dev)); } struct zpci_dev *get_zdev_by_fid(u32); @@ -229,7 +275,10 @@ struct zpci_dev *get_zdev_by_fid(u32); /* DMA */ int zpci_dma_init(void); void zpci_dma_exit(void); +int zpci_dma_init_device(struct zpci_dev *zdev); +int zpci_dma_exit_device(struct zpci_dev *zdev); +/* IRQ */ int __init zpci_irq_init(void); void __init zpci_irq_exit(void); @@ -242,10 +291,11 @@ int zpci_debug_init(void); void zpci_debug_exit(void); void zpci_debug_init_device(struct zpci_dev *, const char *); void zpci_debug_exit_device(struct zpci_dev *); -void zpci_debug_info(struct zpci_dev *, struct seq_file *); -/* Error reporting */ +/* Error handling */ int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *); +int zpci_clear_error_state(struct zpci_dev *zdev); +int zpci_reset_load_store_blocked(struct zpci_dev *zdev); #ifdef CONFIG_NUMA diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index bd2cb4ea7d93..f0c677ddd270 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -7,6 +7,7 @@ /* * Call Logical Processor - Command Codes */ +#define CLP_SLPC 0x0001 #define CLP_LIST_PCI 0x0002 #define CLP_QUERY_PCI_FN 0x0003 #define CLP_QUERY_PCI_FNGRP 0x0004 @@ -49,8 +50,24 @@ struct clp_fh_list_entry { #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 +/* PCI function type numbers */ +#define PCI_FUNC_TYPE_ISM 0x5 /* ISM device */ + extern bool zpci_unique_uid; +struct clp_rsp_slpc_pci { + struct clp_rsp_hdr hdr; + u32 reserved2[4]; + u32 lpif[8]; + u32 reserved3[4]; + u32 vwb : 1; + u32 : 1; + u32 mio_wb : 6; + u32 : 24; + u32 reserved5[3]; + u32 lpic[8]; +} __packed; + /* List PCI functions request */ struct clp_req_list_pci { struct clp_req_hdr hdr; @@ -93,7 +110,10 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 6; + u16 : 3; + u16 rid_avail : 1; + u16 is_physfn : 1; + u16 reserved1 : 1; u16 mio_addr_avail : 1; u16 util_str_avail : 1; /* utility string available? 
*/ u16 pfgid : 8; /* pci function group id */ @@ -102,12 +122,16 @@ struct clp_rsp_query_pci { u16 pchid; __le32 bar[PCI_STD_NUM_BARS]; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ - u32 : 16; + u16 : 12; + u16 port : 4; u8 fmb_len; u8 pft; /* pci function type */ u64 sdma; /* start dma as */ u64 edma; /* end dma as */ - u32 reserved[11]; +#define ZPCI_RID_MASK_DEVFN 0x00ff + u16 rid; /* BUS/DEVFN PCI address */ + u16 reserved0; + u32 reserved[10]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ u32 reserved2[16]; @@ -132,9 +156,11 @@ struct clp_rsp_query_pci_grp { u8 : 6; u8 frame : 1; u8 refresh : 1; /* TLB refresh mode */ - u16 reserved2; + u16 : 3; + u16 maxstbl : 13; /* Maximum store block size */ u16 mui; - u16 : 16; + u8 dtsm; /* Supported DT mask */ + u8 reserved3; u16 maxfaal; u16 : 4; u16 dnoi : 12; @@ -152,7 +178,8 @@ struct clp_req_set_pci { u16 reserved2; u8 oc; /* operation controls */ u8 ndas; /* number of dma spaces */ - u64 reserved3; + u32 reserved3; + u32 gisa; /* GISA designation */ } __packed; /* Set PCI function response */ @@ -165,6 +192,11 @@ struct clp_rsp_set_pci { } __packed; /* Combined request/response block structures used by clp insn */ +struct clp_req_rsp_slpc_pci { + struct clp_req_slpc request; + struct clp_rsp_slpc_pci response; +} __packed; + struct clp_req_rsp_list_pci { struct clp_req_list_pci request; struct clp_rsp_list_pci response; diff --git a/arch/s390/include/asm/pci_debug.h b/arch/s390/include/asm/pci_debug.h index 5dfe47588277..3bb4e7e33a0e 100644 --- a/arch/s390/include/asm/pci_debug.h +++ b/arch/s390/include/asm/pci_debug.h @@ -17,9 +17,14 @@ extern debug_info_t *pci_debug_err_id; debug_text_event(pci_debug_err_id, 0, debug_buffer); \ } while (0) +static inline void zpci_err_hex_level(int level, void *addr, int len) +{ + debug_event(pci_debug_err_id, level, addr, len); +} + static inline void zpci_err_hex(void *addr, int len) { - debug_event(pci_debug_err_id, 0, addr, len); + zpci_err_hex_level(0, addr, len); } #endif diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 419fac7a62c0..42d7cc4262ca 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -82,126 +82,16 @@ enum zpci_ioat_dtype { #define ZPCI_TABLE_VALID_MASK 0x20 #define ZPCI_TABLE_PROT_MASK 0x200 -static inline unsigned int calc_rtx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_sx(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK; -} - -static inline unsigned int calc_px(dma_addr_t ptr) -{ - return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK; -} - -static inline void set_pt_pfaa(unsigned long *entry, void *pfaa) -{ - *entry &= ZPCI_PTE_FLAG_MASK; - *entry |= ((unsigned long) pfaa & ZPCI_PTE_ADDR_MASK); -} - -static inline void set_rt_sto(unsigned long *entry, void *sto) -{ - *entry &= ZPCI_RTE_FLAG_MASK; - *entry |= ((unsigned long) sto & ZPCI_RTE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_RTX; -} - -static inline void set_st_pto(unsigned long *entry, void *pto) -{ - *entry &= ZPCI_STE_FLAG_MASK; - *entry |= ((unsigned long) pto & ZPCI_STE_ADDR_MASK); - *entry |= ZPCI_TABLE_TYPE_SX; -} - -static inline void validate_rt_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry &= ~ZPCI_TABLE_OFFSET_MASK; - *entry |= ZPCI_TABLE_VALID; - *entry |= ZPCI_TABLE_LEN_RTX; -} - -static inline void validate_st_entry(unsigned long *entry) -{ 
- *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_VALID; -} - -static inline void invalidate_table_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_INVALID; -} - -static inline void invalidate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_INVALID; -} - -static inline void validate_pt_entry(unsigned long *entry) -{ - WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID); - *entry &= ~ZPCI_PTE_VALID_MASK; - *entry |= ZPCI_PTE_VALID; -} - -static inline void entry_set_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_PROTECTED; -} - -static inline void entry_clr_protected(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_PROT_MASK; - *entry |= ZPCI_TABLE_UNPROTECTED; -} - -static inline int reg_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; -} - -static inline int pt_entry_isvalid(unsigned long entry) -{ - return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; -} - -static inline int entry_isprotected(unsigned long entry) -{ - return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED; -} - -static inline unsigned long *get_rt_sto(unsigned long entry) -{ - return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) - ? (unsigned long *) (entry & ZPCI_RTE_ADDR_MASK) - : NULL; -} - -static inline unsigned long *get_st_pto(unsigned long entry) -{ - return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX) - ? (unsigned long *) (entry & ZPCI_STE_ADDR_MASK) - : NULL; -} - -/* Prototypes */ -int zpci_dma_init_device(struct zpci_dev *); -void zpci_dma_exit_device(struct zpci_dev *); -void dma_free_seg_table(unsigned long); -unsigned long *dma_alloc_cpu_table(void); -void dma_cleanup_tables(unsigned long *); -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr); -void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags); - -extern const struct dma_map_ops s390_pci_dma_ops; +struct zpci_iommu_ctrs { + atomic64_t mapped_pages; + atomic64_t unmapped_pages; + atomic64_t global_rpcits; + atomic64_t sync_map_rpcits; + atomic64_t sync_rpcits; +}; + +struct zpci_dev; +struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev); #endif diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 61cf9531f68f..e5f57cfe1d45 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -98,6 +98,15 @@ struct zpci_fib { u32 gd; } __packed __aligned(8); +/* Set Interruption Controls Operation Controls */ +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_SET_AENI_CONTROLS 2 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 + /* directed interruption information block */ struct zpci_diib { u32 : 1; @@ -119,9 +128,20 @@ struct zpci_cdiib { u64 : 64; } __packed __aligned(8); +/* adapter interruption parameters block */ +struct zpci_aipb { + u64 faisb; + u64 gait; + u16 : 13; + u16 afi : 3; + u32 : 32; + u16 faal; +} __packed __aligned(8); + union zpci_sic_iib { struct zpci_diib diib; struct zpci_cdiib cdiib; + struct zpci_aipb aipb; }; DECLARE_STATIC_KEY_FALSE(have_mio); @@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset); int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); int __zpci_store_block(const u64 
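With the homegrown DMA API removed, only per-device statistics remain in this header; a sketch of how a debug view might read them (the seq_file context is hypothetical):

	struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(zdev);

	if (ctrs)
		seq_printf(m, "mapped pages: %lld\n",
			   (long long)atomic64_read(&ctrs->mapped_pages));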
*data, u64 req, u64 offset); void zpci_barrier(void); -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); - -static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) -{ - union zpci_sic_iib iib = {{0}}; - - return __zpci_set_irq_ctrl(ctl, isc, &iib); -} +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); #endif diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index cd060b5dd8fd..2686bee800e3 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -8,14 +8,21 @@ #include <linux/slab.h> #include <asm/pci_insn.h> +/* I/O size constraints */ +#define ZPCI_MAX_READ_SIZE 8 +#define ZPCI_MAX_WRITE_SIZE 128 +#define ZPCI_BOUNDARY_SIZE (1 << 12) +#define ZPCI_BOUNDARY_MASK (ZPCI_BOUNDARY_SIZE - 1) + /* I/O Map */ #define ZPCI_IOMAP_SHIFT 48 -#define ZPCI_IOMAP_ADDR_BASE 0x8000000000000000UL +#define ZPCI_IOMAP_ADDR_SHIFT 62 +#define ZPCI_IOMAP_ADDR_BASE (1UL << ZPCI_IOMAP_ADDR_SHIFT) #define ZPCI_IOMAP_ADDR_OFF_MASK ((1UL << ZPCI_IOMAP_SHIFT) - 1) #define ZPCI_IOMAP_MAX_ENTRIES \ - ((ULONG_MAX - ZPCI_IOMAP_ADDR_BASE + 1) / (1UL << ZPCI_IOMAP_SHIFT)) + (1UL << (ZPCI_IOMAP_ADDR_SHIFT - ZPCI_IOMAP_SHIFT)) #define ZPCI_IOMAP_ADDR_IDX_MASK \ - (~ZPCI_IOMAP_ADDR_OFF_MASK - ZPCI_IOMAP_ADDR_BASE) + ((ZPCI_IOMAP_ADDR_BASE - 1) & ~ZPCI_IOMAP_ADDR_OFF_MASK) struct zpci_iomap_entry { u32 fh; @@ -120,16 +127,18 @@ out: int zpci_write_block(volatile void __iomem *dst, const void *src, unsigned long len); -static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max) +static inline int zpci_get_max_io_size(u64 src, u64 dst, int len, int max) { - int count = len > max ? max : len, size = 1; + int offset = dst & ZPCI_BOUNDARY_MASK; + int size; - while (!(src & 0x1) && !(dst & 0x1) && ((size << 1) <= count)) { - dst = dst >> 1; - src = src >> 1; - size = size << 1; - } - return size; + size = min3(len, ZPCI_BOUNDARY_SIZE - offset, max); + if (IS_ALIGNED(src, 8) && IS_ALIGNED(dst, 8) && IS_ALIGNED(size, 8)) + return size; + + if (size >= 8) + return 8; + return rounddown_pow_of_two(size); } static inline int zpci_memcpy_fromio(void *dst, @@ -139,8 +148,9 @@ static inline int zpci_memcpy_fromio(void *dst, int size, rc = 0; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) src, - (u64) dst, n, 8); + size = zpci_get_max_io_size((u64 __force) src, + (u64) dst, n, + ZPCI_MAX_READ_SIZE); rc = zpci_read_single(dst, src, size); if (rc) break; @@ -160,8 +170,9 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, return -EINVAL; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) dst, - (u64) src, n, 128); + size = zpci_get_max_io_size((u64 __force) dst, + (u64) src, n, + ZPCI_MAX_WRITE_SIZE); if (size > 8) /* main path */ rc = zpci_write_block(dst, src, size); else diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 50b4ce8cddfd..264095dd84bc 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -29,15 +29,15 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ old__, new__, prev__; \ pcp_op_T__ *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ - prev__ = *ptr__; \ + prev__ = READ_ONCE(*ptr__); \ do { \ old__ = prev__; \ new__ = old__ op (val); \ prev__ = cmpxchg(ptr__, old__, new__); \ } while (prev__ != old__); \ - preempt_enable(); \ + preempt_enable_notrace(); \ new__; \ }) @@ -68,7 +68,7 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ val__ = (val); \ pcp_op_T__ old__, *ptr__; \ - 
preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ if (__builtin_constant_p(val__) && \ ((szcast)val__ > -129) && ((szcast)val__ < 128)) { \ @@ -84,7 +84,7 @@ : [val__] "d" (val__) \ : "cc"); \ } \ - preempt_enable(); \ + preempt_enable_notrace(); \ } #define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int) @@ -95,14 +95,14 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ val__ = (val); \ pcp_op_T__ old__, *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ asm volatile( \ op " %[old__],%[val__],%[ptr__]\n" \ : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ : [val__] "d" (val__) \ : "cc"); \ - preempt_enable(); \ + preempt_enable_notrace(); \ old__ + val__; \ }) @@ -114,14 +114,14 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ val__ = (val); \ pcp_op_T__ old__, *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ asm volatile( \ op " %[old__],%[val__],%[ptr__]\n" \ : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ : [val__] "d" (val__) \ : "cc"); \ - preempt_enable(); \ + preempt_enable_notrace(); \ } #define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lan") @@ -136,10 +136,10 @@ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ ret__; \ pcp_op_T__ *ptr__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ ret__ = cmpxchg(ptr__, oval, nval); \ - preempt_enable(); \ + preempt_enable_notrace(); \ ret__; \ }) @@ -148,14 +148,30 @@ #define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) +#define this_cpu_cmpxchg64(pcp, o, n) this_cpu_cmpxchg_8(pcp, o, n) + +#define this_cpu_cmpxchg128(pcp, oval, nval) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + u128 old__, new__, ret__; \ + pcp_op_T__ *ptr__; \ + old__ = oval; \ + new__ = nval; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + ret__ = cmpxchg128((void *)ptr__, old__, new__); \ + preempt_enable_notrace(); \ + ret__; \ +}) + #define arch_this_cpu_xchg(pcp, nval) \ ({ \ typeof(pcp) *ptr__; \ typeof(pcp) ret__; \ - preempt_disable(); \ + preempt_disable_notrace(); \ ptr__ = raw_cpu_ptr(&(pcp)); \ ret__ = xchg(ptr__, nval); \ - preempt_enable(); \ + preempt_enable_notrace(); \ ret__; \ }) @@ -164,23 +180,6 @@ #define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval) #define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval) -#define arch_this_cpu_cmpxchg_double(pcp1, pcp2, o1, o2, n1, n2) \ -({ \ - typeof(pcp1) o1__ = (o1), n1__ = (n1); \ - typeof(pcp2) o2__ = (o2), n2__ = (n2); \ - typeof(pcp1) *p1__; \ - typeof(pcp2) *p2__; \ - int ret__; \ - preempt_disable(); \ - p1__ = raw_cpu_ptr(&(pcp1)); \ - p2__ = raw_cpu_ptr(&(pcp2)); \ - ret__ = __cmpxchg_double(p1__, p2__, o1__, o2__, n1__, n2__); \ - preempt_enable(); \ - ret__; \ -}) - -#define this_cpu_cmpxchg_double_8 arch_this_cpu_cmpxchg_double - #include <asm-generic/percpu.h> #endif /* __ARCH_S390_PERCPU__ */ diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index b9da71632827..9917e2717b2b 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -60,7 +60,6 @@ struct perf_sf_sde_regs { #define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ #define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \ PERF_CPUM_SF_DIAG_MODE) -#define PERF_CPUM_SF_FULL_BLOCKS 0x0004 /* Process full SDBs only */ 
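The arch_this_cpu_* macros above switch to preempt_disable_notrace()/preempt_enable_notrace(), so the per-cpu primitives stay usable from tracing code without recursing into traced preemption helpers, and they implement the read-modify-write either as a single interlocked-access instruction (LAA and friends) or as a cmpxchg retry loop. A minimal user-space sketch of that retry loop, with GCC __atomic builtins standing in for the s390 cmpxchg and an ordinary variable standing in for the per-cpu one:

#include <stdio.h>

static unsigned long pcp;	/* stand-in for a real per-cpu variable */

/* the same load/compute/compare-and-swap retry loop the macros implement */
static unsigned long pcp_add_return(unsigned long *ptr, unsigned long val)
{
	unsigned long old, new;

	old = __atomic_load_n(ptr, __ATOMIC_RELAXED);
	do {
		new = old + val;
		/* on failure, 'old' is refreshed with the current value */
	} while (!__atomic_compare_exchange_n(ptr, &old, new, 0,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));
	return new;
}

int main(void)
{
	printf("%lu\n", pcp_add_return(&pcp, 42));	/* 42 */
	return 0;
}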
#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ #define REG_NONE 0 @@ -71,7 +70,6 @@ struct perf_sf_sde_regs { #define SAMPL_RATE(hwc) ((hwc)->event_base) #define SAMPL_FLAGS(hwc) ((hwc)->config_base) #define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) -#define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS) #define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) #define perf_arch_fetch_caller_regs(regs, __ip) do { \ diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h new file mode 100644 index 000000000000..a1bee4a1e470 --- /dev/null +++ b/arch/s390/include/asm/pfault.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2023 + */ +#ifndef _ASM_S390_PFAULT_H +#define _ASM_S390_PFAULT_H + +#include <linux/errno.h> + +int __pfault_init(void); +void __pfault_fini(void); + +static inline int pfault_init(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + return __pfault_init(); + return -EOPNOTSUPP; +} + +static inline void pfault_fini(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + __pfault_fini(); +} + +#endif /* _ASM_S390_PFAULT_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 77606c4acd58..502d655fe6ae 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -25,7 +25,6 @@ void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); -void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; @@ -34,19 +33,21 @@ static inline void crst_table_init(unsigned long *crst, unsigned long entry) memset64((u64 *)crst, entry, _CRST_ENTRIES); } -static inline unsigned long pgd_entry_type(struct mm_struct *mm) +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); + +static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr, + unsigned long len) { - if (mm_pmd_folded(mm)) - return _SEGMENT_ENTRY_EMPTY; - if (mm_pud_folded(mm)) - return _REGION3_ENTRY_EMPTY; - if (mm_p4d_folded(mm)) - return _REGION2_ENTRY_EMPTY; - return _REGION1_ENTRY_EMPTY; -} + int rc; -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); -void crst_table_downgrade(struct mm_struct *); + if (addr + len > mm->context.asce_limit && + addr + len <= TASK_SIZE) { + rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + return addr; +} static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) { @@ -84,7 +85,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -95,59 +96,43 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { - pgd_val(*pgd) = _REGION1_ENTRY | __pa(p4d); + set_pgd(pgd, __pgd(_REGION1_ENTRY | __pa(p4d))); } static inline void p4d_populate(struct 
mm_struct *mm, p4d_t *p4d, pud_t *pud) { - p4d_val(*p4d) = _REGION2_ENTRY | __pa(pud); + set_p4d(p4d, __p4d(_REGION2_ENTRY | __pa(pud))); } static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_val(*pud) = _REGION3_ENTRY | __pa(pmd); + set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd))); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); - - if (!table) - return NULL; - if (mm->context.asce_limit == _REGION3_SIZE) { - /* Forking a compat process with 2 page table levels */ - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { - crst_table_free(mm, table); - return NULL; - } - } - return (pgd_t *) table; + return (pgd_t *) crst_table_alloc(mm); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == _REGION3_SIZE) - pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte) { - pmd_val(*pmd) = _SEGMENT_ENTRY + __pa(pte); + set_pmd(pmd, __pmd(_SEGMENT_ENTRY | __pa(pte))); } #define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte) -#define pmd_pgtable(pmd) \ - (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE) - /* * page table entry allocation/free routines. */ @@ -157,7 +142,9 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) -extern void rcu_table_freelist_finish(void); +/* the arch uses the pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 7b03037a8475..1299b56e43f6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -17,11 +17,16 @@ #include <linux/page-flags.h> #include <linux/radix-tree.h> #include <linux/atomic.h> +#include <asm/sections.h> +#include <asm/ctlreg.h> #include <asm/bug.h> #include <asm/page.h> +#include <asm/uv.h> extern pgd_t swapper_pg_dir[]; +extern pgd_t invalid_pg_dir[]; extern void paging_init(void); +extern struct ctlreg s390_invalid_asce; enum { PG_DIRECT_MAP_4K = 0, @@ -30,7 +35,7 @@ enum { PG_DIRECT_MAP_MAX }; -extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +extern atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); static inline void update_page_count(int level, long count) { @@ -38,14 +43,12 @@ static inline void update_page_count(int level, long count) atomic_long_add(count, &direct_pages_count[level]); } -struct seq_file; -void arch_report_meminfo(struct seq_file *m); - /* * The S390 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* @@ -63,36 +66,33 @@ extern unsigned long zero_page_mask; /* TODO: s390 cannot support io_remap_pfn_range...
*/ -#define FIRST_USER_ADDRESS 0UL - #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) + pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p.\n", __FILE__, __LINE__, (void *) pmd_val(e)) + pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pud %p.\n", __FILE__, __LINE__, (void *) pud_val(e)) + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) #define p4d_ERROR(e) \ - printk("%s:%d: bad p4d %p.\n", __FILE__, __LINE__, (void *) p4d_val(e)) + pr_err("%s:%d: bad p4d %016lx.\n", __FILE__, __LINE__, p4d_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e)) + pr_err("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) /* * The vmalloc and module area will always be on the topmost area of the - * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules. - * On 64 bit kernels we have a 2GB area at the top of the vmalloc area where - * modules will reside. That makes sure that inter module branches always - * happen without trampolines and in addition the placement within a 2GB frame - * is branch prediction unit friendly. + * kernel mapping. 512GB are reserved for vmalloc by default. + * At the top of the vmalloc area a 2GB area is reserved where modules + * will reside. That makes sure that inter module branches always + * happen without trampolines and in addition the placement within a + * 2GB frame is branch prediction unit friendly. */ -extern unsigned long VMALLOC_START; -extern unsigned long VMALLOC_END; -#define VMALLOC_DEFAULT_SIZE ((128UL << 30) - MODULES_LEN) -extern struct page *vmemmap; - -#define VMEM_MAX_PHYS ((unsigned long) vmemmap) - -extern unsigned long MODULES_VADDR; -extern unsigned long MODULES_END; +extern unsigned long __bootdata_preserved(VMALLOC_START); +extern unsigned long __bootdata_preserved(VMALLOC_END); +#define VMALLOC_DEFAULT_SIZE ((512UL << 30) - MODULES_LEN) +extern struct page *__bootdata_preserved(vmemmap); +extern unsigned long __bootdata_preserved(vmemmap_size); + +extern unsigned long __bootdata_preserved(MODULES_VADDR); +extern unsigned long __bootdata_preserved(MODULES_END); #define MODULES_VADDR MODULES_VADDR #define MODULES_END MODULES_END #define MODULES_LEN (1UL << 31) @@ -179,11 +179,21 @@ static inline int is_module_addr(void *addr) #define _PAGE_SOFT_DIRTY 0x000 #endif +#define _PAGE_SW_BITS 0xffUL /* All SW bits */ + +#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */ + /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \ _PAGE_YOUNG | _PAGE_SOFT_DIRTY) /* + * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT + * HW bit and all SW bits. + */ +#define _PAGE_RDP_MASK ~(_PAGE_PROTECT | _PAGE_SW_BITS) + +/* * handle_pte_fault uses pte_present and pte_none to find out the pte type * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to * distinguish present from not-present ptes. It is changed only with the page @@ -341,8 +351,6 @@ static inline int is_module_addr(void *addr) #define PTRS_PER_P4D _CRST_ENTRIES #define PTRS_PER_PGD _CRST_ENTRIES -#define MAX_PTRS_PER_P4D PTRS_PER_P4D - /* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): @@ -422,23 +430,6 @@ static inline int is_module_addr(void *addr) * implies read permission. 
*/ /*xwr*/ -#define __P000 PAGE_NONE -#define __P001 PAGE_RO -#define __P010 PAGE_RO -#define __P011 PAGE_RO -#define __P100 PAGE_RX -#define __P101 PAGE_RX -#define __P110 PAGE_RX -#define __P111 PAGE_RX - -#define __S000 PAGE_NONE -#define __S001 PAGE_RO -#define __S010 PAGE_RW -#define __S011 PAGE_RW -#define __S100 PAGE_RX -#define __S101 PAGE_RX -#define __S110 PAGE_RWX -#define __S111 PAGE_RWX /* * Segment entry (large page) protection definitions. @@ -492,6 +483,12 @@ static inline int is_module_addr(void *addr) _REGION3_ENTRY_YOUNG | \ _REGION_ENTRY_PROTECT | \ _REGION_ENTRY_NOEXEC) +#define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_WRITE | \ + _REGION3_ENTRY_YOUNG | \ + _REGION3_ENTRY_DIRTY) static inline bool mm_p4d_folded(struct mm_struct *mm) { @@ -520,6 +517,15 @@ static inline int mm_has_pgste(struct mm_struct *mm) return 0; } +static inline int mm_is_protected(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (unlikely(atomic_read(&mm->context.protected_count))) + return 1; +#endif + return 0; +} + static inline int mm_alloc_pgste(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -529,6 +535,36 @@ static inline int mm_alloc_pgste(struct mm_struct *mm) return 0; } +static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) +{ + return __pte(pte_val(pte) & ~pgprot_val(prot)); +} + +static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) +{ + return __pte(pte_val(pte) | pgprot_val(prot)); +} + +static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + return __pmd(pmd_val(pmd) & ~pgprot_val(prot)); +} + +static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(prot)); +} + +static inline pud_t clear_pud_bit(pud_t pud, pgprot_t prot) +{ + return __pud(pud_val(pud) & ~pgprot_val(prot)); +} + +static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) +{ + return __pud(pud_val(pud) | pgprot_val(prot)); +} + /* * In the case that a guest uses storage keys * faults should no longer be backed by zero pages @@ -545,27 +581,25 @@ static inline int mm_uses_skeys(struct mm_struct *mm) static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new) { - register unsigned long reg2 asm("2") = old; - register unsigned long reg3 asm("3") = new; + union register_pair r1 = { .even = old, .odd = new, }; unsigned long address = (unsigned long)ptr | 1; asm volatile( - " csp %0,%3" - : "+d" (reg2), "+m" (*ptr) - : "d" (reg3), "d" (address) + " csp %[r1],%[address]" + : [r1] "+&d" (r1.pair), "+m" (*ptr) + : [address] "d" (address) : "cc"); } static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new) { - register unsigned long reg2 asm("2") = old; - register unsigned long reg3 asm("3") = new; + union register_pair r1 = { .even = old, .odd = new, }; unsigned long address = (unsigned long)ptr | 1; asm volatile( - " .insn rre,0xb98a0000,%0,%3" - : "+d" (reg2), "+m" (*ptr) - : "d" (reg3), "d" (address) + " cspg %[r1],%[address]" + : [r1] "+&d" (r1.pair), "+m" (*ptr) + : [address] "d" (address) : "cc"); } @@ -576,17 +610,15 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new #define CRDTE_DTT_REGION1 0x1cUL static inline void crdte(unsigned long old, unsigned long new, - unsigned long table, unsigned long dtt, + unsigned long *table, unsigned long dtt, unsigned long address, unsigned long asce) { - register unsigned long reg2 asm("2") = old; - register unsigned long reg3 asm("3") = new; - register 
unsigned long reg4 asm("4") = table | dtt; - register unsigned long reg5 asm("5") = address; + union register_pair r1 = { .even = old, .odd = new, }; + union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, }; - asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0" - : "+d" (reg2) - : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce) + asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0" + : [r1] "+&d" (r1.pair) + : [r2] "d" (r2.pair), [asce] "a" (asce) : "memory", "cc"); } @@ -673,6 +705,7 @@ static inline int pud_none(pud_t pud) return pud_val(pud) == _REGION3_ENTRY_EMPTY; } +#define pud_leaf pud_large static inline int pud_large(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) @@ -680,16 +713,7 @@ static inline int pud_large(pud_t pud) return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); } -static inline unsigned long pud_pfn(pud_t pud) -{ - unsigned long origin_mask; - - origin_mask = _REGION_ENTRY_ORIGIN; - if (pud_large(pud)) - origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; - return (pud_val(pud) & origin_mask) >> PAGE_SHIFT; -} - +#define pmd_leaf pmd_large static inline int pmd_large(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; @@ -734,27 +758,25 @@ static inline int pmd_none(pmd_t pmd) return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - unsigned long origin_mask; - - origin_mask = _SEGMENT_ENTRY_ORIGIN; - if (pmd_large(pmd)) - origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; - return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT; -} - #define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0; } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; +} + +#define pmd_dirty pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; } +#define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0; @@ -803,6 +825,21 @@ static inline int pmd_protnone(pmd_t pmd) } #endif +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + static inline int pte_soft_dirty(pte_t pte) { return pte_val(pte) & _PAGE_SOFT_DIRTY; @@ -811,15 +848,13 @@ static inline int pte_soft_dirty(pte_t pte) static inline pte_t pte_mksoft_dirty(pte_t pte) { - pte_val(pte) |= _PAGE_SOFT_DIRTY; - return pte; + return set_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY)); } #define pte_swp_mksoft_dirty pte_mksoft_dirty static inline pte_t pte_clear_soft_dirty(pte_t pte) { - pte_val(pte) &= ~_PAGE_SOFT_DIRTY; - return pte; + return clear_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY)); } #define pte_swp_clear_soft_dirty pte_clear_soft_dirty @@ -830,14 +865,12 @@ static inline int pmd_soft_dirty(pmd_t pmd) static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_SOFT_DIRTY; - return pmd; + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY)); } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_SOFT_DIRTY; - return pmd; + return clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY)); } /* @@ -866,35 +899,79 @@ static inline int pte_unused(pte_t pte) } /* + * Extract the 
pgprot value from the given pte while at the same time making it + * usable for kernel address space mappings where fault driven dirty and + * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID + * must not be set. + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; + + if (pte_write(pte)) + pte_flags |= pgprot_val(PAGE_KERNEL); + else + pte_flags |= pgprot_val(PAGE_KERNEL_RO); + pte_flags |= pte_val(pte) & mio_wb_bit_mask; + + return __pgprot(pte_flags); +} + +/* * pgd/pmd/pte modification functions */ +static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + WRITE_ONCE(*pgdp, pgd); +} + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + WRITE_ONCE(*p4dp, p4d); +} + +static inline void set_pud(pud_t *pudp, pud_t pud) +{ + WRITE_ONCE(*pudp, pud); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + WRITE_ONCE(*pmdp, pmd); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + WRITE_ONCE(*ptep, pte); +} + static inline void pgd_clear(pgd_t *pgd) { if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - pgd_val(*pgd) = _REGION1_ENTRY_EMPTY; + set_pgd(pgd, __pgd(_REGION1_ENTRY_EMPTY)); } static inline void p4d_clear(p4d_t *p4d) { if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - p4d_val(*p4d) = _REGION2_ENTRY_EMPTY; + set_p4d(p4d, __p4d(_REGION2_ENTRY_EMPTY)); } static inline void pud_clear(pud_t *pud) { if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pud_val(*pud) = _REGION3_ENTRY_EMPTY; + set_pud(pud, __pud(_REGION3_ENTRY_EMPTY)); } static inline void pmd_clear(pmd_t *pmdp) { - pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); } /* @@ -903,79 +980,74 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt */ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pte_val(pte) &= _PAGE_CHG_MASK; - pte_val(pte) |= pgprot_val(newprot); + pte = clear_pte_bit(pte, __pgprot(~_PAGE_CHG_MASK)); + pte = set_pte_bit(pte, newprot); /* * newprot for PAGE_NONE, PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX * has the invalid bit set, clear it again for readable, young pages */ if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ)) - pte_val(pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID)); /* * newprot for PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX has the page * protection bit set, clear it again for writable, dirty pages */ if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) &= ~_PAGE_PROTECT; + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); return pte; } static inline pte_t pte_wrprotect(pte_t pte) { - pte_val(pte) &= ~_PAGE_WRITE; - pte_val(pte) |= _PAGE_PROTECT; - return pte; + pte = clear_pte_bit(pte, __pgprot(_PAGE_WRITE)); + return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { - pte_val(pte) |= _PAGE_WRITE; + pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE)); if (pte_val(pte) & _PAGE_DIRTY) - pte_val(pte) &= ~_PAGE_PROTECT; + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); return pte; } static inline pte_t pte_mkclean(pte_t pte) { - pte_val(pte) &= ~_PAGE_DIRTY; - pte_val(pte) |= _PAGE_PROTECT; - return pte; + pte = clear_pte_bit(pte, 
__pgprot(_PAGE_DIRTY)); + return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); } static inline pte_t pte_mkdirty(pte_t pte) { - pte_val(pte) |= _PAGE_DIRTY | _PAGE_SOFT_DIRTY; + pte = set_pte_bit(pte, __pgprot(_PAGE_DIRTY | _PAGE_SOFT_DIRTY)); if (pte_val(pte) & _PAGE_WRITE) - pte_val(pte) &= ~_PAGE_PROTECT; + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); return pte; } static inline pte_t pte_mkold(pte_t pte) { - pte_val(pte) &= ~_PAGE_YOUNG; - pte_val(pte) |= _PAGE_INVALID; - return pte; + pte = clear_pte_bit(pte, __pgprot(_PAGE_YOUNG)); + return set_pte_bit(pte, __pgprot(_PAGE_INVALID)); } static inline pte_t pte_mkyoung(pte_t pte) { - pte_val(pte) |= _PAGE_YOUNG; + pte = set_pte_bit(pte, __pgprot(_PAGE_YOUNG)); if (pte_val(pte) & _PAGE_READ) - pte_val(pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID)); return pte; } static inline pte_t pte_mkspecial(pte_t pte) { - pte_val(pte) |= _PAGE_SPECIAL; - return pte; + return set_pte_bit(pte, __pgprot(_PAGE_SPECIAL)); } #ifdef CONFIG_HUGETLB_PAGE static inline pte_t pte_mkhuge(pte_t pte) { - pte_val(pte) |= _PAGE_LARGE; - return pte; + return set_pte_bit(pte, __pgprot(_PAGE_LARGE)); } #endif @@ -985,16 +1057,29 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long pto; + + pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + : "+m" (*ptep) + : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), + [asce] "a" (asce), [m4] "i" (local)); +} + static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, unsigned long opt, unsigned long asce, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); if (__builtin_constant_p(opt) && opt == 0) { /* Invalidation + TLB flush for the pte */ asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + " ipte %[r1],%[r2],0,%[m4]" : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), [m4] "i" (local)); return; @@ -1003,7 +1088,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, /* Invalidate ptes with options + TLB flush of the ptes */ opt = opt | (asce & _ASCE_ORIGIN); asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + " ipte %[r1],%[r2],%[r3],%[m4]" : [r2] "+a" (address), [r3] "+a" (opt) : [r1] "a" (pto), [m4] "i" (local) : "memory"); } @@ -1011,12 +1096,12 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, static __always_inline void __ptep_ipte_range(unsigned long address, int nr, pte_t *ptep, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); /* Invalidate a range of ptes + TLB flush of the ptes */ do { asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + " ipte %[r1],%[r2],%[r3],%[m4]" : [r2] "+a" (address), [r3] "+a" (nr) : [r1] "a" (pto), [m4] "i" (local) : "memory"); } while (nr != 255); @@ -1059,7 +1144,13 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ + if (mm_is_protected(mm) && pte_present(res)) + 
uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -1071,7 +1162,13 @@ void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ + if (mm_is_protected(vma->vm_mm) && pte_present(res)) + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; } /* @@ -1086,12 +1183,31 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { + pte_t res; + if (full) { - pte_t pte = *ptep; - *ptep = __pte(_PAGE_INVALID); - return pte; + res = *ptep; + set_pte(ptep, __pte(_PAGE_INVALID)); + } else { + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + /* Nothing to do */ + if (!mm_is_protected(mm) || !pte_present(res)) + return res; + /* + * At this point the reference through the mapping is still present. + * The notifier should have destroyed all protected vCPUs at this + * point, so the destroy should be successful. + */ + if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK)) + return res; + /* + * If something went wrong and the page could not be destroyed, or + * if this is not a mm teardown, the slower export is used as + * fallback instead. + */ + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -1104,6 +1220,44 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte)); } +/* + * Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE + * bits in the comparison. Those might change e.g. because of dirty and young + * tracking. + */ +static inline int pte_allow_rdp(pte_t old, pte_t new) +{ + /* + * Only allow changes from RO to RW + */ + if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT) + return 0; + + return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK); +} + +static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + /* + * RDP might not have propagated the PTE protection reset to all CPUs, + * so there could be spurious TLB protection faults. + * NOTE: This will also be called when a racing pagetable update on + * another thread already installed the correct PTE. Both cases cannot + * really be distinguished. + * Therefore, only do the local TLB flush when RDP can be used, and the + * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead. + * A local RDP can be used to do the flush. 
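To make the gating rule concrete: pte_allow_rdp() above accepts only RO->RW transitions in which nothing besides the _PAGE_PROTECT HW bit and the software bits differs. A self-contained model of that check, using 0xff for the software bits as in _PAGE_SW_BITS above and 0x200 as a stand-in for _PAGE_PROTECT (illustrative values, not taken from this patch):

#include <stdio.h>

/* stand-in mask values mirroring _PAGE_PROTECT and _PAGE_SW_BITS */
#define PAGE_PROTECT	0x200UL
#define PAGE_SW_BITS	0x0ffUL
#define PAGE_RDP_MASK	(~(PAGE_PROTECT | PAGE_SW_BITS))

/* models pte_allow_rdp(): RO -> RW only, nothing else may change */
static int allow_rdp(unsigned long old, unsigned long new)
{
	if (!(old & PAGE_PROTECT) || (new & PAGE_PROTECT))
		return 0;
	return (old & PAGE_RDP_MASK) == (new & PAGE_RDP_MASK);
}

int main(void)
{
	unsigned long ro = 0x5000UL | PAGE_PROTECT;
	unsigned long rw = 0x5000UL;

	printf("%d\n", allow_rdp(ro, rw));		/* 1: RO -> RW */
	printf("%d\n", allow_rdp(rw, ro));		/* 0: RW -> RO */
	printf("%d\n", allow_rdp(ro, 0x6000UL));	/* 0: frame changed too */
	return 0;
}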
+ */ + if (MACHINE_HAS_RDP && !(pte_val(*ptep) & _PAGE_PROTECT)) + __ptep_rdp(address, ptep, 0, 0, 1); +} +#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault + +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new); + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS static inline int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1111,7 +1265,10 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, { if (pte_same(*ptep, entry)) return 0; - ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); + if (MACHINE_HAS_RDP && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); + else + ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); return 1; } @@ -1153,21 +1310,41 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); +#define pgprot_writecombine pgprot_writecombine +pgprot_t pgprot_writecombine(pgprot_t prot); + +#define pgprot_writethrough pgprot_writethrough +pgprot_t pgprot_writethrough(pgprot_t prot); + /* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) { if (pte_present(entry)) - pte_val(entry) &= ~_PAGE_UNUSED; - if (mm_has_pgste(mm)) - ptep_set_pte_at(mm, addr, ptep, entry); - else - *ptep = entry; + entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } } +#define set_ptes set_ptes /* * Conversion functions: convert a page and protection to a page entry, @@ -1176,9 +1353,10 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) { pte_t __pte; - pte_val(__pte) = physpage + pgprot_val(pgprot); + + __pte = __pte(physpage | pgprot_val(pgprot)); if (!MACHINE_HAS_NX) - pte_val(__pte) &= ~_PAGE_NOEXEC; + __pte = clear_pte_bit(__pte, __pgprot(_PAGE_NOEXEC)); return pte_mkyoung(__pte); } @@ -1196,12 +1374,39 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) -#define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) -#define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN) -#define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) -#define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) +#define p4d_deref(pud) ((unsigned long)__va(p4d_val(pud) & _REGION_ENTRY_ORIGIN)) +#define pgd_deref(pgd) ((unsigned 
long)__va(pgd_val(pgd) & _REGION_ENTRY_ORIGIN)) + +static inline unsigned long pmd_deref(pmd_t pmd) +{ + unsigned long origin_mask; + + origin_mask = _SEGMENT_ENTRY_ORIGIN; + if (pmd_large(pmd)) + origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; + return (unsigned long)__va(pmd_val(pmd) & origin_mask); +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return __pa(pmd_deref(pmd)) >> PAGE_SHIFT; +} + +static inline unsigned long pud_deref(pud_t pud) +{ + unsigned long origin_mask; + + origin_mask = _REGION_ENTRY_ORIGIN; + if (pud_large(pud)) + origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; + return (unsigned long)__va(pud_val(pud) & origin_mask); +} + +static inline unsigned long pud_pfn(pud_t pud) +{ + return __pa(pud_deref(pud)) >> PAGE_SHIFT; +} /* * The pgd_offset function *always* adds the index for the top-level @@ -1227,38 +1432,52 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address) } #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address) -#define pgd_offset_k(address) pgd_offset(&init_mm, address) -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address) { - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) - return (p4d_t *) pgd_deref(*pgd) + p4d_index(address); - return (p4d_t *) pgd; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) + return (p4d_t *) pgd_deref(pgd) + p4d_index(address); + return (p4d_t *) pgdp; } +#define p4d_offset_lockless p4d_offset_lockless -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address) { - if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) - return (pud_t *) p4d_deref(*p4d) + pud_index(address); - return (pud_t *) p4d; + return p4d_offset_lockless(pgdp, *pgdp, address); } -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address) { - if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) - return (pmd_t *) pud_deref(*pud) + pmd_index(address); - return (pmd_t *) pud; + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) + return (pud_t *) p4d_deref(p4d) + pud_index(address); + return (pud_t *) p4dp; } +#define pud_offset_lockless pud_offset_lockless -static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) +static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address) { - return (pte_t *) pmd_deref(*pmd) + pte_index(address); + return pud_offset_lockless(p4dp, *p4dp, address); } +#define pud_offset pud_offset -#define pte_offset_kernel(pmd, address) pte_offset(pmd, address) -#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) +static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address) +{ + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) + return (pmd_t *) pud_deref(pud) + pmd_index(address); + return (pmd_t *) pudp; +} +#define pmd_offset_lockless pmd_offset_lockless -static inline void pte_unmap(pte_t *pte) { } +static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address) +{ + return pmd_offset_lockless(pudp, *pudp, address); +} +#define pmd_offset pmd_offset + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long) pmd_deref(pmd); +} static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { @@ 
-1266,7 +1485,7 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) } #define gup_fast_permitted gup_fast_permitted -#define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot)) +#define pfn_pte(pfn, pgprot) mk_pte_phys(((pfn) << PAGE_SHIFT), (pgprot)) #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -1277,61 +1496,57 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) static inline pmd_t pmd_wrprotect(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE; - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; - return pmd; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_WRITE; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) - pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); return pmd; } static inline pmd_t pmd_mkclean(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_DIRTY; - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; - return pmd; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY)); if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) - pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); return pmd; } static inline pud_t pud_wrprotect(pud_t pud) { - pud_val(pud) &= ~_REGION3_ENTRY_WRITE; - pud_val(pud) |= _REGION_ENTRY_PROTECT; - return pud; + pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE)); + return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); } static inline pud_t pud_mkwrite(pud_t pud) { - pud_val(pud) |= _REGION3_ENTRY_WRITE; + pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE)); if (pud_val(pud) & _REGION3_ENTRY_DIRTY) - pud_val(pud) &= ~_REGION_ENTRY_PROTECT; + pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); return pud; } static inline pud_t pud_mkclean(pud_t pud) { - pud_val(pud) &= ~_REGION3_ENTRY_DIRTY; - pud_val(pud) |= _REGION_ENTRY_PROTECT; - return pud; + pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY)); + return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); } static inline pud_t pud_mkdirty(pud_t pud) { - pud_val(pud) |= _REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY; + pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY)); if (pud_val(pud) & _REGION3_ENTRY_WRITE) - pud_val(pud) &= ~_REGION_ENTRY_PROTECT; + pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); return pud; } @@ -1355,37 +1570,39 @@ static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) static inline pmd_t pmd_mkyoung(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); if (pmd_val(pmd) & _SEGMENT_ENTRY_READ) - pmd_val(pmd) &= ~_SEGMENT_ENTRY_INVALID; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); return pmd; } static inline pmd_t pmd_mkold(pmd_t pmd) { - pmd_val(pmd) &= ~_SEGMENT_ENTRY_YOUNG; - pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID; - return pmd; + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); + return set_pmd_bit(pmd, 
__pgprot(_SEGMENT_ENTRY_INVALID)); } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE | - _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG | - _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY; - pmd_val(pmd) |= massage_pgprot_pmd(newprot); + unsigned long mask; + + mask = _SEGMENT_ENTRY_ORIGIN_LARGE; + mask |= _SEGMENT_ENTRY_DIRTY; + mask |= _SEGMENT_ENTRY_YOUNG; + mask |= _SEGMENT_ENTRY_LARGE; + mask |= _SEGMENT_ENTRY_SOFT_DIRTY; + pmd = __pmd(pmd_val(pmd) & mask); + pmd = set_pmd_bit(pmd, __pgprot(massage_pgprot_pmd(newprot))); if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)) - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); if (!(pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG)) - pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); return pmd; } static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) { - pmd_t __pmd; - pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot); - return __pmd; + return __pmd(physpage + massage_pgprot_pmd(pgprot)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ @@ -1409,11 +1626,11 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); + sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t); if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + " idte %[r1],0,%[r2],%[m4]" : "+m" (*pmdp) : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), [m4] "i" (local) @@ -1421,7 +1638,7 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, } else { /* flush with guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + " idte %[r1],%[r3],%[r2],%[m4]" : "+m" (*pmdp) : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), [r3] "a" (asce), [m4] "i" (local) @@ -1435,12 +1652,12 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); + r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + " idte %[r1],0,%[r2],%[m4]" : "+m" (*pudp) : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), [m4] "i" (local) @@ -1448,7 +1665,7 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, } else { /* flush with guest asce */ asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + " idte %[r1],%[r3],%[r2],%[m4]" : "+m" (*pudp) : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), [r3] "a" (asce), [m4] "i" (local) @@ -1507,16 +1724,15 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t entry) { if (!MACHINE_HAS_NX) - pmd_val(entry) &= ~_SEGMENT_ENTRY_NOEXEC; - *pmdp = entry; + entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + set_pmd(pmdp, entry); } static inline pmd_t pmd_mkhuge(pmd_t pmd) { - pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; - pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG; - pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; - return pmd; + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_LARGE)); + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR @@ -1527,16 
+1743,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL -static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full) { if (full) { pmd_t pmd = *pmdp; - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); return pmd; } - return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH @@ -1573,7 +1789,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, } #define pmdp_collapse_flush pmdp_collapse_flush -#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot)) +#define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot)) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) static inline int pmd_trans_huge(pmd_t pmd) @@ -1591,18 +1807,18 @@ static inline int has_transparent_hugepage(void) /* * 64 bit swap entry format: * A page-table entry has some bits we have to treat in a special way. - * Bits 52 and bit 55 have to be zero, otherwise a specification - * exception will occur instead of a page translation exception. The - * specification exception has the bad habit not to store necessary - * information in the lowcore. - * Bits 54 and 63 are used to indicate the page type. + * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte + * as invalid. * A swap pte is indicated by bit pattern (pte & 0x201) == 0x200 - * This leaves the bits 0-51 and bits 56-62 to store type and offset. - * We use the 5 bits from 57-61 for the type and the 52 bits from 0-51 - * for the offset. - * | offset |01100|type |00| + * | offset |E11XX|type |S0| * |0000000000111111111122222222223333333333444444444455|55555|55566|66| * |0123456789012345678901234567890123456789012345678901|23456|78901|23| + * + * Bits 0-51 store the offset. + * Bit 52 (E) is used to remember PG_anon_exclusive. + * Bits 57-61 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 55 and 56 (X) are unused. 
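A worked example of the swap-pte encoding described above, sketched as a user-space program. The offset mask matches the __SWP_OFFSET_MASK definition that follows; the shift amounts and the invalid/protect bit values (offset shifted left by 12, type by 2, 0x400 for bit 53, 0x200 for bit 54 in MSB-0 numbering) are inferred from the bit layout and should be treated as illustrative assumptions:

#include <assert.h>
#include <stdio.h>

#define SWP_PAGE_INVALID	0x400UL		/* bit 53 */
#define SWP_PAGE_PROTECT	0x200UL		/* bit 54 */
#define SWP_TYPE_SHIFT		2		/* type in bits 57-61 */
#define SWP_TYPE_MASK		0x1fUL
#define SWP_OFFSET_SHIFT	12		/* offset in bits 0-51 */
#define SWP_OFFSET_MASK		((1UL << 52) - 1)

static unsigned long mk_swap_pte_sketch(unsigned long type, unsigned long offset)
{
	unsigned long pteval = SWP_PAGE_INVALID | SWP_PAGE_PROTECT;

	pteval |= (offset & SWP_OFFSET_MASK) << SWP_OFFSET_SHIFT;
	pteval |= (type & SWP_TYPE_MASK) << SWP_TYPE_SHIFT;
	return pteval;
}

int main(void)
{
	unsigned long pte = mk_swap_pte_sketch(5, 0x1000);

	/* swap ptes match the (pte & 0x201) == 0x200 pattern */
	assert((pte & 0x201UL) == 0x200UL);
	/* decoding reverses the shifts, as __swp_type() below does */
	printf("type=%lu offset=%#lx\n",
	       (pte >> SWP_TYPE_SHIFT) & SWP_TYPE_MASK,
	       (pte >> SWP_OFFSET_SHIFT) & SWP_OFFSET_MASK);
	return 0;
}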
*/ #define __SWP_OFFSET_MASK ((1UL << 52) - 1) @@ -1612,12 +1828,12 @@ static inline int has_transparent_hugepage(void) static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) { - pte_t pte; + unsigned long pteval; - pte_val(pte) = _PAGE_INVALID | _PAGE_PROTECT; - pte_val(pte) |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT; - pte_val(pte) |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT; - return pte; + pteval = _PAGE_INVALID | _PAGE_PROTECT; + pteval |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT; + pteval |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT; + return __pte(pteval); } static inline unsigned long __swp_type(swp_entry_t entry) @@ -1638,10 +1854,12 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - extern int vmem_add_mapping(unsigned long start, unsigned long size); -extern int vmem_remove_mapping(unsigned long start, unsigned long size); +extern void vmem_remove_mapping(unsigned long start, unsigned long size); +extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); +extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); +extern void vmem_unmap_4k_page(unsigned long addr); +extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); extern int s390_enable_sie(void); extern int s390_enable_skey(void); extern void s390_reset_cmma(struct mm_struct *mm); @@ -1650,6 +1868,7 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#include <asm-generic/pgtable.h> +#define pmd_pgtable(pmd) \ + ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h new file mode 100644 index 000000000000..9e41a74fce9a --- /dev/null +++ b/arch/s390/include/asm/physmem_info.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MEM_DETECT_H +#define _ASM_S390_MEM_DETECT_H + +#include <linux/types.h> +#include <asm/page.h> + +enum physmem_info_source { + MEM_DETECT_NONE = 0, + MEM_DETECT_SCLP_STOR_INFO, + MEM_DETECT_DIAG260, + MEM_DETECT_SCLP_READ_INFO, + MEM_DETECT_BIN_SEARCH +}; + +struct physmem_range { + u64 start; + u64 end; +}; + +enum reserved_range_type { + RR_DECOMPRESSOR, + RR_INITRD, + RR_VMLINUX, + RR_AMODE31, + RR_IPLREPORT, + RR_CERT_COMP_LIST, + RR_MEM_DETECT_EXTENDED, + RR_VMEM, + RR_MAX +}; + +struct reserved_range { + unsigned long start; + unsigned long end; + struct reserved_range *chain; +}; + +/* + * Storage element id is defined as 1 byte (up to 256 storage elements). + * In practice only storage element ids 0 and 1 are used. + * According to the architecture, one storage element can have as many as + * 1020 subincrements. 255 physmem_ranges are embedded in physmem_info. + * If more physmem_ranges are required, a block of memory from already + * known physmem_range is taken (online_extended points to it).
+ */ +#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */ + +struct physmem_info { + u32 range_count; + u8 info_source; + unsigned long usable; + struct reserved_range reserved[RR_MAX]; + struct physmem_range online[MEM_INLINED_ENTRIES]; + struct physmem_range *online_extended; +}; + +extern struct physmem_info physmem_info; + +void add_physmem_online_range(u64 start, u64 end); + +static inline int __get_physmem_range(u32 n, unsigned long *start, + unsigned long *end, bool respect_usable_limit) +{ + if (n >= physmem_info.range_count) { + *start = 0; + *end = 0; + return -1; + } + + if (n < MEM_INLINED_ENTRIES) { + *start = (unsigned long)physmem_info.online[n].start; + *end = (unsigned long)physmem_info.online[n].end; + } else { + *start = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].start; + *end = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].end; + } + + if (respect_usable_limit && physmem_info.usable) { + if (*start >= physmem_info.usable) + return -1; + if (*end > physmem_info.usable) + *end = physmem_info.usable; + } + return 0; +} + +/** + * for_each_physmem_usable_range - early online memory range iterator + * @i: an integer used as loop variable + * @p_start: ptr to unsigned long for start address of the range + * @p_end: ptr to unsigned long for end address of the range + * + * Walks over detected online memory ranges below usable limit. + */ +#define for_each_physmem_usable_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, true); i++) + +/* Walks over all detected online memory ranges disregarding usable limit. */ +#define for_each_physmem_online_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, false); i++) + +static inline const char *get_physmem_info_source(void) +{ + switch (physmem_info.info_source) { + case MEM_DETECT_SCLP_STOR_INFO: + return "sclp storage info"; + case MEM_DETECT_DIAG260: + return "diag260"; + case MEM_DETECT_SCLP_READ_INFO: + return "sclp read info"; + case MEM_DETECT_BIN_SEARCH: + return "binary search"; + } + return "none"; +} + +#define RR_TYPE_NAME(t) case RR_ ## t: return #t +static inline const char *get_rr_type_name(enum reserved_range_type t) +{ + switch (t) { + RR_TYPE_NAME(DECOMPRESSOR); + RR_TYPE_NAME(INITRD); + RR_TYPE_NAME(VMLINUX); + RR_TYPE_NAME(AMODE31); + RR_TYPE_NAME(IPLREPORT); + RR_TYPE_NAME(CERT_COMP_LIST); + RR_TYPE_NAME(MEM_DETECT_EXTENDED); + RR_TYPE_NAME(VMEM); + default: + return "UNKNOWN"; + } +} + +#define for_each_physmem_reserved_type_range(t, range, p_start, p_end) \ + for (range = &physmem_info.reserved[t], *p_start = range->start, *p_end = range->end; \ + range && range->end; range = range->chain ? __va(range->chain) : NULL, \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0) + +static inline struct reserved_range *__physmem_reserved_next(enum reserved_range_type *t, + struct reserved_range *range) +{ + if (!range) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + if (range->chain) + return __va(range->chain); + while (++*t < RR_MAX) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + return NULL; +} + +#define for_each_physmem_reserved_range(t, range, p_start, p_end) \ + for (t = 0, range = __physmem_reserved_next(&t, NULL), \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0; \ + range; range = __physmem_reserved_next(&t, range), \ + *p_start = range ? range->start : 0, *p_end = range ? 
range->end : 0) + +static inline unsigned long get_physmem_reserved(enum reserved_range_type type, + unsigned long *addr, unsigned long *size) +{ + *addr = physmem_info.reserved[type].start; + *size = physmem_info.reserved[type].end - physmem_info.reserved[type].start; + return *size; +} + +#endif diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h index dd3d20c332ac..47d80a7451a6 100644 --- a/arch/s390/include/asm/pkey.h +++ b/arch/s390/include/asm/pkey.h @@ -2,7 +2,7 @@ /* * Kernelspace interface to the pkey device driver * - * Copyright IBM Corp. 2016,2019 + * Copyright IBM Corp. 2016, 2023 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -23,6 +23,6 @@ * @return 0 on success, negative errno value on failure */ int pkey_keyblob2pkey(const u8 *key, u32 keylen, - struct pkey_protkey *protkey); + u8 *protkey, u32 *protkeylen, u32 *protkeytype); #endif /* _KAPI_PKEY_H */ diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index b5ea9e14c017..bf15da0fedbc 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc) old, new) != old); } -#define init_task_preempt_count(p) do { } while (0) - -#define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_ENABLED; \ -} while (0) - static inline void set_preempt_need_resched(void) { __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); @@ -52,10 +46,17 @@ static inline bool test_preempt_need_resched(void) static inline void __preempt_count_add(int val) { - if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) - __atomic_add_const(val, &S390_lowcore.preempt_count); - else - __atomic_add(val, &S390_lowcore.preempt_count); + /* + * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES + * enabled, gcc 12 fails to handle __builtin_constant_p(). 
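+ * The fallback is a performance issue only: in the common case, e.g.
+ * preempt_disable() ending up in __preempt_count_add(1) (an
+ * illustrative example), the constant fits into a signed byte and
+ * __atomic_add_const() turns into a single add-immediate on the
+ * lowcore preempt_count.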
+ */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) { + if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) { + __atomic_add_const(val, &S390_lowcore.preempt_count); + return; + } + } + __atomic_add(val, &S390_lowcore.preempt_count); } static inline void __preempt_count_sub(int val) @@ -88,12 +89,6 @@ static inline void preempt_count_set(int pc) S390_lowcore.preempt_count = pc; } -#define init_task_preempt_count(p) do { } while (0) - -#define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_ENABLED; \ -} while (0) - static inline void set_preempt_need_resched(void) { } @@ -130,11 +125,15 @@ static inline bool should_resched(int preempt_offset) #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ -#ifdef CONFIG_PREEMPT -extern asmlinkage void preempt_schedule(void); +#define init_task_preempt_count(p) do { } while (0) +/* Deferred to CPU bringup time */ +#define init_idle_preempt_count(p, cpu) do { } while (0) + +#ifdef CONFIG_PREEMPTION +extern void preempt_schedule(void); #define __preempt_schedule() preempt_schedule() -extern asmlinkage void preempt_schedule_notrace(void); +extern void preempt_schedule_notrace(void); #define __preempt_schedule_notrace() preempt_schedule_notrace() -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 361ef5eda468..c0b6e74d899a 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -14,26 +14,20 @@ #include <linux/bits.h> -#define CIF_MCCK_PENDING 0 /* machine check handling is pending */ -#define CIF_ASCE_PRIMARY 1 /* primary asce needs fixup / uaccess */ -#define CIF_ASCE_SECONDARY 2 /* secondary asce needs fixup / uaccess */ -#define CIF_NOHZ_DELAY 3 /* delay HZ disable for a tick */ -#define CIF_FPU 4 /* restore FPU registers */ -#define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */ -#define CIF_ENABLED_WAIT 6 /* in enabled wait state */ -#define CIF_MCCK_GUEST 7 /* machine check happening in guest */ -#define CIF_DEDICATED_CPU 8 /* this CPU is dedicated */ - -#define _CIF_MCCK_PENDING BIT(CIF_MCCK_PENDING) -#define _CIF_ASCE_PRIMARY BIT(CIF_ASCE_PRIMARY) -#define _CIF_ASCE_SECONDARY BIT(CIF_ASCE_SECONDARY) +#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ +#define CIF_FPU 3 /* restore FPU registers */ +#define CIF_ENABLED_WAIT 5 /* in enabled wait state */ +#define CIF_MCCK_GUEST 6 /* machine check happening in guest */ +#define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */ + #define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY) #define _CIF_FPU BIT(CIF_FPU) -#define _CIF_IGNORE_IRQ BIT(CIF_IGNORE_IRQ) #define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT) #define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST) #define _CIF_DEDICATED_CPU BIT(CIF_DEDICATED_CPU) +#define RESTART_FLAG_CTLREGS _AC(1 << 0, U) + #ifndef __ASSEMBLY__ #include <linux/cpumask.h> @@ -46,30 +40,50 @@ #include <asm/runtime_instr.h> #include <asm/fpu/types.h> #include <asm/fpu/internal.h> +#include <asm/irqflags.h> + +typedef long (*sys_call_ptr_t)(struct pt_regs *regs); -static inline void set_cpu_flag(int flag) +static __always_inline void set_cpu_flag(int flag) { S390_lowcore.cpu_flags |= (1UL << flag); } -static inline void clear_cpu_flag(int flag) +static __always_inline void clear_cpu_flag(int flag) { S390_lowcore.cpu_flags &= ~(1UL << flag); } -static inline int test_cpu_flag(int flag) +static __always_inline bool test_cpu_flag(int flag) +{ + return S390_lowcore.cpu_flags & (1UL << flag); +} + 
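+/*
+ * The test_and_set/test_and_clear variants below are plain
+ * read-modify-write sequences on the calling CPU's lowcore flags,
+ * not atomic operations. An illustrative (made-up) caller:
+ *
+ *	if (!test_and_set_cpu_flag(CIF_ENABLED_WAIT))
+ *		...flag was clear before and is set now...
+ */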
+static __always_inline bool test_and_set_cpu_flag(int flag) { - return !!(S390_lowcore.cpu_flags & (1UL << flag)); + if (test_cpu_flag(flag)) + return true; + set_cpu_flag(flag); + return false; +} + +static __always_inline bool test_and_clear_cpu_flag(int flag) +{ + if (!test_cpu_flag(flag)) + return false; + clear_cpu_flag(flag); + return true; } /* * Test CIF flag of another CPU. The caller needs to ensure that * CPU hotplug can not happen, e.g. by disabling preemption. */ -static inline int test_cpu_flag_of(int flag, int cpu) +static __always_inline bool test_cpu_flag_of(int flag, int cpu) { struct lowcore *lc = lowcore_ptr[cpu]; - return !!(lc->cpu_flags & (1UL << flag)); + + return lc->cpu_flags & (1UL << flag); } #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY) @@ -84,65 +98,93 @@ void s390_update_cpu_mhz(void); void cpu_detect_mhz_feature(void); extern const struct seq_operations cpuinfo_op; -extern int sysctl_ieee_emulation_warnings; extern void execve_tail(void); -extern void __bpon(void); +unsigned long vdso_size(void); /* * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. */ -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \ - (1UL << 31) : -PAGE_SIZE) +#define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \ + _REGION3_SIZE : TASK_SIZE_MAX) #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ - (1UL << 30) : (1UL << 41)) -#define TASK_SIZE TASK_SIZE_OF(current) + (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) #define TASK_SIZE_MAX (-PAGE_SIZE) -#define STACK_TOP (test_thread_flag(TIF_31BIT) ? \ - (1UL << 31) : (1UL << 42)) -#define STACK_TOP_MAX (1UL << 42) +#define VDSO_BASE (STACK_TOP + PAGE_SIZE) +#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE) +#define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE) +#define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE) #define HAVE_ARCH_PICK_MMAP_LAYOUT -typedef unsigned int mm_segment_t; +#define __stackleak_poison __stackleak_poison +static __always_inline void __stackleak_poison(unsigned long erase_low, + unsigned long erase_high, + unsigned long poison) +{ + unsigned long tmp, count; + + count = erase_high - erase_low; + if (!count) + return; + asm volatile( + " cghi %[count],8\n" + " je 2f\n" + " aghi %[count],-(8+1)\n" + " srlg %[tmp],%[count],8\n" + " ltgr %[tmp],%[tmp]\n" + " jz 1f\n" + "0: stg %[poison],0(%[addr])\n" + " mvc 8(256-8,%[addr]),0(%[addr])\n" + " la %[addr],256(%[addr])\n" + " brctg %[tmp],0b\n" + "1: stg %[poison],0(%[addr])\n" + " larl %[tmp],3f\n" + " ex %[count],0(%[tmp])\n" + " j 4f\n" + "2: stg %[poison],0(%[addr])\n" + " j 4f\n" + "3: mvc 8(1,%[addr]),0(%[addr])\n" + "4:\n" + : [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp) + : [poison] "d" (poison) + : "memory", "cc" + ); +} /* * Thread structure */ struct thread_struct { unsigned int acrs[NUM_ACRS]; - unsigned long ksp; /* kernel stack pointer */ - unsigned long user_timer; /* task cputime in user space */ - unsigned long guest_timer; /* task cputime in kvm guest */ - unsigned long system_timer; /* task cputime in kernel space */ - unsigned long hardirq_timer; /* task cputime in hardirq context */ - unsigned long softirq_timer; /* task cputime in softirq context */ - unsigned long sys_call_table; /* system call table address */ - mm_segment_t mm_segment; - unsigned long gmap_addr; /* address of last gmap fault. 
*/ - unsigned int gmap_write_flag; /* gmap fault write indication */ - unsigned int gmap_int_code; /* int code of last gmap fault */ - unsigned int gmap_pfault; /* signal of a pending guest pfault */ + unsigned long ksp; /* kernel stack pointer */ + unsigned long user_timer; /* task cputime in user space */ + unsigned long guest_timer; /* task cputime in kvm guest */ + unsigned long system_timer; /* task cputime in kernel space */ + unsigned long hardirq_timer; /* task cputime in hardirq context */ + unsigned long softirq_timer; /* task cputime in softirq context */ + const sys_call_ptr_t *sys_call_table; /* system call table address */ + unsigned long gmap_addr; /* address of last gmap fault. */ + unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ + unsigned int gmap_pfault; /* signal of a pending guest pfault */ + /* Per-thread information related to debugging */ - struct per_regs per_user; /* User specified PER registers */ - struct per_event per_event; /* Cause of the last PER trap */ - unsigned long per_flags; /* Flags to control debug behavior */ - unsigned int system_call; /* system call number in signal */ - unsigned long last_break; /* last breaking-event-address. */ - /* pfault_wait is used to block the process on a pfault event */ + struct per_regs per_user; /* User specified PER registers */ + struct per_event per_event; /* Cause of the last PER trap */ + unsigned long per_flags; /* Flags to control debug behavior */ + unsigned int system_call; /* system call number in signal */ + unsigned long last_break; /* last breaking-event-address. */ + /* pfault_wait is used to block the process on a pfault event */ unsigned long pfault_wait; struct list_head list; /* cpu runtime instrumentation */ struct runtime_instr_cb *ri_cb; - struct gs_cb *gs_cb; /* Current guarded storage cb */ - struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */ - unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ - /* - * Warning: 'fpu' is dynamically-sized. It *MUST* be at - * the end. - */ - struct fpu fpu; /* FP and VX register save area */ + struct gs_cb *gs_cb; /* Current guarded storage cb */ + struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */ + struct pgm_tdb trap_tdb; /* Transaction abort diagnose block */ + struct fpu fpu; /* FP and VX register save area */ }; /* Flag to disable transactions. */ @@ -162,6 +204,7 @@ typedef struct thread_struct thread_struct; #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) &init_stack, \ .fpu.regs = (void *) init_task.thread.fpu.fprs, \ + .last_break = 1, \ } /* @@ -178,11 +221,9 @@ typedef struct thread_struct thread_struct; regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \ regs->psw.addr = new_psw; \ regs->gprs[15] = new_stackp; \ - crst_table_downgrade(current->mm); \ execve_tail(); \ } while (0) -/* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; struct seq_file; @@ -191,13 +232,11 @@ struct pt_regs; void show_registers(struct pt_regs *regs); void show_cacheinfo(struct seq_file *m); -/* Free all resources held by a thread. 
*/ -static inline void release_thread(struct task_struct *tsk) { } - /* Free guarded storage control block */ void guarded_storage_release(struct task_struct *tsk); +void gs_load_bc_cb(struct pt_regs *regs); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ (task_stack_page(tsk) + THREAD_SIZE) - 1) #define KSTK_EIP(tsk) (task_pt_regs(tsk)->psw.addr) @@ -206,15 +245,25 @@ unsigned long get_wchan(struct task_struct *p); /* Has task runtime instrumentation enabled ? */ #define is_ri_task(tsk) (!!(tsk)->thread.ri_cb) -static __always_inline unsigned long current_stack_pointer(void) +/* avoid using global register due to gcc bug in versions < 8.4 */ +#define current_stack_pointer (__current_stack_pointer()) + +static __always_inline unsigned long __current_stack_pointer(void) { unsigned long sp; - asm volatile("la %0,0(15)" : "=a" (sp)); + asm volatile("lgr %0,15" : "=d" (sp)); return sp; } -static __no_kasan_or_inline unsigned short stap(void) +static __always_inline bool on_thread_stack(void) +{ + unsigned long ksp = S390_lowcore.kernel_stack; + + return !((ksp ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); +} + +static __always_inline unsigned short stap(void) { unsigned short cpu_address; @@ -231,8 +280,7 @@ static inline unsigned long __ecag(unsigned int asi, unsigned char parm) { unsigned long val; - asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */ - : "=d" (val) : "a" (asi << 8 | parm)); + asm volatile("ecag %0,0,0(%1)" : "=d" (val) : "a" (asi << 8 | parm)); return val; } @@ -253,7 +301,7 @@ static inline void __load_psw(psw_t psw) * Set PSW mask to specified value, while leaving the * PSW addr pointing to the next instruction. */ -static __no_kasan_or_inline void __load_psw_mask(unsigned long mask) +static __always_inline void __load_psw_mask(unsigned long mask) { unsigned long addr; psw_t psw; @@ -279,14 +327,36 @@ static inline unsigned long __extract_psw(void) return (((unsigned long) reg1) << 32) | ((unsigned long) reg2); } -static inline void local_mcck_enable(void) +static inline unsigned long __local_mcck_save(void) { - __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK); + unsigned long mask = __extract_psw(); + + __load_psw_mask(mask & ~PSW_MASK_MCHECK); + return mask & PSW_MASK_MCHECK; +} + +#define local_mcck_save(mflags) \ +do { \ + typecheck(unsigned long, mflags); \ + mflags = __local_mcck_save(); \ +} while (0) + +static inline void local_mcck_restore(unsigned long mflags) +{ + unsigned long mask = __extract_psw(); + + mask &= ~PSW_MASK_MCHECK; + __load_psw_mask(mask | mflags); } static inline void local_mcck_disable(void) { - __load_psw_mask(__extract_psw() & ~PSW_MASK_MCHECK); + __local_mcck_save(); +} + +static inline void local_mcck_enable(void) +{ + __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK); } /* @@ -303,11 +373,6 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) } /* - * Function to stop a processor until the next interrupt occurs - */ -void enabled_wait(void); - -/* * Function to drop a processor into disabled wait state */ static __always_inline void __noreturn disabled_wait(void) @@ -320,30 +385,12 @@ static __always_inline void __noreturn disabled_wait(void) while (1); } -/* - * Basic Machine Check/Program Check Handler. 
- */ - -extern void s390_base_pgm_handler(void); -extern void s390_base_ext_handler(void); - -extern void (*s390_base_pgm_handler_fn)(void); -extern void (*s390_base_ext_handler_fn)(void); - #define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL -extern int memcpy_real(void *, void *, size_t); -extern void memcpy_absolute(void *, void *, size_t); - -#define mem_assign_absolute(dest, val) do { \ - __typeof__(dest) __tmp = (val); \ - \ - BUILD_BUG_ON(sizeof(__tmp) != sizeof(val)); \ - memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \ -} while (0) - -extern int s390_isolate_bp(void); -extern int s390_isolate_bp_guest(void); +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) +{ + return arch_irqs_disabled_flags(regs->psw.mask); +} #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h new file mode 100644 index 000000000000..f960b2896606 --- /dev/null +++ b/arch/s390/include/asm/ptdump.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_PTDUMP_H +#define _ASM_S390_PTDUMP_H + +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} + +#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index f009a13afe71..d28bf8fb2799 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -9,25 +9,54 @@ #include <linux/bits.h> #include <uapi/asm/ptrace.h> +#include <asm/tpi.h> -#define PIF_SYSCALL 0 /* inside a system call */ -#define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */ -#define PIF_SYSCALL_RESTART 2 /* restart the current system call */ -#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_SYSCALL 0 /* inside a system call */ +#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ +#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ +#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ -#define _PIF_SYSCALL BIT(PIF_SYSCALL) -#define _PIF_PER_TRAP BIT(PIF_PER_TRAP) -#define _PIF_SYSCALL_RESTART BIT(PIF_SYSCALL_RESTART) -#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_SYSCALL BIT(PIF_SYSCALL) +#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) +#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) +#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) -#ifndef __ASSEMBLY__ +#define PSW32_MASK_PER _AC(0x40000000, UL) +#define PSW32_MASK_DAT _AC(0x04000000, UL) +#define PSW32_MASK_IO _AC(0x02000000, UL) +#define PSW32_MASK_EXT _AC(0x01000000, UL) +#define PSW32_MASK_KEY _AC(0x00F00000, UL) +#define PSW32_MASK_BASE _AC(0x00080000, UL) /* Always one */ +#define PSW32_MASK_MCHECK _AC(0x00040000, UL) +#define PSW32_MASK_WAIT _AC(0x00020000, UL) +#define PSW32_MASK_PSTATE _AC(0x00010000, UL) +#define PSW32_MASK_ASC _AC(0x0000C000, UL) +#define PSW32_MASK_CC _AC(0x00003000, UL) +#define PSW32_MASK_PM _AC(0x00000f00, UL) +#define PSW32_MASK_RI _AC(0x00000080, UL) + +#define PSW32_ADDR_AMODE _AC(0x80000000, UL) +#define PSW32_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW32_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 20) + +#define PSW32_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW32_ASC_ACCREG _AC(0x00004000, UL) +#define PSW32_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW32_ASC_HOME _AC(0x0000C000, UL) + +#define PSW_DEFAULT_KEY ((PAGE_DEFAULT_ACC) 
<< 52) #define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \ - PSW_MASK_EA | PSW_MASK_BA) + PSW_MASK_EA | PSW_MASK_BA | PSW_MASK_DAT) #define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \ PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) +#ifndef __ASSEMBLY__ + struct psw_bits { unsigned long : 1; unsigned long per : 1; /* PER-Mask */ @@ -68,12 +97,19 @@ enum { &(*(struct psw_bits *)(&(__psw))); \ })) +typedef struct { + unsigned int mask; + unsigned int addr; +} psw_t32 __aligned(8); + +#define PGM_INT_CODE_MASK 0x7f +#define PGM_INT_CODE_PER 0x80 + /* * The pt_regs struct defines the way the registers are stored on * the stack during a system call. */ -struct pt_regs -{ +struct pt_regs { union { user_pt_regs user_regs; struct { @@ -83,10 +119,17 @@ struct pt_regs }; }; unsigned long orig_gpr2; - unsigned int int_code; - unsigned int int_parm; - unsigned long int_parm_long; + union { + struct { + unsigned int int_code; + unsigned int int_parm; + unsigned long int_parm_long; + }; + struct tpi_info tpi_info; + }; unsigned long flags; + unsigned long cr1; + unsigned long last_break; }; /* @@ -152,6 +195,14 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag) return !!(regs->flags & (1UL << flag)); } +static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag) +{ + int ret = test_pt_regs_flag(regs, flag); + + clear_pt_regs_flag(regs, flag); + return ret; +} + /* * These are defined as per linux/ptrace.h, which see. */ @@ -179,10 +230,34 @@ const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_kernel_argument() returns @n th argument of the function call. 
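+ *
+ * Example (an illustrative sketch): with the s390 ABI the first five
+ * arguments arrive in %r2-%r6, so for a probed function
+ * long copy_range(void *dst, void *src, long count)
+ * regs_get_kernel_argument(regs, 2) returns the value of count, read
+ * from %r4, while arguments past the fifth are read from the stack.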
+ */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ + unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long); + +#define NR_REG_ARGUMENTS 5 + if (n < NR_REG_ARGUMENTS) + return regs_get_register(regs, 2 + n); + n -= NR_REG_ARGUMENTS; + return regs_get_kernel_stack_nth(regs, argoffset + n); +} + static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->gprs[15]; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->gprs[2] = rc; +} + #endif /* __ASSEMBLY__ */ #endif /* _S390_PTRACE_H */ diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 71e3f0146cda..2f983e0b95e0 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -18,7 +18,6 @@ #define QDIO_MAX_BUFFERS_MASK (QDIO_MAX_BUFFERS_PER_Q - 1) #define QDIO_BUFNR(num) ((num) & QDIO_MAX_BUFFERS_MASK) #define QDIO_MAX_ELEMENTS_PER_BUFFER 16 -#define QDIO_SBAL_SIZE 256 #define QDIO_QETH_QFMT 0 #define QDIO_ZFCP_QFMT 1 @@ -26,9 +25,9 @@ /** * struct qdesfmt0 - queue descriptor, format 0 - * @sliba: storage list information block address - * @sla: storage list address - * @slsba: storage list state block address + * @sliba: absolute address of storage list information block + * @sla: absolute address of storage list + * @slsba: absolute address of storage list state block * @akey: access key for SLIB * @bkey: access key for SL * @ckey: access key for SBALs @@ -56,7 +55,7 @@ struct qdesfmt0 { * @oqdcnt: output queue descriptor count * @iqdsz: input queue descriptor size * @oqdsz: output queue descriptor size - * @qiba: queue information block address + * @qiba: absolute address of queue information block * @qkey: queue information block key * @qdf0: queue descriptions */ @@ -92,8 +91,8 @@ struct qdr { * @pfmt: implementation dependent parameter format * @rflags: QEBSM * @ac: adapter characteristics - * @isliba: absolute address of first input SLIB - * @osliba: absolute address of first output SLIB + * @isliba: logical address of first input SLIB + * @osliba: logical address of first output SLIB * @ebcnam: adapter identifier in EBCDIC * @parm: implementation dependent parameters */ @@ -134,10 +133,9 @@ struct slibe { * @sb_count: number of storage blocks * @sba: storage block element addresses * @dcount: size of storage block elements - * @user0: user defineable value - * @res4: reserved paramater - * @user1: user defineable value - * @user2: user defineable value + * @user0: user definable value + * @res4: reserved parameter + * @user1: user definable value */ struct qaob { u64 res0[6]; @@ -152,8 +150,7 @@ struct qaob { u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER]; u64 user0; u64 res4[2]; - u64 user1; - u64 user2; + u8 user1[16]; } __attribute__ ((packed, aligned(256))); /** @@ -201,7 +198,7 @@ struct slib { * @scount: SBAL count * @sflags: whole SBAL flags * @length: length - * @addr: address + * @addr: absolute data address */ struct qdio_buffer_element { u8 eflags; @@ -211,7 +208,7 @@ struct qdio_buffer_element { u8 scount; u8 sflags; u32 length; - void *addr; + u64 addr; } __attribute__ ((packed, aligned(16))); /** @@ -227,7 +224,7 @@ struct qdio_buffer { * @sbal: absolute SBAL address */ struct sl_element { - unsigned long sbal; + u64 sbal; } __attribute__ ((packed)); /** @@ -246,25 +243,8 @@ struct slsb { u8 val[QDIO_MAX_BUFFERS_PER_Q]; } __attribute__ ((packed, aligned(256))); -/** - * struct qdio_outbuf_state - SBAL related asynchronous operation information - * (for 
communication with upper layer programs) - * (only required for use with completion queues) - * @flags: flags indicating state of buffer - * @user: pointer to upper layer program's state information related to SBAL - * (stored in user1 data of QAOB) - */ -struct qdio_outbuf_state { - u8 flags; - void *user; -}; - -#define QDIO_OUTBUF_STATE_FLAG_PENDING 0x01 - -#define CHSC_AC1_INITIATE_INPUTQ 0x80 - - /* qdio adapter-characteristics-1 flag */ +#define CHSC_AC1_INITIATE_INPUTQ 0x80 #define AC1_SIGA_INPUT_NEEDED 0x40 /* process input queues */ #define AC1_SIGA_OUTPUT_NEEDED 0x20 /* process output queues */ #define AC1_SIGA_SYNC_NEEDED 0x10 /* ask hypervisor to sync */ @@ -310,14 +290,14 @@ struct qdio_ssqd_desc { typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, int, int, unsigned long); -/* qdio errors reported to the upper-layer program */ +/* qdio errors reported through the queue handlers: */ #define QDIO_ERROR_ACTIVATE 0x0001 #define QDIO_ERROR_GET_BUF_STATE 0x0002 #define QDIO_ERROR_SET_BUF_STATE 0x0004 -#define QDIO_ERROR_SLSB_STATE 0x0100 -#define QDIO_ERROR_FATAL 0x00ff -#define QDIO_ERROR_TEMPORARY 0xff00 +/* extra info for completed SBALs: */ +#define QDIO_ERROR_SLSB_STATE 0x0100 +#define QDIO_ERROR_SLSB_PENDING 0x0200 /* for qdio_cleanup */ #define QDIO_FLAG_CLEANUP_USING_CLEAR 0x01 @@ -325,109 +305,60 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, /** * struct qdio_initialize - qdio initialization data - * @cdev: associated ccw device * @q_format: queue format * @qdr_ac: feature flags to set - * @adapter_name: name for the adapter * @qib_param_field_format: format for qib_parm_field * @qib_param_field: pointer to 128 bytes or NULL, if no param field * @qib_rflags: rflags to set - * @input_slib_elements: pointer to no_input_qs * 128 words of data or NULL - * @output_slib_elements: pointer to no_output_qs * 128 words of data or NULL * @no_input_qs: number of input queues * @no_output_qs: number of output queues - * @input_handler: handler to be called for input queues + * @input_handler: handler to be called for input queues, and device-wide errors * @output_handler: handler to be called for output queues - * @queue_start_poll_array: polling handlers (one per input queue or NULL) + * @irq_poll: Data IRQ polling handler * @scan_threshold: # of in-use buffers that triggers scan on output queue * @int_parm: interruption parameter - * @input_sbal_addr_array: address of no_input_qs * 128 pointers - * @output_sbal_addr_array: address of no_output_qs * 128 pointers - * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL) + * @input_sbal_addr_array: per-queue array, each element points to 128 SBALs + * @output_sbal_addr_array: per-queue array, each element points to 128 SBALs */ struct qdio_initialize { - struct ccw_device *cdev; unsigned char q_format; unsigned char qdr_ac; - unsigned char adapter_name[8]; unsigned int qib_param_field_format; unsigned char *qib_param_field; unsigned char qib_rflags; - unsigned long *input_slib_elements; - unsigned long *output_slib_elements; unsigned int no_input_qs; unsigned int no_output_qs; qdio_handler_t *input_handler; qdio_handler_t *output_handler; - void (**queue_start_poll_array) (struct ccw_device *, int, - unsigned long); - unsigned int scan_threshold; + void (*irq_poll)(struct ccw_device *cdev, unsigned long data); unsigned long int_parm; - struct qdio_buffer **input_sbal_addr_array; - struct qdio_buffer **output_sbal_addr_array; - struct qdio_outbuf_state *output_sbal_state_array; 
+ struct qdio_buffer ***input_sbal_addr_array; + struct qdio_buffer ***output_sbal_addr_array; }; -/** - * enum qdio_brinfo_entry_type - type of address entry for qdio_brinfo_desc() - * @l3_ipv6_addr: entry contains IPv6 address - * @l3_ipv4_addr: entry contains IPv4 address - * @l2_addr_lnid: entry contains MAC address and VLAN ID - */ -enum qdio_brinfo_entry_type {l3_ipv6_addr, l3_ipv4_addr, l2_addr_lnid}; - -/** - * struct qdio_brinfo_entry_XXX - Address entry for qdio_brinfo_desc() - * @nit: Network interface token - * @addr: Address of one of the three types - * - * The struct is passed to the callback function by qdio_brinfo_desc() - */ -struct qdio_brinfo_entry_l3_ipv6 { - u64 nit; - struct { unsigned char _s6_addr[16]; } addr; -} __packed; -struct qdio_brinfo_entry_l3_ipv4 { - u64 nit; - struct { uint32_t _s_addr; } addr; -} __packed; -struct qdio_brinfo_entry_l2 { - u64 nit; - struct { u8 mac[6]; u16 lnid; } addr_lnid; -} __packed; - -#define QDIO_STATE_INACTIVE 0x00000002 /* after qdio_cleanup */ -#define QDIO_STATE_ESTABLISHED 0x00000004 /* after qdio_establish */ -#define QDIO_STATE_ACTIVE 0x00000008 /* after qdio_activate */ -#define QDIO_STATE_STOPPED 0x00000010 /* after queues went down */ - -#define QDIO_FLAG_SYNC_INPUT 0x01 -#define QDIO_FLAG_SYNC_OUTPUT 0x02 -#define QDIO_FLAG_PCI_OUT 0x10 - int qdio_alloc_buffers(struct qdio_buffer **buf, unsigned int count); void qdio_free_buffers(struct qdio_buffer **buf, unsigned int count); void qdio_reset_buffers(struct qdio_buffer **buf, unsigned int count); -extern int qdio_allocate(struct qdio_initialize *); -extern int qdio_establish(struct qdio_initialize *); +extern int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs, + unsigned int no_output_qs); +extern int qdio_establish(struct ccw_device *cdev, + struct qdio_initialize *init_data); extern int qdio_activate(struct ccw_device *); -extern void qdio_release_aob(struct qaob *); -extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int, - unsigned int); -extern int qdio_start_irq(struct ccw_device *, int); -extern int qdio_stop_irq(struct ccw_device *, int); -extern int qdio_get_next_buffers(struct ccw_device *, int, int *, int *); -extern int qdio_inspect_queue(struct ccw_device *cdev, unsigned int nr, - bool is_input, unsigned int *bufnr, - unsigned int *error); +extern int qdio_start_irq(struct ccw_device *cdev); +extern int qdio_stop_irq(struct ccw_device *cdev); +extern int qdio_inspect_input_queue(struct ccw_device *cdev, unsigned int nr, + unsigned int *bufnr, unsigned int *error); +extern int qdio_inspect_output_queue(struct ccw_device *cdev, unsigned int nr, + unsigned int *bufnr, unsigned int *error); +extern int qdio_add_bufs_to_input_queue(struct ccw_device *cdev, + unsigned int q_nr, unsigned int bufnr, + unsigned int count); +extern int qdio_add_bufs_to_output_queue(struct ccw_device *cdev, + unsigned int q_nr, unsigned int bufnr, + unsigned int count, struct qaob *aob); extern int qdio_shutdown(struct ccw_device *, int); extern int qdio_free(struct ccw_device *); extern int qdio_get_ssqd_desc(struct ccw_device *, struct qdio_ssqd_desc *); -extern int qdio_pnso_brinfo(struct subchannel_id schid, - int cnc, u16 *response, - void (*cb)(void *priv, enum qdio_brinfo_entry_type type, - void *entry), - void *priv); #endif /* __QDIO_H__ */ diff --git a/arch/s390/include/asm/rwonce.h b/arch/s390/include/asm/rwonce.h new file mode 100644 index 000000000000..91fc24520e82 --- /dev/null +++ b/arch/s390/include/asm/rwonce.h @@ -0,0 +1,31 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_S390_RWONCE_H +#define __ASM_S390_RWONCE_H + +#include <linux/compiler_types.h> + +/* + * Use READ_ONCE_ALIGNED_128() for 128-bit block concurrent (atomic) read + * accesses. Note that x must be 128-bit aligned, otherwise a specification + * exception is generated. + */ +#define READ_ONCE_ALIGNED_128(x) \ +({ \ + union { \ + typeof(x) __x; \ + __uint128_t val; \ + } __u; \ + \ + BUILD_BUG_ON(sizeof(x) != 16); \ + asm volatile( \ + " lpq %[val],%[_x]\n" \ + : [val] "=d" (__u.val) \ + : [_x] "QS" (x) \ + : "memory"); \ + __u.__x; \ +}) + +#include <asm-generic/rwonce.h> + +#endif /* __ASM_S390_RWONCE_H */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index c563f8368b19..5742d23bba13 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -1,18 +1,25 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright IBM Corp. 2007 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #ifndef _ASM_S390_SCLP_H #define _ASM_S390_SCLP_H #include <linux/types.h> -#include <asm/chpid.h> -#include <asm/cpu.h> #define SCLP_CHP_INFO_MASK_SIZE 32 -#define SCLP_MAX_CORES 256 +#define EARLY_SCCB_SIZE PAGE_SIZE +#define SCLP_MAX_CORES 512 +/* 144 + 16 * SCLP_MAX_CORES + 2 * (SCLP_MAX_CORES - 1) */ +#define EXT_SCCB_READ_SCP (3 * PAGE_SIZE) +/* 24 + 16 * SCLP_MAX_CORES */ +#define EXT_SCCB_READ_CPU (3 * PAGE_SIZE) + +#ifndef __ASSEMBLY__ +#include <linux/uio.h> +#include <asm/chpid.h> +#include <asm/cpu.h> struct sclp_chp_info { u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; @@ -79,8 +86,15 @@ struct sclp_info { unsigned char has_kss : 1; unsigned char has_gisaf : 1; unsigned char has_diag318 : 1; + unsigned char has_diag320 : 1; unsigned char has_sipl : 1; + unsigned char has_sipl_eckd : 1; unsigned char has_dirq : 1; + unsigned char has_iplcc : 1; + unsigned char has_zpci_lsi : 1; + unsigned char has_aisii : 1; + unsigned char has_aeni : 1; + unsigned char has_aisi : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; @@ -105,17 +119,21 @@ struct zpci_report_error_header { * (OpenCrypto Successful Diagnostics Execution) */ u16 length; /* Length of Subsequent Data (up to 4K – SCLP header */ - u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */ + u8 data[]; /* Subsequent Data passed verbatim to SCLP ET 24 */ } __packed; +extern char *sclp_early_sccb; + +void sclp_early_adjust_va(void); +void sclp_early_set_buffer(void *sccb); int sclp_early_read_info(void); int sclp_early_read_storage_info(void); int sclp_early_get_core_info(struct sclp_core_info *info); void sclp_early_get_ipl_info(struct sclp_ipl_info *info); void sclp_early_detect(void); void sclp_early_printk(const char *s); -void sclp_early_printk_force(const char *s); -void __sclp_early_printk(const char *s, unsigned int len, unsigned int force); +void __sclp_early_printk(const char *s, unsigned int len); +void sclp_emergency_printk(const char *s); int sclp_early_get_memsize(unsigned long *mem); int sclp_early_get_hsa_size(unsigned long *hsa_size); @@ -129,9 +147,10 @@ int sclp_chp_deconfigure(struct chp_id chpid); int sclp_chp_read_info(struct sclp_chp_info *info); int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); +int sclp_ap_configure(u32 apid); +int sclp_ap_deconfigure(u32 apid); int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid); -int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count); -int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count); +size_t 
memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count); void sclp_ocf_cpc_name_copy(char *dst); static inline int sclp_get_core_info(struct sclp_core_info *info, int early) @@ -141,4 +160,5 @@ static inline int sclp_get_core_info(struct sclp_core_info *info, int early) return _sclp_get_core_info(info); } +#endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h index c00f7b031628..322bdcd4b616 100644 --- a/arch/s390/include/asm/scsw.h +++ b/arch/s390/include/asm/scsw.h @@ -215,6 +215,11 @@ union scsw { #define SNS2_ENV_DATA_PRESENT 0x10 #define SNS2_INPRECISE_END 0x04 +/* + * architectured values for PPRC errors + */ +#define SNS7_INVALID_ON_SEC 0x0e + /** * scsw_is_tm - check for transport mode scsw * @scsw: pointer to scsw @@ -508,9 +513,21 @@ static inline int scsw_cmd_is_valid_zcc(union scsw *scsw) */ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) { - return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && - !(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS); + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; } /** @@ -522,11 +539,25 @@ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) */ static inline int scsw_cmd_is_valid_pno(union scsw *scsw) { - return (scsw->cmd.fctl != 0) && - (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && - (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) || - ((scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->cmd.actl & SCSW_ACTL_SUSPENDED))); + /* Must indicate at least one I/O function. */ + if (!scsw->cmd.fctl) + return 0; + + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->cmd.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; } /** @@ -676,9 +707,21 @@ static inline int scsw_tm_is_valid_q(union scsw *scsw) */ static inline int scsw_tm_is_valid_ectl(union scsw *scsw) { - return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && - !(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS); + /* Must be status pending. */ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; } /** @@ -690,11 +733,25 @@ static inline int scsw_tm_is_valid_ectl(union scsw *scsw) */ static inline int scsw_tm_is_valid_pno(union scsw *scsw) { - return (scsw->tm.fctl != 0) && - (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && - (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) || - ((scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && - (scsw->tm.actl & SCSW_ACTL_SUSPENDED))); + /* Must indicate at least one I/O function. */ + if (!scsw->tm.fctl) + return 0; + + /* Must be status pending. 
*/ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->tm.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; } /** diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h index 795bbe0d7ca6..71d46f0ba97b 100644 --- a/arch/s390/include/asm/seccomp.h +++ b/arch/s390/include/asm/seccomp.h @@ -16,4 +16,13 @@ #include <asm-generic/seccomp.h> +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "s390x" +#ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "s390" +#endif + #endif /* _ASM_S390_SECCOMP_H */ diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 42de04ad9c07..0486e6ef62bf 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -2,20 +2,8 @@ #ifndef _S390_SECTIONS_H #define _S390_SECTIONS_H -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include <asm-generic/sections.h> -extern bool initmem_freed; - -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - if (!initmem_freed) - return 0; - return addr >= (unsigned long)__init_begin && - addr < (unsigned long)__init_end; -} - /* * .boot.data section contains variables "shared" between the decompressor and * the decompressed kernel. The decompressor will store values in them, and @@ -26,16 +14,16 @@ static inline int arch_is_kernel_initmem_freed(unsigned long addr) * final .boot.data section, which should be identical in the decompressor and * the decompressed kernel (that is checked during the build). */ -#define __bootdata(var) __section(.boot.data.var) var +#define __bootdata(var) __section(".boot.data." #var) var /* * .boot.preserved.data is similar to .boot.data, but it is not part of the * .init section and thus will be preserved for later use in the decompressed * kernel. */ -#define __bootdata_preserved(var) __section(.boot.preserved.data.var) var +#define __bootdata_preserved(var) __section(".boot.preserved.data." 
#var) var -extern unsigned long __sdma, __edma; -extern unsigned long __stext_dma, __etext_dma; +extern char *__samode31, *__eamode31; +extern char *__stext_amode31, *__etext_amode31; #endif diff --git a/arch/s390/include/asm/serial.h b/arch/s390/include/asm/serial.h deleted file mode 100644 index aaf85a69061c..000000000000 --- a/arch/s390/include/asm/serial.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_SERIAL_H -#define _ASM_S390_SERIAL_H - -#define BASE_BAUD 0 - -#endif /* _ASM_S390_SERIAL_H */ diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h index c59a83536c70..06fbabe2f66c 100644 --- a/arch/s390/include/asm/set_memory.h +++ b/arch/s390/include/asm/set_memory.h @@ -2,31 +2,65 @@ #ifndef _ASMS390_SET_MEMORY_H #define _ASMS390_SET_MEMORY_H -#define SET_MEMORY_RO 1UL -#define SET_MEMORY_RW 2UL -#define SET_MEMORY_NX 4UL -#define SET_MEMORY_X 8UL +#include <linux/mutex.h> -int __set_memory(unsigned long addr, int numpages, unsigned long flags); +extern struct mutex cpa_mutex; -static inline int set_memory_ro(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_RO); -} +enum { + _SET_MEMORY_RO_BIT, + _SET_MEMORY_RW_BIT, + _SET_MEMORY_NX_BIT, + _SET_MEMORY_X_BIT, + _SET_MEMORY_4K_BIT, + _SET_MEMORY_INV_BIT, + _SET_MEMORY_DEF_BIT, +}; -static inline int set_memory_rw(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_RW); -} +#define SET_MEMORY_RO BIT(_SET_MEMORY_RO_BIT) +#define SET_MEMORY_RW BIT(_SET_MEMORY_RW_BIT) +#define SET_MEMORY_NX BIT(_SET_MEMORY_NX_BIT) +#define SET_MEMORY_X BIT(_SET_MEMORY_X_BIT) +#define SET_MEMORY_4K BIT(_SET_MEMORY_4K_BIT) +#define SET_MEMORY_INV BIT(_SET_MEMORY_INV_BIT) +#define SET_MEMORY_DEF BIT(_SET_MEMORY_DEF_BIT) -static inline int set_memory_nx(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_NX); -} +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags); -static inline int set_memory_x(unsigned long addr, int numpages) -{ - return __set_memory(addr, numpages, SET_MEMORY_X); +#define set_memory_rox set_memory_rox + +/* + * Generate two variants of each set_memory() function: + * + * set_memory_yy(unsigned long addr, int numpages); + * __set_memory_yy(void *start, void *end); + * + * The second variant exists both for convenience, to avoid the usual + * (unsigned long) casts, and because, unlike the first variant, it can + * also be used for areas larger than 8TB, which may happen at memory + * initialization.
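+ *
+ * For example (an illustrative sketch, page being a void * to a
+ * page-aligned buffer), both calls below make one page read-only:
+ *
+ *	set_memory_ro((unsigned long)page, 1);
+ *	__set_memory_ro(page, page + PAGE_SIZE);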
+ */ +#define __SET_MEMORY_FUNC(fname, flags) \ +static inline int fname(unsigned long addr, int numpages) \ +{ \ + return __set_memory(addr, numpages, (flags)); \ +} \ + \ +static inline int __##fname(void *start, void *end) \ +{ \ + unsigned long numpages; \ + \ + numpages = (end - start) >> PAGE_SHIFT; \ + return __set_memory((unsigned long)start, numpages, (flags)); \ } +__SET_MEMORY_FUNC(set_memory_ro, SET_MEMORY_RO) +__SET_MEMORY_FUNC(set_memory_rw, SET_MEMORY_RW) +__SET_MEMORY_FUNC(set_memory_nx, SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_x, SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rox, SET_MEMORY_RO | SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rwnx, SET_MEMORY_RW | SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K) + +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 6dc6c4fbc8e2..03bcaa8effb2 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -8,15 +8,11 @@ #include <linux/bits.h> #include <uapi/asm/setup.h> +#include <linux/build_bug.h> -#define EP_OFFSET 0x10008 -#define EP_STRING "S390EP" #define PARMAREA 0x10400 -#define EARLY_SCCB_OFFSET 0x11000 -#define HEAD_END 0x12000 - -#define EARLY_SCCB_SIZE PAGE_SIZE +#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE /* * Machine features detected in early.c */ @@ -27,17 +23,17 @@ #define MACHINE_FLAG_DIAG9C BIT(3) #define MACHINE_FLAG_ESOP BIT(4) #define MACHINE_FLAG_IDTE BIT(5) -#define MACHINE_FLAG_DIAG44 BIT(6) #define MACHINE_FLAG_EDAT1 BIT(7) #define MACHINE_FLAG_EDAT2 BIT(8) #define MACHINE_FLAG_TOPOLOGY BIT(10) #define MACHINE_FLAG_TE BIT(11) #define MACHINE_FLAG_TLB_LC BIT(12) -#define MACHINE_FLAG_VX BIT(13) #define MACHINE_FLAG_TLB_GUEST BIT(14) #define MACHINE_FLAG_NX BIT(15) #define MACHINE_FLAG_GS BIT(16) #define MACHINE_FLAG_SCC BIT(17) +#define MACHINE_FLAG_PCI_MIO BIT(18) +#define MACHINE_FLAG_RDP BIT(19) #define LPP_MAGIC BIT(31) #define LPP_PID_MASK _AC(0xffffffff, UL) @@ -47,28 +43,13 @@ #define STARTUP_NORMAL_OFFSET 0x10000 #define STARTUP_KDUMP_OFFSET 0x10010 -/* Offsets to parameters in kernel/head.S */ - -#define IPL_DEVICE_OFFSET 0x10400 -#define INITRD_START_OFFSET 0x10408 -#define INITRD_SIZE_OFFSET 0x10410 -#define OLDMEM_BASE_OFFSET 0x10418 -#define OLDMEM_SIZE_OFFSET 0x10420 -#define KERNEL_VERSION_OFFSET 0x10428 -#define COMMAND_LINE_OFFSET 0x10480 +#define LEGACY_COMMAND_LINE_SIZE 896 #ifndef __ASSEMBLY__ #include <asm/lowcore.h> #include <asm/types.h> -#define IPL_DEVICE (*(unsigned long *) (IPL_DEVICE_OFFSET)) -#define INITRD_START (*(unsigned long *) (INITRD_START_OFFSET)) -#define INITRD_SIZE (*(unsigned long *) (INITRD_SIZE_OFFSET)) -#define OLDMEM_BASE (*(unsigned long *) (OLDMEM_BASE_OFFSET)) -#define OLDMEM_SIZE (*(unsigned long *) (OLDMEM_SIZE_OFFSET)) -#define COMMAND_LINE ((char *) (COMMAND_LINE_OFFSET)) - struct parmarea { unsigned long ipl_device; /* 0x10400 */ unsigned long initrd_start; /* 0x10408 */ @@ -76,16 +57,25 @@ struct parmarea { unsigned long oldmem_base; /* 0x10418 */ unsigned long oldmem_size; /* 0x10420 */ unsigned long kernel_version; /* 0x10428 */ - char pad1[0x10480 - 0x10430]; /* 0x10430 - 0x10480 */ - char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */ + unsigned long max_command_line_size; /* 0x10430 */ + char pad1[0x10480-0x10438]; /* 0x10438 - 0x10480 */ + char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */ }; -extern int noexec_disabled; -extern int memory_end_set; 
-extern unsigned long memory_end; -extern unsigned long vmalloc_size; -extern unsigned long max_physmem_end; -extern unsigned long __swsusp_reset_dma; +extern struct parmarea parmarea; + +extern unsigned int zlib_dfltcc_support; +#define ZLIB_DFLTCC_DISABLED 0 +#define ZLIB_DFLTCC_FULL 1 +#define ZLIB_DFLTCC_DEFLATE_ONLY 2 +#define ZLIB_DFLTCC_INFLATE_ONLY 3 +#define ZLIB_DFLTCC_FULL_DEBUG 4 + +extern unsigned long ident_map_size; +extern unsigned long max_mappable; + +/* The Write Back bit position in the physaddr is given by the SLPC PCI */ +extern unsigned long mio_wb_bit_mask; #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -94,17 +84,17 @@ extern unsigned long __swsusp_reset_dma; #define MACHINE_HAS_DIAG9C (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C) #define MACHINE_HAS_ESOP (S390_lowcore.machine_flags & MACHINE_FLAG_ESOP) #define MACHINE_HAS_IDTE (S390_lowcore.machine_flags & MACHINE_FLAG_IDTE) -#define MACHINE_HAS_DIAG44 (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG44) #define MACHINE_HAS_EDAT1 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT1) #define MACHINE_HAS_EDAT2 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT2) #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) -#define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) #define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) #define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) +#define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO) +#define MACHINE_HAS_RDP (S390_lowcore.machine_flags & MACHINE_FLAG_RDP) /* * Console mode. 
Override with conmode= @@ -113,9 +103,6 @@ extern unsigned int console_mode; extern unsigned int console_devno; extern unsigned int console_irq; -extern char vmhalt_cmd[]; -extern char vmpoff_cmd[]; - #define CONSOLE_IS_UNDEFINED (console_mode == 0) #define CONSOLE_IS_SCLP (console_mode == 1) #define CONSOLE_IS_3215 (console_mode == 2) @@ -128,14 +115,6 @@ extern char vmpoff_cmd[]; #define SET_CONSOLE_VT220 do { console_mode = 4; } while (0) #define SET_CONSOLE_HVC do { console_mode = 5; } while (0) -#ifdef CONFIG_PFAULT -extern int pfault_init(void); -extern void pfault_fini(void); -#else /* CONFIG_PFAULT */ -#define pfault_init() ({-1;}) -#define pfault_fini() do { } while (0) -#endif /* CONFIG_PFAULT */ - #ifdef CONFIG_VMCP void vmcp_cma_reserve(void); #else @@ -144,9 +123,6 @@ static inline void vmcp_cma_reserve(void) { } void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -void cmma_init(void); -void cmma_init_nodat(void); - extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); @@ -157,14 +133,24 @@ static inline unsigned long kaslr_offset(void) return __kaslr_offset; } -#else /* __ASSEMBLY__ */ +extern int __kaslr_enabled; +static inline int kaslr_enabled(void) +{ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return __kaslr_enabled; + return 0; +} -#define IPL_DEVICE (IPL_DEVICE_OFFSET) -#define INITRD_START (INITRD_START_OFFSET) -#define INITRD_SIZE (INITRD_SIZE_OFFSET) -#define OLDMEM_BASE (OLDMEM_BASE_OFFSET) -#define OLDMEM_SIZE (OLDMEM_SIZE_OFFSET) -#define COMMAND_LINE (COMMAND_LINE_OFFSET) +struct oldmem_data { + unsigned long start; + unsigned long size; +}; +extern struct oldmem_data oldmem_data; +static __always_inline u32 gen_lpswe(unsigned long addr) +{ + BUILD_BUG_ON(addr > 0xfff); + return 0xb2b20000 | addr; +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_SETUP_H */ diff --git a/arch/s390/include/asm/shmparam.h b/arch/s390/include/asm/shmparam.h deleted file mode 100644 index e75d45649c54..000000000000 --- a/arch/s390/include/asm/shmparam.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * - * Derived from "include/asm-i386/shmparam.h" - */ -#ifndef _ASM_S390_SHMPARAM_H -#define _ASM_S390_SHMPARAM_H - -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ - -#endif /* _ASM_S390_SHMPARAM_H */ diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index 53ee795cd3d3..edee63da08e7 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -41,15 +41,17 @@ static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm, u32 *status) { - register unsigned long reg1 asm ("1") = parm; + union register_pair r1 = { .odd = parm, }; int cc; asm volatile( - " sigp %1,%2,0(%3)\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc), "+d" (reg1) : "d" (addr), "a" (order) : "cc"); - *status = reg1; + " sigp %[r1],%[addr],0(%[order])\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [r1] "+&d" (r1.pair) + : [addr] "d" (addr), [order] "a" (order) + : "cc"); + *status = r1.even; return cc; } diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index b157a81fb977..6e5b1b4b19a9 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -3,13 +3,13 @@ * Copyright IBM Corp. 
1999, 2012 * Author(s): Denis Joseph Barrow, * Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #ifndef __ASM_SMP_H #define __ASM_SMP_H #include <asm/sigp.h> #include <asm/lowcore.h> +#include <asm/processor.h> #define raw_smp_processor_id() (S390_lowcore.cpu_nr) @@ -17,6 +17,7 @@ extern struct mutex smp_cpu_state_mutex; extern unsigned int smp_cpu_mt_shift; extern unsigned int smp_cpu_mtid; extern __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; +extern cpumask_t cpu_setup_mask; extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); @@ -29,11 +30,12 @@ extern void smp_emergency_stop(void); extern int smp_find_processor_id(u16 address); extern int smp_store_status(int cpu); -extern void smp_save_dump_cpus(void); -extern int smp_vcpu_scheduled(int cpu); +extern void smp_save_dump_ipl_cpu(void); +extern void smp_save_dump_secondary_cpus(void); extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); +extern int smp_cpu_get_cpu_address(int cpu); extern void smp_fill_possible_mask(void); extern void smp_detect_cpus(void); @@ -53,9 +55,15 @@ static inline int smp_get_base_cpu(int cpu) return cpu - (cpu % (smp_cpu_mtid + 1)); } +static inline void smp_cpus_done(unsigned int max_cpus) +{ +} + extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); +extern void schedule_mcck_handler(void); +void notrace smp_yield_cpu(int cpu); #endif /* __ASM_SMP_H */ diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h new file mode 100644 index 000000000000..1ac5115d3115 --- /dev/null +++ b/arch/s390/include/asm/softirq_stack.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_S390_SOFTIRQ_STACK_H +#define __ASM_S390_SOFTIRQ_STACK_H + +#include <asm/lowcore.h> +#include <asm/stacktrace.h> + +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK +static inline void do_softirq_own_stack(void) +{ + call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq); +} +#endif +#endif /* __ASM_S390_SOFTIRQ_STACK_H */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 3a37172d5398..37127cd7749e 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -67,14 +67,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lp) arch_spin_lock_wait(lp); } -static inline void arch_spin_lock_flags(arch_spinlock_t *lp, - unsigned long flags) -{ - if (!arch_spin_trylock_once(lp)) - arch_spin_lock_wait(lp); -} -#define arch_spin_lock_flags arch_spin_lock_flags - static inline int arch_spin_trylock(arch_spinlock_t *lp) { if (!arch_spin_trylock_once(lp)) @@ -85,10 +77,11 @@ static inline int arch_spin_trylock(arch_spinlock_t *lp) static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); + kcsan_release(); asm_inline volatile( - ALTERNATIVE("", ".long 0xb2fa0070", 49) /* NIAI 7 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */ " sth %1,%0\n" - : "=Q" (((unsigned short *) &lp->lock)[1]) + : "=R" (((unsigned short *) &lp->lock)[1]) : "d" (0) : "cc", "memory"); } diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index cfed272e4fd5..b69695e39957 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -2,13 +2,13 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define 
__ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif typedef struct { int lock; -} __attribute__ ((aligned (4))) arch_spinlock_t; +} arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, } diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index ee056f4a4fa3..31ec4f545e03 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -6,12 +6,20 @@ #include <linux/ptrace.h> #include <asm/switch_to.h> +struct stack_frame_user { + unsigned long back_chain; + unsigned long empty1[5]; + unsigned long gprs[10]; + unsigned long empty2[4]; +}; + enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_NODAT, STACK_TYPE_RESTART, + STACK_TYPE_MCCK, }; struct stack_info { @@ -33,37 +41,27 @@ static inline bool on_stack(struct stack_info *info, return addr >= info->begin && addr + len <= info->end; } -static __always_inline unsigned long get_stack_pointer(struct task_struct *task, - struct pt_regs *regs) -{ - if (regs) - return (unsigned long) kernel_stack_pointer(regs); - if (task == current) - return current_stack_pointer(); - return (unsigned long) task->thread.ksp; -} - /* * Stack layout of a C stack frame. + * Kernel uses the packed stack layout (-mpacked-stack). */ -#ifndef __PACK_STACK struct stack_frame { - unsigned long back_chain; - unsigned long empty1[5]; - unsigned long gprs[10]; - unsigned int empty2[8]; -}; -#else -struct stack_frame { - unsigned long empty1[5]; - unsigned int empty2[8]; + union { + unsigned long empty[9]; + struct { + unsigned long sie_control_block; + unsigned long sie_savearea; + unsigned long sie_reason; + unsigned long sie_flags; + unsigned long sie_control_block_phys; + }; + }; unsigned long gprs[10]; unsigned long back_chain; }; -#endif /* - * Unlike current_stack_pointer() which simply returns current value of %r15 + * Unlike current_stack_pointer, which simply contains the current value of %r15, * current_frame_address() returns function stack frame address, which matches * %r15 upon function invocation.
It may differ from %r15 later if function * allocates stack for local variables or new stack frame to call other @@ -73,29 +71,26 @@ struct stack_frame { ((unsigned long)__builtin_frame_address(0) - \ offsetof(struct stack_frame, back_chain)) -#define CALL_ARGS_0() \ - register unsigned long r2 asm("2") -#define CALL_ARGS_1(arg1) \ - register unsigned long r2 asm("2") = (unsigned long)(arg1) -#define CALL_ARGS_2(arg1, arg2) \ - CALL_ARGS_1(arg1); \ - register unsigned long r3 asm("3") = (unsigned long)(arg2) -#define CALL_ARGS_3(arg1, arg2, arg3) \ - CALL_ARGS_2(arg1, arg2); \ - register unsigned long r4 asm("4") = (unsigned long)(arg3) -#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ - CALL_ARGS_3(arg1, arg2, arg3); \ - register unsigned long r4 asm("5") = (unsigned long)(arg4) -#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ - CALL_ARGS_4(arg1, arg2, arg3, arg4); \ - register unsigned long r4 asm("6") = (unsigned long)(arg5) - -#define CALL_FMT_0 "=&d" (r2) : -#define CALL_FMT_1 "+&d" (r2) : -#define CALL_FMT_2 CALL_FMT_1 "d" (r3), -#define CALL_FMT_3 CALL_FMT_2 "d" (r4), -#define CALL_FMT_4 CALL_FMT_3 "d" (r5), -#define CALL_FMT_5 CALL_FMT_4 "d" (r6), +static __always_inline unsigned long get_stack_pointer(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs) + return (unsigned long)kernel_stack_pointer(regs); + if (task == current) + return current_frame_address(); + return (unsigned long)task->thread.ksp; +} + +/* + * To keep this simple mark register 2-6 as being changed (volatile) + * by the called function, even though register 6 is saved/nonvolatile. + */ +#define CALL_FMT_0 "=&d" (r2) +#define CALL_FMT_1 "+&d" (r2) +#define CALL_FMT_2 CALL_FMT_1, "+&d" (r3) +#define CALL_FMT_3 CALL_FMT_2, "+&d" (r4) +#define CALL_FMT_4 CALL_FMT_3, "+&d" (r5) +#define CALL_FMT_5 CALL_FMT_4, "+&d" (r6) #define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" #define CALL_CLOBBER_4 CALL_CLOBBER_5 @@ -104,35 +99,150 @@ struct stack_frame { #define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" #define CALL_CLOBBER_0 CALL_CLOBBER_1 -#define CALL_ON_STACK(fn, stack, nr, args...) \ +#define CALL_LARGS_0(...) \ + long dummy = 0 +#define CALL_LARGS_1(t1, a1) \ + long arg1 = (long)(t1)(a1) +#define CALL_LARGS_2(t1, a1, t2, a2) \ + CALL_LARGS_1(t1, a1); \ + long arg2 = (long)(t2)(a2) +#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3) \ + CALL_LARGS_2(t1, a1, t2, a2); \ + long arg3 = (long)(t3)(a3) +#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4) \ + CALL_LARGS_3(t1, a1, t2, a2, t3, a3); \ + long arg4 = (long)(t4)(a4) +#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \ + CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4); \ + long arg5 = (long)(t5)(a5) + +#define CALL_REGS_0 \ + register long r2 asm("2") = dummy +#define CALL_REGS_1 \ + register long r2 asm("2") = arg1 +#define CALL_REGS_2 \ + CALL_REGS_1; \ + register long r3 asm("3") = arg2 +#define CALL_REGS_3 \ + CALL_REGS_2; \ + register long r4 asm("4") = arg3 +#define CALL_REGS_4 \ + CALL_REGS_3; \ + register long r5 asm("5") = arg4 +#define CALL_REGS_5 \ + CALL_REGS_4; \ + register long r6 asm("6") = arg5 + +#define CALL_TYPECHECK_0(...) +#define CALL_TYPECHECK_1(t, a, ...) \ + typecheck(t, a) +#define CALL_TYPECHECK_2(t, a, ...) \ + CALL_TYPECHECK_1(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_3(t, a, ...) \ + CALL_TYPECHECK_2(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_4(t, a, ...) \ + CALL_TYPECHECK_3(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_5(t, a, ...) 
\ + CALL_TYPECHECK_4(__VA_ARGS__); \ + typecheck(t, a) + +#define CALL_PARM_0(...) void +#define CALL_PARM_1(t, a, ...) t +#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__) +#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__) +#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__) +#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__) +#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__) + +/* + * Use call_on_stack() to call a function switching to a specified + * stack. Proper sign and zero extension of function arguments is + * done. Usage: + * + * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - stack specifies the stack to be used. + * - fn is the function to be called. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + */ +#define call_on_stack(nr, stack, rettype, fn, ...) \ ({ \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn; \ unsigned long frame = current_frame_address(); \ - CALL_ARGS_##nr(args); \ + unsigned long __stack = stack; \ unsigned long prev; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ asm volatile( \ - " la %[_prev],0(15)\n" \ + " lgr %[_prev],15\n" \ " lg 15,%[_stack]\n" \ " stg %[_frame],%[_bc](15)\n" \ " brasl 14,%[_fn]\n" \ - " la 15,0(%[_prev])\n" \ - : [_prev] "=&a" (prev), CALL_FMT_##nr \ - [_stack] "R" (stack), \ + " lgr 15,%[_prev]\n" \ + : [_prev] "=&d" (prev), CALL_FMT_##nr \ + : [_stack] "R" (__stack), \ [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ [_frame] "d" (frame), \ - [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ - r2; \ + [_fn] "X" (__fn) : CALL_CLOBBER_##nr); \ + (rettype)r2; \ }) -#define CALL_ON_STACK_NORETURN(fn, stack) \ +/* + * Use call_nodat() to call a function with DAT disabled. + * Proper sign and zero extension of function arguments is done. + * Usage: + * + * rc = call_nodat(nr, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - fn is the function to be called, where fn is a physical address. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + * + * fn() is called with standard C function call ABI, with the exception + * that no useful stackframe or stackpointer is passed via register 15. + * Therefore the called function must not use r15 to access the stack. + */ +#define call_nodat(nr, rettype, fn, ...) 
\ ({ \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = (fn); \ + /* aligned since psw_leave must not cross page boundary */ \ + psw_t __aligned(16) psw_leave; \ + psw_t psw_enter; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ + \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ + psw_enter.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; \ + psw_enter.addr = (unsigned long)__fn; \ asm volatile( \ - " la 15,0(%[_stack])\n" \ - " xc %[_bc](8,15),%[_bc](15)\n" \ - " brasl 14,%[_fn]\n" \ - ::[_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_stack] "a" (stack), [_fn] "X" (fn)); \ - BUG(); \ + " epsw 0,1\n" \ + " risbg 1,0,0,31,32\n" \ + " larl 7,1f\n" \ + " stg 1,%[psw_leave]\n" \ + " stg 7,8+%[psw_leave]\n" \ + " la 7,%[psw_leave]\n" \ + " lra 7,0(7)\n" \ + " larl 1,0f\n" \ + " lra 14,0(1)\n" \ + " lpswe %[psw_enter]\n" \ + "0: lpswe 0(7)\n" \ + "1:\n" \ + : CALL_FMT_##nr, [psw_leave] "=Q" (psw_leave) \ + : [psw_enter] "Q" (psw_enter) \ + : "7", CALL_CLOBBER_##nr); \ + (rettype)r2; \ }) #endif /* _ASM_S390_STACKTRACE_H */ diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h index f0ddefb06ec8..4d74d7e33340 100644 --- a/arch/s390/include/asm/stp.h +++ b/arch/s390/include/asm/stp.h @@ -6,43 +6,89 @@ #ifndef __S390_STP_H #define __S390_STP_H +#include <linux/compiler.h> + /* notifier for syncs */ extern struct atomic_notifier_head s390_epoch_delta_notifier; /* STP interruption parameter */ struct stp_irq_parm { - unsigned int _pad0 : 14; - unsigned int tsc : 1; /* Timing status change */ - unsigned int lac : 1; /* Link availability change */ - unsigned int tcpc : 1; /* Time control parameter change */ - unsigned int _pad2 : 15; -} __attribute__ ((packed)); + u32 : 14; + u32 tsc : 1; /* Timing status change */ + u32 lac : 1; /* Link availability change */ + u32 tcpc : 1; /* Time control parameter change */ + u32 : 15; +} __packed; #define STP_OP_SYNC 1 #define STP_OP_CTRL 3 struct stp_sstpi { - unsigned int rsvd0; - unsigned int rsvd1 : 8; - unsigned int stratum : 8; - unsigned int vbits : 16; - unsigned int leaps : 16; - unsigned int tmd : 4; - unsigned int ctn : 4; - unsigned int rsvd2 : 3; - unsigned int c : 1; - unsigned int tst : 4; - unsigned int tzo : 16; - unsigned int dsto : 16; - unsigned int ctrl : 16; - unsigned int rsvd3 : 16; - unsigned int tto; - unsigned int rsvd4; - unsigned int ctnid[3]; - unsigned int rsvd5; - unsigned int todoff[4]; - unsigned int rsvd6[48]; -} __attribute__ ((packed)); + u32 : 32; + u32 tu : 1; + u32 lu : 1; + u32 : 6; + u32 stratum : 8; + u32 vbits : 16; + u32 leaps : 16; + u32 tmd : 4; + u32 ctn : 4; + u32 : 3; + u32 c : 1; + u32 tst : 4; + u32 tzo : 16; + u32 dsto : 16; + u32 ctrl : 16; + u32 : 16; + u32 tto; + u32 : 32; + u32 ctnid[3]; + u32 : 32; + u64 todoff; + u32 rsvd[50]; +} __packed; + +struct stp_tzib { + u32 tzan : 16; + u32 : 16; + u32 tzo : 16; + u32 dsto : 16; + u32 stn; + u32 dstn; + u64 dst_on_alg; + u64 dst_off_alg; +} __packed; + +struct stp_tcpib { + u32 atcode : 4; + u32 ntcode : 4; + u32 d : 1; + u32 : 23; + s32 tto; + struct stp_tzib atzib; + struct stp_tzib ntzib; + s32 adst_offset : 16; + s32 ndst_offset : 16; + u32 rsvd1; + u64 ntzib_update; + u64 ndsto_update; +} __packed; + +struct stp_lsoib { + u32 p : 1; + u32 : 31; + s32 also : 16; + s32 nlso : 16; + u64 nlsout; +} __packed; + +struct stp_stzi { + u32 rsvd0[3]; + u64 data_ts; + u32 rsvd1[22]; + struct stp_tcpib tcpib; + struct stp_lsoib lsoib; +} __packed; /* Functions needed by the machine check handler */ int stp_sync_check(void); diff --git 
a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 4c0690fc5167..351685de53d2 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -31,22 +31,18 @@ void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_STRCMP /* arch function */ #define __HAVE_ARCH_STRCPY /* inline & arch function */ #define __HAVE_ARCH_STRLCAT /* arch function */ -#define __HAVE_ARCH_STRLCPY /* arch function */ #define __HAVE_ARCH_STRLEN /* inline & arch function */ #define __HAVE_ARCH_STRNCAT /* arch function */ #define __HAVE_ARCH_STRNCPY /* arch function */ #define __HAVE_ARCH_STRNLEN /* inline & arch function */ -#define __HAVE_ARCH_STRRCHR /* arch function */ #define __HAVE_ARCH_STRSTR /* arch function */ /* Prototypes for non-inlined arch strings functions. */ int memcmp(const void *s1, const void *s2, size_t n); int strcmp(const char *s1, const char *s2); size_t strlcat(char *dest, const char *src, size_t n); -size_t strlcpy(char *dest, const char *src, size_t size); char *strncat(char *dest, const char *src, size_t n); char *strncpy(char *dest, const char *src, size_t n); -char *strrchr(const char *s, int c); char *strstr(const char *s1, const char *s2); #endif /* !CONFIG_KASAN */ @@ -59,18 +55,6 @@ char *strstr(const char *s1, const char *s2); #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) -extern void *__memcpy(void *dest, const void *src, size_t n); -extern void *__memset(void *s, int c, size_t n); -extern void *__memmove(void *dest, const void *src, size_t n); - -/* - * For files that are not instrumented (e.g. mm/slub.c) we - * should use not instrumented version of mem* functions. - */ - -#define memcpy(dst, src, len) __memcpy(dst, src, len) -#define memmove(dst, src, len) __memmove(dst, src, len) -#define memset(s, c, n) __memset(s, c, n) #define strlen(s) __strlen(s) #define __no_sanitize_prefix_strfunc(x) __##x @@ -83,6 +67,9 @@ extern void *__memmove(void *dest, const void *src, size_t n); #define __no_sanitize_prefix_strfunc(x) x #endif /* defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) */ +void *__memcpy(void *dest, const void *src, size_t n); +void *__memset(void *s, int c, size_t n); +void *__memmove(void *dest, const void *src, size_t n); void *__memset16(uint16_t *s, uint16_t v, size_t count); void *__memset32(uint32_t *s, uint32_t v, size_t count); void *__memset64(uint64_t *s, uint64_t v, size_t count); @@ -107,16 +94,18 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t count) #ifdef __HAVE_ARCH_MEMCHR static inline void *memchr(const void * s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; asm volatile( - "0: srst %0,%1\n" + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" " jo 0b\n" " jl 1f\n" - " la %0,0\n" + " la %[ret],0\n" "1:" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); return (void *) ret; } #endif @@ -124,13 +113,15 @@ static inline void *memchr(const void * s, int c, size_t n) #ifdef __HAVE_ARCH_MEMSCAN static inline void *memscan(void *s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; asm volatile( - "0: srst %0,%1\n" + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" " jo 0b\n" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); return (void *) ret; } #endif @@ -138,17 +129,18 @@ static inline void *memscan(void *s, int c, size_t n) #ifdef 
__HAVE_ARCH_STRCAT static inline char *strcat(char *dst, const char *src) { - register int r0 asm("0") = 0; - unsigned long dummy; + unsigned long dummy = 0; char *ret = dst; asm volatile( - "0: srst %0,%1\n" + " lghi 0,0\n" + "0: srst %[dummy],%[dst]\n" " jo 0b\n" - "1: mvst %0,%2\n" + "1: mvst %[dummy],%[src]\n" " jo 1b" - : "=&a" (dummy), "+a" (dst), "+a" (src) - : "d" (r0), "0" (0) : "cc", "memory" ); + : [dummy] "+&a" (dummy), [dst] "+&a" (dst), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } #endif @@ -156,14 +148,15 @@ static inline char *strcat(char *dst, const char *src) #ifdef __HAVE_ARCH_STRCPY static inline char *strcpy(char *dst, const char *src) { - register int r0 asm("0") = 0; char *ret = dst; asm volatile( - "0: mvst %0,%1\n" + " lghi 0,0\n" + "0: mvst %[dst],%[src]\n" " jo 0b" - : "+&a" (dst), "+&a" (src) : "d" (r0) - : "cc", "memory"); + : [dst] "+&a" (dst), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } #endif @@ -171,28 +164,33 @@ static inline char *strcpy(char *dst, const char *src) #if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s) { - register unsigned long r0 asm("0") = 0; + unsigned long end = 0; const char *tmp = s; asm volatile( - "0: srst %0,%1\n" + " lghi 0,0\n" + "0: srst %[end],%[tmp]\n" " jo 0b" - : "+d" (r0), "+a" (tmp) : : "cc", "memory"); - return r0 - (unsigned long) s; + : [end] "+&a" (end), [tmp] "+&a" (tmp) + : + : "cc", "memory", "0"); + return end - (unsigned long)s; } #endif #ifdef __HAVE_ARCH_STRNLEN static inline size_t strnlen(const char * s, size_t n) { - register int r0 asm("0") = 0; const char *tmp = s; const char *end = s + n; asm volatile( - "0: srst %0,%1\n" + " lghi 0,0\n" + "0: srst %[end],%[tmp]\n" " jo 0b" - : "+a" (end), "+a" (tmp) : "d" (r0) : "cc", "memory"); + : [end] "+&a" (end), [tmp] "+&a" (tmp) + : + : "cc", "memory", "0"); return end - s; } #endif diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index f073292e9fdb..27e3d804b311 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -14,8 +14,8 @@ #include <linux/err.h> #include <asm/ptrace.h> -extern const unsigned long sys_call_table[]; -extern const unsigned long sys_call_table_emu[]; +extern const sys_call_ptr_t sys_call_table[]; +extern const sys_call_ptr_t sys_call_table_emu[]; static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) @@ -33,7 +33,17 @@ static inline void syscall_rollback(struct task_struct *task, static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { - return IS_ERR_VALUE(regs->gprs[2]) ? regs->gprs[2] : 0; + unsigned long error = regs->gprs[2]; +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) { + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long)(int)error; + } +#endif + return IS_ERR_VALUE(error) ? error : 0; } static inline long syscall_get_return_value(struct task_struct *task, @@ -46,6 +56,7 @@ static inline void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val) { + set_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); regs->gprs[2] = error ? 
error : val; } @@ -67,18 +78,6 @@ static inline void syscall_get_arguments(struct task_struct *task, args[0] = regs->orig_gpr2 & mask; } -static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, - const unsigned long *args) -{ - unsigned int n = 6; - - while (n-- > 0) - if (n > 0) - regs->gprs[2 + n] = args[n]; - regs->orig_gpr2 = args[0]; -} - static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_COMPAT @@ -87,4 +86,69 @@ static inline int syscall_get_arch(struct task_struct *task) #endif return AUDIT_ARCH_S390X; } + +static inline bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ + return false; +} + +#define SYSCALL_FMT_0 +#define SYSCALL_FMT_1 , "0" (r2) +#define SYSCALL_FMT_2 , "d" (r3) SYSCALL_FMT_1 +#define SYSCALL_FMT_3 , "d" (r4) SYSCALL_FMT_2 +#define SYSCALL_FMT_4 , "d" (r5) SYSCALL_FMT_3 +#define SYSCALL_FMT_5 , "d" (r6) SYSCALL_FMT_4 +#define SYSCALL_FMT_6 , "d" (r7) SYSCALL_FMT_5 + +#define SYSCALL_PARM_0 +#define SYSCALL_PARM_1 , long arg1 +#define SYSCALL_PARM_2 SYSCALL_PARM_1, long arg2 +#define SYSCALL_PARM_3 SYSCALL_PARM_2, long arg3 +#define SYSCALL_PARM_4 SYSCALL_PARM_3, long arg4 +#define SYSCALL_PARM_5 SYSCALL_PARM_4, long arg5 +#define SYSCALL_PARM_6 SYSCALL_PARM_5, long arg6 + +#define SYSCALL_REGS_0 +#define SYSCALL_REGS_1 \ + register long r2 asm("2") = arg1 +#define SYSCALL_REGS_2 \ + SYSCALL_REGS_1; \ + register long r3 asm("3") = arg2 +#define SYSCALL_REGS_3 \ + SYSCALL_REGS_2; \ + register long r4 asm("4") = arg3 +#define SYSCALL_REGS_4 \ + SYSCALL_REGS_3; \ + register long r5 asm("5") = arg4 +#define SYSCALL_REGS_5 \ + SYSCALL_REGS_4; \ + register long r6 asm("6") = arg5 +#define SYSCALL_REGS_6 \ + SYSCALL_REGS_5; \ + register long r7 asm("7") = arg6 + +#define GENERATE_SYSCALL_FUNC(nr) \ +static __always_inline \ +long syscall##nr(unsigned long syscall SYSCALL_PARM_##nr) \ +{ \ + register unsigned long r1 asm ("1") = syscall; \ + register long rc asm ("2"); \ + SYSCALL_REGS_##nr; \ + \ + asm volatile ( \ + " svc 0\n" \ + : "=d" (rc) \ + : "d" (r1) SYSCALL_FMT_##nr \ + : "memory"); \ + return rc; \ +} + +GENERATE_SYSCALL_FUNC(0) +GENERATE_SYSCALL_FUNC(1) +GENERATE_SYSCALL_FUNC(2) +GENERATE_SYSCALL_FUNC(3) +GENERATE_SYSCALL_FUNC(4) +GENERATE_SYSCALL_FUNC(5) +GENERATE_SYSCALL_FUNC(6) + #endif /* _ASM_SYSCALL_H */ diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 3c3d6fe8e2f0..35c1d1b860d8 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -7,9 +7,13 @@ #ifndef _ASM_S390_SYSCALL_WRAPPER_H #define _ASM_S390_SYSCALL_WRAPPER_H +/* Mapping of registers to parameters for syscalls */ +#define SC_S390_REGS_TO_ARGS(x, ...) \ + __MAP(x, __SC_ARGS \ + ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \ + ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7]) + #ifdef CONFIG_COMPAT -#define __SC_COMPAT_TYPE(t, a) \ - __typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a #define __SC_COMPAT_CAST(t, a) \ ({ \ @@ -29,107 +33,108 @@ (t)__ReS; \ }) -#define __S390_SYS_STUBx(x, name, ...) 
\ - asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ - ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ - asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ - { \ - long ret = __s390x_sys##name(__MAP(x,__SC_COMPAT_CAST,__VA_ARGS__));\ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } - /* * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias * named __s390x_sys_*() */ #define COMPAT_SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __s390_compat_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_compat__sys_##sname, ERRNO); \ - asmlinkage long __s390_compat_sys_##sname(void) + long __s390_compat_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ + long __s390_compat_sys_##sname(void) #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __s390x_sys_##sname(void); \ + long __s390_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \ + long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - asmlinkage long __s390_sys_##sname(void) \ - __attribute__((alias(__stringify(__s390x_sys_##sname)))); \ - asmlinkage long __s390x_sys_##sname(void) + static inline long __do_sys_##sname(void); \ + long __s390_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name); \ cond_syscall(__s390_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); \ - SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers) - -#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments");\ - asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ - __attribute__((alias(__stringify(__se_compat_sys##name)))); \ - ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ - asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ - { \ - long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } \ - __diag_pop(); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) 
\ + long __s390_compat_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + long __s390_compat_sys##name(struct pt_regs *regs) \ + { \ + return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \ + } \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ cond_syscall(__s390_compat_sys_##name) -#define COMPAT_SYS_NI(name) \ - SYSCALL_ALIAS(__s390_compat_sys_##name, sys_ni_posix_timers) +#define __S390_SYS_STUBx(x, name, ...) \ + long __s390_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + long __s390_sys##name(struct pt_regs *regs) \ + { \ + return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \ + } #else /* CONFIG_COMPAT */ -#define __S390_SYS_STUBx(x, fullname, name, ...) - #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __s390x_sys_##sname(void); \ + long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - asmlinkage long __s390x_sys_##sname(void) + static inline long __do_sys_##sname(void); \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); +#define __S390_SYS_STUBx(x, fullname, name, ...) #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) 
\ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments");\ - asmlinkage long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ - __attribute__((alias(__stringify(__se_sys##name)))); \ + long __s390x_sys##name(struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - __S390_SYS_STUBx(x, name, __VA_ARGS__) \ - asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + __S390_SYS_STUBx(x, name, __VA_ARGS__); \ + long __s390x_sys##name(struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ { \ - long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_CAST, __VA_ARGS__)); \ } \ - __diag_pop(); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) -#endif /* _ASM_X86_SYSCALL_WRAPPER_H */ +#endif /* _ASM_S390_SYSCALL_WRAPPER_H */ diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index fe7b3f8f0791..edca5a751df4 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -40,6 +40,10 @@ struct sysinfo_1_1_1 { unsigned int ncr; unsigned int npr; unsigned int ntr; + char reserved_3[4]; + char model_var_cap[16]; + unsigned int model_var_cap_rating; + unsigned int nvr; }; struct sysinfo_1_2_1 { @@ -67,12 +71,12 @@ struct sysinfo_1_2_2 { unsigned short cpus_configured; unsigned short cpus_standby; unsigned short cpus_reserved; - unsigned short adjustment[0]; + unsigned short adjustment[]; }; struct sysinfo_1_2_2_extension { unsigned int alt_capability; - unsigned short alt_adjustment[0]; + unsigned short alt_adjustment[]; }; struct sysinfo_2_2_1 { @@ -181,7 +185,7 @@ struct sysinfo_15_1_x { unsigned char reserved1; unsigned char mnest; unsigned char reserved2[4]; - union topology_entry tle[0]; + union topology_entry tle[]; }; int stsi(void *sysinfo, int fc, int sel1, int sel2); diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h deleted file mode 100644 index 46fa3020b41e..000000000000 --- a/arch/s390/include/asm/termios.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 version - * - * Derived from "include/asm-i386/termios.h" - */ -#ifndef _S390_TERMIOS_H -#define _S390_TERMIOS_H - -#include <uapi/asm/termios.h> - - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios2)) -#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios2)) - -#include <asm-generic/termios-base.h> - -#endif /* _S390_TERMIOS_H */ diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h new file mode 100644 index 000000000000..b219056a8817 --- /dev/null +++ 
b/arch/s390/include/asm/text-patching.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_TEXT_PATCHING_H +#define _ASM_S390_TEXT_PATCHING_H + +#include <asm/barrier.h> + +static __always_inline void sync_core(void) +{ + bcr_serialize(); +} + +void text_poke_sync(void); +void text_poke_sync_lock(void); + +#endif /* _ASM_S390_TEXT_PATCHING_H */ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index e582fbe59e20..a674c7d25da5 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -9,6 +9,9 @@ #define _ASM_THREAD_INFO_H #include <linux/bits.h> +#ifndef ASM_OFFSETS_C +#include <asm/asm-offsets.h> +#endif /* * General size of kernel stacks @@ -18,16 +21,14 @@ #else #define THREAD_SIZE_ORDER 2 #endif -#define BOOT_STACK_ORDER 2 +#define BOOT_STACK_SIZE (PAGE_SIZE << 2) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) +#define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE) + #ifndef __ASSEMBLY__ #include <asm/lowcore.h> #include <asm/page.h> -#include <asm/processor.h> - -#define STACK_INIT_OFFSET \ - (THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs)) /* * low level task data that entry.S needs immediate access to @@ -37,6 +38,8 @@ */ struct thread_info { unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ + unsigned int cpu; /* current CPU */ }; /* @@ -47,8 +50,7 @@ struct thread_info { .flags = 0, \ } -void arch_release_task_struct(struct task_struct *tsk); -int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); +struct task_struct; void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec @@ -66,8 +68,9 @@ void arch_setup_new_exec(void); #define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ #define TIF_PATCH_PENDING 5 /* pending live patching update */ #define TIF_PGSTE 6 /* New mm's will use 4K page tables */ -#define TIF_ISOLATE_BP 8 /* Run process with isolated BP */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ #define TIF_31BIT 16 /* 32bit process */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ @@ -83,13 +86,14 @@ void arch_setup_new_exec(void); #define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) #define _TIF_UPROBE BIT(TIF_UPROBE) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) -#define _TIF_ISOLATE_BP BIT(TIF_ISOLATE_BP) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) +#define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT BIT(TIF_31BIT) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 6da8885251d6..4d646659a5f5 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -19,6 +19,25 @@ extern u64 clock_comparator_max; +union tod_clock { + __uint128_t val; + struct { + __uint128_t ei : 8; /* epoch index */ + __uint128_t tod : 64; /* bits 0-63 of tod clock */ + __uint128_t : 40; + __uint128_t pf : 16; /* programmable field */ + }; + struct { + __uint128_t eitod : 72; /* epoch index + bits 0-63 
tod clock */ + __uint128_t : 56; + }; + struct { + __uint128_t us : 60; /* micro-seconds */ + __uint128_t sus : 12; /* sub-microseconds */ + __uint128_t : 56; + }; +} __packed; + /* Inline functions for clock register access. */ static inline int set_tod_clock(__u64 time) { @@ -32,26 +51,36 @@ static inline int set_tod_clock(__u64 time) return cc; } -static inline int store_tod_clock(__u64 *time) +static inline int store_tod_clock_ext_cc(union tod_clock *clk) { int cc; asm volatile( - " stck %1\n" + " stcke %1\n" " ipm %0\n" " srl %0,28\n" - : "=d" (cc), "=Q" (*time) : : "cc"); + : "=d" (cc), "=Q" (*clk) : : "cc"); return cc; } +static __always_inline void store_tod_clock_ext(union tod_clock *tod) +{ + asm volatile("stcke %0" : "=Q" (*tod) : : "cc"); +} + static inline void set_clock_comparator(__u64 time) { asm volatile("sckc %0" : : "Q" (time)); } -static inline void store_clock_comparator(__u64 *time) +static inline void set_tod_programmable_field(u16 val) { - asm volatile("stckc %0" : "=Q" (*time)); + asm volatile( + " lgr 0,%[val]\n" + " sckpf\n" + : + : [val] "d" ((unsigned long)val) + : "0"); } void clock_comparator_work(void); @@ -72,10 +101,10 @@ extern unsigned char ptff_function_mask[16]; /* Query TOD offset result */ struct ptff_qto { - unsigned long long physical_clock; - unsigned long long tod_offset; - unsigned long long logical_tod_offset; - unsigned long long tod_epoch_difference; + unsigned long physical_clock; + unsigned long tod_offset; + unsigned long logical_tod_offset; + unsigned long tod_epoch_difference; } __packed; static inline int ptff_query(unsigned int nr) @@ -112,22 +141,25 @@ struct ptff_qui { #define ptff(ptff_block, len, func) \ ({ \ struct addrtype { char _[len]; }; \ - register unsigned int reg0 asm("0") = func; \ - register unsigned long reg1 asm("1") = (unsigned long) (ptff_block);\ + unsigned int reg0 = func; \ + unsigned long reg1 = (unsigned long)(ptff_block); \ int rc; \ \ asm volatile( \ - " .word 0x0104\n" \ - " ipm %0\n" \ - " srl %0,28\n" \ - : "=d" (rc), "+m" (*(struct addrtype *) reg1) \ - : "d" (reg0), "d" (reg1) : "cc"); \ + " lgr 0,%[reg0]\n" \ + " lgr 1,%[reg1]\n" \ + " ptff\n" \ + " ipm %[rc]\n" \ + " srl %[rc],28\n" \ + : [rc] "=&d" (rc), "+m" (*(struct addrtype *)reg1) \ + : [reg0] "d" (reg0), [reg1] "d" (reg1) \ + : "cc", "0", "1"); \ rc; \ }) -static inline unsigned long long local_tick_disable(void) +static inline unsigned long local_tick_disable(void) { - unsigned long long old; + unsigned long old; old = S390_lowcore.clock_comparator; S390_lowcore.clock_comparator = clock_comparator_max; @@ -135,53 +167,47 @@ static inline unsigned long long local_tick_disable(void) return old; } -static inline void local_tick_enable(unsigned long long comp) +static inline void local_tick_enable(unsigned long comp) { S390_lowcore.clock_comparator = comp; set_clock_comparator(S390_lowcore.clock_comparator); } #define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ -#define STORE_CLOCK_EXT_SIZE 16 /* stcke writes 16 bytes */ - -typedef unsigned long long cycles_t; -static inline void get_tod_clock_ext(char *clk) -{ - typedef struct { char _[STORE_CLOCK_EXT_SIZE]; } addrtype; - - asm volatile("stcke %0" : "=Q" (*(addrtype *) clk) : : "cc"); -} +typedef unsigned long cycles_t; -static inline unsigned long long get_tod_clock(void) +static __always_inline unsigned long get_tod_clock(void) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; + union tod_clock clk; - get_tod_clock_ext(clk); - return *((unsigned long long *)&clk[1]); + store_tod_clock_ext(&clk); + 
return clk.tod; } -static inline unsigned long long get_tod_clock_fast(void) +static inline unsigned long get_tod_clock_fast(void) { -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - unsigned long long clk; + unsigned long clk; asm volatile("stckf %0" : "=Q" (clk) : : "cc"); return clk; -#else - return get_tod_clock(); -#endif } static inline cycles_t get_cycles(void) { return (cycles_t) get_tod_clock() >> 2; } +#define get_cycles get_cycles int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); -extern unsigned char tod_clock_base[16] __aligned(8); +extern union tod_clock tod_clock_base; + +static __always_inline unsigned long __get_tod_clock_monotonic(void) +{ + return get_tod_clock() - tod_clock_base.tod; +} /** * get_clock_monotonic - returns current time in clock rate units @@ -190,13 +216,13 @@ extern unsigned char tod_clock_base[16] __aligned(8); * Therefore preemption must be disabled, otherwise the returned * value is not guaranteed to be monotonic. */ -static inline unsigned long long get_tod_clock_monotonic(void) +static inline unsigned long get_tod_clock_monotonic(void) { - unsigned long long tod; + unsigned long tod; - preempt_disable(); - tod = get_tod_clock() - *(unsigned long long *) &tod_clock_base[1]; - preempt_enable(); + preempt_disable_notrace(); + tod = __get_tod_clock_monotonic(); + preempt_enable_notrace(); return tod; } @@ -219,7 +245,7 @@ static inline unsigned long long get_tod_clock_monotonic(void) * -> ns = (th * 125) + ((tl * 125) >> 9); * */ -static inline unsigned long long tod_to_ns(unsigned long long todval) +static __always_inline unsigned long tod_to_ns(unsigned long todval) { return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } @@ -231,10 +257,10 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) * * Returns: true if a is later than b */ -static inline int tod_after(unsigned long long a, unsigned long long b) +static inline int tod_after(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a > (long long) b; + return (long) a > (long) b; return a > b; } @@ -245,10 +271,10 @@ static inline int tod_after(unsigned long long a, unsigned long long b) * * Returns: true if a is later than b */ -static inline int tod_after_eq(unsigned long long a, unsigned long long b) +static inline int tod_after_eq(unsigned long a, unsigned long b) { if (MACHINE_HAS_SCC) - return (long long) a >= (long long) b; + return (long) a >= (long) b; return a >= b; } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index aa406c05a350..d1455a601adc 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -25,10 +25,8 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size); - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) + struct encoded_page *page, + int page_size); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -36,7 +34,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, #define p4d_free_tlb p4d_free_tlb #define pud_free_tlb pud_free_tlb -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm-generic/tlb.h> @@ -44,11 +41,15 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. 
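A quick sanity check of the tod_to_ns() split above: one TOD unit is 125/512 ns, so tod_to_ns(512) = (512 >> 9) * 125 + ((512 & 0x1ff) * 125 >> 9) = 125 + 0 = 125 ns, exactly 512 * 125/512. For a value exercising both halves, tod_to_ns(0x201) = 125 + (125 >> 9) = 125 ns; the sub-nanosecond remainder of 513 * 125/512 = 125.24 ns is truncated by the integer arithmetic, as intended.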
In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. + * + * s390 doesn't delay rmap removal, so there is nothing encoded in + * the page pointer. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) + struct encoded_page *page, + int page_size) { - free_page_and_swap_cache(page); + free_page_and_swap_cache(encoded_page_ptr(page)); return false; } @@ -67,13 +68,10 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb->cleared_ptes = 1; - /* - * page_table_free_rcu takes care of the allocation bit masks - * of the 2K table fragments in the 4K page table page, - * then calls tlb_remove_table. - */ - page_table_free_rcu(tlb, (unsigned long *) pte, address); + tlb->cleared_pmds = 1; + if (mm_alloc_pgste(tlb->mm)) + gmap_unlink(tlb->mm, (unsigned long *)pte, address); + tlb_remove_ptdesc(tlb, pte); } /* @@ -88,12 +86,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, pmd); } /* @@ -111,8 +109,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb->cleared_p4ds = 1; - tlb_remove_table(tlb, p4d); + tlb_remove_ptdesc(tlb, p4d); } /* @@ -129,8 +126,8 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, return; tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb->cleared_puds = 1; - tlb_remove_table(tlb, pud); + tlb->cleared_p4ds = 1; + tlb_remove_ptdesc(tlb, pud); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 82703e03f35d..a6e2cd89b609 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -5,8 +5,6 @@ #include <linux/mm.h> #include <linux/sched.h> #include <asm/processor.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> /* * Flush all TLB entries on the local CPU. @@ -27,13 +25,9 @@ static inline void __tlb_flush_idte(unsigned long asce) if (MACHINE_HAS_TLB_GUEST) opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ - asm volatile( - " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (opt), "a" (asce) : "cc"); + asm volatile("idte 0,%1,%0" : : "a" (opt), "a" (asce) : "cc"); } -void smp_ptlb_all(void); - /* * Flush all TLB entries on all CPUs. 
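A hedged sketch of how __tlb_flush_idte() above is typically selected; MACHINE_HAS_IDTE, __tlb_flush_global() and mm->context.asce come from elsewhere in arch/s390 and are assumptions here, not part of this hunk:

static inline void tlb_flush_mm_sketch(struct mm_struct *mm)
{
	unsigned long asce = mm->context.asce;

	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte(asce);		/* flush by ASCE on all CPUs */
	else
		__tlb_flush_global();		/* no IDTE: flush everything */
}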
*/ diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index cca406fdbe51..3a0ac0c7a9a3 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -16,8 +16,8 @@ struct cpu_topology_s390 { unsigned short socket_id; unsigned short book_id; unsigned short drawer_id; - unsigned short node_id; unsigned short dedicated : 1; + int booted_cores; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; @@ -25,7 +25,6 @@ struct cpu_topology_s390 { }; extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; -extern cpumask_t cpus_with_topology; #define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) #define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) @@ -37,6 +36,7 @@ extern cpumask_t cpus_with_topology; #define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) #define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) #define topology_cpu_dedicated(cpu) (cpu_topology[cpu].dedicated) +#define topology_booted_cores(cpu) (cpu_topology[cpu].booted_cores) #define mc_capable() 1 @@ -45,6 +45,7 @@ int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); +void update_cpu_masks(void); void topology_expect_change(void); const struct cpumask *cpu_coregroup_mask(int cpu); @@ -54,6 +55,8 @@ static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } static inline int topology_cpu_dedicated(int cpu_nr) { return 0; } +static inline int topology_booted_cores(int cpu_nr) { return 1; } +static inline void update_cpu_masks(void) { } static inline void topology_expect_change(void) { } #endif /* CONFIG_SCHED_TOPOLOGY */ @@ -71,20 +74,18 @@ static inline void topology_expect_change(void) { } #define cpu_to_node cpu_to_node static inline int cpu_to_node(int cpu) { - return cpu_topology[cpu].node_id; + return 0; } /* Returns a pointer to the cpumask of CPUs on node 'node'. */ #define cpumask_of_node cpumask_of_node static inline const struct cpumask *cpumask_of_node(int node) { - return &node_to_cpumask_map[node]; + return cpu_possible_mask; } #define pcibus_to_node(bus) __pcibus_to_node(bus) -#define node_distance(a, b) __node_distance(a, b) - #else /* !CONFIG_NUMA */ #define numa_node_id numa_node_id diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h new file mode 100644 index 000000000000..f76e5fdff23a --- /dev/null +++ b/arch/s390/include/asm/tpi.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_S390_TPI_H +#define _ASM_S390_TPI_H + +#include <linux/types.h> +#include <uapi/asm/schid.h> + +#ifndef __ASSEMBLY__ + +/* I/O-Interruption Code as stored by TEST PENDING INTERRUPTION (TPI). 
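The tpi_info/tpi_adapter_info definitions that follow are two overlays of the same 12-byte interruption code; adapter_IO sits at the same bit position in both, so it can be tested before reinterpreting the data. A small illustrative helper (the name is hypothetical):

static inline struct tpi_adapter_info *tpi_adapter_view(struct tpi_info *info)
{
	/* adapter_IO is the first bit of the third word in both layouts */
	return info->adapter_IO ? (struct tpi_adapter_info *)info : NULL;
}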
*/ +struct tpi_info { + struct subchannel_id schid; + u32 intparm; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :12; + u32 type:3; + u32 :12; +} __packed __aligned(4); + +/* I/O-Interruption Code as stored by TPI for an Adapter I/O */ +struct tpi_adapter_info { + u32 aism:8; + u32 :22; + u32 error:1; + u32 forward:1; + u32 reserved; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :27; +} __packed __aligned(4); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_TPI_H */ diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h new file mode 100644 index 000000000000..0b5d550a0478 --- /dev/null +++ b/arch/s390/include/asm/types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _ASM_S390_TYPES_H +#define _ASM_S390_TYPES_H + +#include <uapi/asm/types.h> + +#ifndef __ASSEMBLY__ + +union register_pair { + unsigned __int128 pair; + struct { + unsigned long even; + unsigned long odd; + }; +}; + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_TYPES_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index a470f1fa9f2a..81ae8a98e7ec 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -3,7 +3,7 @@ * S390 version * Copyright IBM Corp. 1999, 2000 * Author(s): Hartmut Penner (hp@de.ibm.com), - * Martin Schwidefsky (schwidefsky@de.ibm.com) + * Martin Schwidefsky (schwidefsky@de.ibm.com) * * Derived from "include/asm-i386/uaccess.h" */ @@ -13,41 +13,13 @@ /* * User space memory access functions */ +#include <asm/asm-extable.h> #include <asm/processor.h> -#include <asm/ctl_reg.h> #include <asm/extable.h> #include <asm/facility.h> +#include <asm-generic/access_ok.h> -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. 
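union register_pair above targets instructions that operate on an even/odd register pair: binding the __int128 member to a "d" constraint makes the compiler allocate such a pair. A purely illustrative MVCL-based copy, assuming lengths below 2^24 (MVCL's 24-bit length field) and a zero padding byte:

static inline void mvcl_copy(void *dest, const void *src, unsigned long n)
{
	union register_pair dst = { .even = (unsigned long)dest, .odd = n };
	union register_pair s = { .even = (unsigned long)src, .odd = n };

	asm volatile(
		"0:	mvcl	%[dst],%[src]\n"
		"	jo	0b\n"	/* cc 3: interrupted, resume */
		: [dst] "+&d" (dst.pair), [src] "+&d" (s.pair)
		:
		: "cc", "memory");
}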
- */ - -#define KERNEL_DS (0) -#define KERNEL_DS_SACF (1) -#define USER_DS (2) -#define USER_DS_SACF (3) - -#define get_fs() (current->thread.mm_segment) -#define segment_eq(a,b) (((a) & 2) == ((b) & 2)) - -void set_fs(mm_segment_t fs); - -static inline int __range_ok(unsigned long addr, unsigned long size) -{ - return 1; -} - -#define __access_ok(addr, size) \ -({ \ - __chk_user_ptr(addr); \ - __range_ok((unsigned long)(addr), (size)); \ -}) - -#define access_ok(addr, size) __access_ok(addr, size) +void debug_user_asce(int exit); unsigned long __must_check raw_copy_from_user(void *to, const void __user *from, unsigned long n); @@ -60,209 +32,246 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); #define INLINE_COPY_TO_USER #endif -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - -#define __put_get_user_asm(to, from, size, spec) \ -({ \ - register unsigned long __reg0 asm("0") = spec; \ - int __rc; \ - \ - asm volatile( \ - "0: mvcos %1,%3,%2\n" \ - "1: xr %0,%0\n" \ - "2:\n" \ - ".pushsection .fixup, \"ax\"\n" \ - "3: lhi %0,%5\n" \ - " jg 2b\n" \ - ".popsection\n" \ - EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ - : "=d" (__rc), "+Q" (*(to)) \ - : "d" (size), "Q" (*(from)), \ - "d" (__reg0), "K" (-EFAULT) \ - : "cc"); \ - __rc; \ +unsigned long __must_check +_copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key) +{ + if (check_copy_size(to, n, false)) + n = _copy_from_user_key(to, from, n, key); + return n; +} + +unsigned long __must_check +_copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key) +{ + if (check_copy_size(from, n, true)) + n = _copy_to_user_key(to, from, n, key); + return n; +} + +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + +int __noreturn __put_user_bad(void); + +#define __put_user_asm(to, from, size) \ +({ \ + union oac __oac_spec = { \ + .oac1.as = PSW_BITS_AS_SECONDARY, \ + .oac1.a = 1, \ + }; \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos %[_to],%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ + EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ + : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [spec] "d" (__oac_spec.val) \ + : "cc", "0"); \ + __rc; \ }) static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { - unsigned long spec = 0x010000UL; int rc; switch (size) { case 1: - rc = __put_get_user_asm((unsigned char __user *)ptr, - (unsigned char *)x, - size, spec); + rc = __put_user_asm((unsigned char __user *)ptr, + (unsigned char *)x, + size); break; case 2: - rc = __put_get_user_asm((unsigned short __user *)ptr, - (unsigned short *)x, - size, spec); + rc = __put_user_asm((unsigned short __user *)ptr, + (unsigned short *)x, + size); break; case 4: - rc = __put_get_user_asm((unsigned int __user *)ptr, - (unsigned int *)x, - size, spec); + rc = 
__put_user_asm((unsigned int __user *)ptr, + (unsigned int *)x, + size); break; case 8: - rc = __put_get_user_asm((unsigned long __user *)ptr, - (unsigned long *)x, - size, spec); + rc = __put_user_asm((unsigned long __user *)ptr, + (unsigned long *)x, + size); + break; + default: + __put_user_bad(); break; } return rc; } +int __noreturn __get_user_bad(void); + +#define __get_user_asm(to, from, size) \ +({ \ + union oac __oac_spec = { \ + .oac2.as = PSW_BITS_AS_SECONDARY, \ + .oac2.a = 1, \ + }; \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos 0(%[_to]),%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \ + EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \ + : [rc] "=&d" (__rc), "=Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [spec] "d" (__oac_spec.val), [_to] "a" (to), \ + [_ksize] "K" (size) \ + : "cc", "0"); \ + __rc; \ +}) + static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) { - unsigned long spec = 0x01UL; int rc; switch (size) { case 1: - rc = __put_get_user_asm((unsigned char *)x, - (unsigned char __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned char *)x, + (unsigned char __user *)ptr, + size); break; case 2: - rc = __put_get_user_asm((unsigned short *)x, - (unsigned short __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned short *)x, + (unsigned short __user *)ptr, + size); break; case 4: - rc = __put_get_user_asm((unsigned int *)x, - (unsigned int __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned int *)x, + (unsigned int __user *)ptr, + size); break; case 8: - rc = __put_get_user_asm((unsigned long *)x, - (unsigned long __user *)ptr, - size, spec); + rc = __get_user_asm((unsigned long *)x, + (unsigned long __user *)ptr, + size); + break; + default: + __get_user_bad(); break; } return rc; } -#else /* CONFIG_HAVE_MARCH_Z10_FEATURES */ - -static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) -{ - size = raw_copy_to_user(ptr, x, size); - return size ? -EFAULT : 0; -} - -static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) -{ - size = raw_copy_from_user(x, ptr, size); - return size ? -EFAULT : 0; -} - -#endif /* CONFIG_HAVE_MARCH_Z10_FEATURES */ - /* * These are the main single-value transfer routines. They automatically * use the right size if we just have the right pointer type. 
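For reference, the intended use of the single-value transfer macros reworked below follows the generic kernel pattern:

static long increment_user_int(int __user *uptr)
{
	int val;

	if (get_user(val, uptr))	/* 0 on success, -EFAULT on fault */
		return -EFAULT;
	if (put_user(val + 1, uptr))
		return -EFAULT;
	return 0;
}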
*/ -#define __put_user(x, ptr) \ -({ \ - __typeof__(*(ptr)) __x = (x); \ - int __pu_err = -EFAULT; \ - __chk_user_ptr(ptr); \ - switch (sizeof (*(ptr))) { \ - case 1: \ - case 2: \ - case 4: \ - case 8: \ - __pu_err = __put_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - break; \ - default: \ - __put_user_bad(); \ - break; \ - } \ - __builtin_expect(__pu_err, 0); \ +#define __put_user(x, ptr) \ +({ \ + __typeof__(*(ptr)) __x = (x); \ + int __pu_err = -EFAULT; \ + \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: \ + case 2: \ + case 4: \ + case 8: \ + __pu_err = __put_user_fn(&__x, ptr, sizeof(*(ptr))); \ + break; \ + default: \ + __put_user_bad(); \ + break; \ + } \ + __builtin_expect(__pu_err, 0); \ }) -#define put_user(x, ptr) \ -({ \ - might_fault(); \ - __put_user(x, ptr); \ +#define put_user(x, ptr) \ +({ \ + might_fault(); \ + __put_user(x, ptr); \ }) - -int __put_user_bad(void) __attribute__((noreturn)); - -#define __get_user(x, ptr) \ -({ \ - int __gu_err = -EFAULT; \ - __chk_user_ptr(ptr); \ - switch (sizeof(*(ptr))) { \ - case 1: { \ - unsigned char __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - case 2: { \ - unsigned short __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - case 4: { \ - unsigned int __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - case 8: { \ - unsigned long long __x = 0; \ - __gu_err = __get_user_fn(&__x, ptr, \ - sizeof(*(ptr))); \ - (x) = *(__force __typeof__(*(ptr)) *) &__x; \ - break; \ - }; \ - default: \ - __get_user_bad(); \ - break; \ - } \ - __builtin_expect(__gu_err, 0); \ +#define __get_user(x, ptr) \ +({ \ + int __gu_err = -EFAULT; \ + \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: { \ + unsigned char __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 2: { \ + unsigned short __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 4: { \ + unsigned int __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 8: { \ + unsigned long __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + default: \ + __get_user_bad(); \ + break; \ + } \ + __builtin_expect(__gu_err, 0); \ }) -#define get_user(x, ptr) \ -({ \ - might_fault(); \ - __get_user(x, ptr); \ +#define get_user(x, ptr) \ +({ \ + might_fault(); \ + __get_user(x, ptr); \ }) -int __get_user_bad(void) __attribute__((noreturn)); - -unsigned long __must_check -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); - /* * Copy a null terminated string from userspace. 
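The strncpy_from_user() declaration that follows keeps the generic kernel contract: it returns the string length excluding the NUL, the full count when the buffer was too small (in which case dst is not NUL-terminated), or -EFAULT. A typical caller, for illustration:

static long copy_name_sketch(char *dst, long dst_size, const char __user *src)
{
	long len = strncpy_from_user(dst, src, dst_size);

	if (len < 0)
		return len;			/* -EFAULT */
	if (len == dst_size)
		return -ENAMETOOLONG;		/* truncated */
	return len;
}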
*/ +long __must_check strncpy_from_user(char *dst, const char __user *src, long count); -long __strncpy_from_user(char *dst, const char __user *src, long count); - -static inline long __must_check -strncpy_from_user(char *dst, const char __user *src, long count) -{ - might_fault(); - return __strncpy_from_user(dst, src, count); -} - -unsigned long __must_check __strnlen_user(const char __user *src, unsigned long count); - -static inline unsigned long strnlen_user(const char __user *src, unsigned long n) -{ - might_fault(); - return __strnlen_user(src, n); -} +long __must_check strnlen_user(const char __user *src, long count); /* * Zero Userspace @@ -275,7 +284,317 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo return __clear_user(to, n); } -int copy_to_user_real(void __user *dest, void *src, unsigned long count); -void s390_kernel_write(void *dst, const void *src, size_t size); +void *s390_kernel_write(void *dst, const void *src, size_t size); + +int __noreturn __put_kernel_bad(void); + +#define __put_kernel_asm(val, to, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %[_val],%[_to]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ + EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ + : [rc] "=d" (__rc), [_to] "+Q" (*(to)) \ + : [_val] "d" (val) \ + : "cc"); \ + __rc; \ +}) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + unsigned long __x = (unsigned long)(*((type *)(src))); \ + int __pk_err; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \ + break; \ + case 2: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \ + break; \ + case 4: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \ + break; \ + case 8: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \ + break; \ + default: \ + __pk_err = __put_kernel_bad(); \ + break; \ + } \ + if (unlikely(__pk_err)) \ + goto err_label; \ +} while (0) + +int __noreturn __get_kernel_bad(void); + +#define __get_kernel_asm(val, from, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %[_val],%[_from]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_LOAD_REG(0b, 2b, %[rc], %[_val]) \ + EX_TABLE_UA_LOAD_REG(1b, 2b, %[rc], %[_val]) \ + : [rc] "=d" (__rc), [_val] "=d" (val) \ + : [_from] "Q" (*(from)) \ + : "cc"); \ + __rc; \ +}) + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __gk_err; \ + \ + switch (sizeof(type)) { \ + case 1: { \ + unsigned char __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 2: { \ + unsigned short __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 4: { \ + unsigned int __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 8: { \ + unsigned long __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + default: \ + __gk_err = __get_kernel_bad(); \ + break; \ + } \ + if (unlikely(__gk_err)) \ + goto err_label; \ +} while (0) + +void __cmpxchg_user_key_called_with_bad_pointer(void); + +#define CMPXCHG_USER_KEY_MAX_LOOPS 128 + +static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval, + __uint128_t old, __uint128_t new, + unsigned long key, int size) +{ + int rc = 0; + + switch (size) { + case 1: { + 
unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + _old = ((unsigned int)old & 0xff) << shift; + _new = ((unsigned int)new & 0xff) << shift; + mask = ~(0xff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned char *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 2: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + _old = ((unsigned int)old & 0xffff) << shift; + _new = ((unsigned int)new & 0xffff) << shift; + mask = ~(0xffff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned short *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 4: { + unsigned int prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cs %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" ((unsigned int)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned int *)uval = prev; + return rc; + } + case 8: { + unsigned long prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: csg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" ((unsigned long)new), + [key] "a" 
(key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned long *)uval = prev; + return rc; + } + case 16: { + __uint128_t prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cdsg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(__int128_t *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(__uint128_t *)uval = prev; + return rc; + } + } + __cmpxchg_user_key_called_with_bad_pointer(); + return rc; +} + +/** + * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys + * @ptr: User space address of value to compare to @old and exchange with + * @new. Must be aligned to sizeof(*@ptr). + * @uval: Address where the old value of *@ptr is written to. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written to *@uval. + * @new: New value to place at *@ptr. + * @key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on a user space target, honoring storage key protection. + * @key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * The caller must compare *@uval and @old to determine if values have been + * exchanged. In case of an exception *@uval is set to zero. + * + * Return: 0: cmpxchg executed + * -EFAULT: an exception happened when trying to access *@ptr + * -EAGAIN: maxed out number of retries (byte and short only) + */ +#define cmpxchg_user_key(ptr, uval, old, new, key) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(uval) __uval = (uval); \ + \ + BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ + might_fault(); \ + __chk_user_ptr(__ptr); \ + __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ + (old), (new), (key), sizeof(*(__ptr))); \ +}) #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 9e9f75ef046a..4260bc5ce7f8 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -28,6 +28,7 @@ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK # ifdef CONFIG_COMPAT +# define __ARCH_WANT_COMPAT_STAT # define __ARCH_WANT_SYS_TIME32 # define __ARCH_WANT_SYS_UTIME32 # endif diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h index de9006b0cfeb..b8ecf04e3468 100644 --- a/arch/s390/include/asm/unwind.h +++ b/arch/s390/include/asm/unwind.h @@ -4,6 +4,8 @@ #include <linux/sched.h> #include <linux/ftrace.h> +#include <linux/rethook.h> +#include <linux/llist.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> @@ -36,10 +38,23 @@ struct unwind_state { struct pt_regs *regs; unsigned long sp, ip; int graph_idx; + struct llist_node *kr_cur; bool reliable; bool error; }; +/* Recover the return address modified by rethook and ftrace_graph. 
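+ * When the function graph tracer or a rethook (kretprobe) is armed, the return address on the stack is replaced by a trampoline; without this helper an unwinder would report return_to_handler or the rethook trampoline instead of the real caller.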
*/ +static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, + unsigned long ip) +{ + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp); +#ifdef CONFIG_RETHOOK + if (is_rethook_trampoline(ip)) + ip = rethook_find_ret_addr(state->task, state->sp, &state->kr_cur); +#endif + return ip; +} + void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long first_frame); bool unwind_next_frame(struct unwind_state *state); @@ -55,10 +70,10 @@ static inline bool unwind_error(struct unwind_state *state) return state->error; } -static inline void unwind_start(struct unwind_state *state, - struct task_struct *task, - struct pt_regs *regs, - unsigned long first_frame) +static __always_inline void unwind_start(struct unwind_state *state, + struct task_struct *task, + struct pt_regs *regs, + unsigned long first_frame) { task = task ?: current; first_frame = first_frame ?: get_stack_pointer(task, regs); diff --git a/arch/s390/include/asm/user.h b/arch/s390/include/asm/user.h index 0ca572ced21b..8e8aaf48582e 100644 --- a/arch/s390/include/asm/user.h +++ b/arch/s390/include/asm/user.h @@ -67,9 +67,5 @@ struct user { unsigned long magic; /* To uniquely identify a core file */ char u_comm[32]; /* User command that was responsible */ }; -#define NBPG PAGE_SIZE -#define UPAGES 1 -#define HOST_TEXT_START_ADDR (u.start_code) -#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG) #endif /* _S390_USER_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index ef3c00b049ab..0e7bd3873907 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -2,7 +2,7 @@ /* * Ultravisor Interfaces * - * Copyright IBM Corp. 2019 + * Copyright IBM Corp. 
2019, 2022 * * Author(s): * Vasily Gorbik <gor@linux.ibm.com> @@ -14,23 +14,93 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/bug.h> +#include <linux/sched.h> #include <asm/page.h> +#include <asm/gmap.h> + +#define UVC_CC_OK 0 +#define UVC_CC_ERROR 1 +#define UVC_CC_BUSY 2 +#define UVC_CC_PARTIAL 3 #define UVC_RC_EXECUTED 0x0001 #define UVC_RC_INV_CMD 0x0002 #define UVC_RC_INV_STATE 0x0003 #define UVC_RC_INV_LEN 0x0005 #define UVC_RC_NO_RESUME 0x0007 +#define UVC_RC_NEED_DESTROY 0x8000 #define UVC_CMD_QUI 0x0001 +#define UVC_CMD_INIT_UV 0x000f +#define UVC_CMD_CREATE_SEC_CONF 0x0100 +#define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102 +#define UVC_CMD_CREATE_SEC_CPU 0x0120 +#define UVC_CMD_DESTROY_SEC_CPU 0x0121 +#define UVC_CMD_CONV_TO_SEC_STOR 0x0200 +#define UVC_CMD_CONV_FROM_SEC_STOR 0x0201 +#define UVC_CMD_DESTR_SEC_STOR 0x0202 +#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 +#define UVC_CMD_UNPACK_IMG 0x0301 +#define UVC_CMD_VERIFY_IMG 0x0302 +#define UVC_CMD_CPU_RESET 0x0310 +#define UVC_CMD_CPU_RESET_INITIAL 0x0311 +#define UVC_CMD_PREPARE_RESET 0x0320 +#define UVC_CMD_CPU_RESET_CLEAR 0x0321 +#define UVC_CMD_CPU_SET_STATE 0x0330 +#define UVC_CMD_SET_UNSHARE_ALL 0x0340 +#define UVC_CMD_PIN_PAGE_SHARED 0x0341 +#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 +#define UVC_CMD_DUMP_INIT 0x0400 +#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401 +#define UVC_CMD_DUMP_CPU 0x0402 +#define UVC_CMD_DUMP_COMPLETE 0x0403 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 +#define UVC_CMD_RETR_ATTEST 0x1020 +#define UVC_CMD_ADD_SECRET 0x1031 +#define UVC_CMD_LIST_SECRETS 0x1033 +#define UVC_CMD_LOCK_SECRETS 0x1034 /* Bits in installed uv calls */ enum uv_cmds_inst { BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_INIT_UV = 1, + BIT_UVC_CMD_CREATE_SEC_CONF = 2, + BIT_UVC_CMD_DESTROY_SEC_CONF = 3, + BIT_UVC_CMD_CREATE_SEC_CPU = 4, + BIT_UVC_CMD_DESTROY_SEC_CPU = 5, + BIT_UVC_CMD_CONV_TO_SEC_STOR = 6, + BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7, BIT_UVC_CMD_SET_SHARED_ACCESS = 8, BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, + BIT_UVC_CMD_SET_SEC_PARMS = 11, + BIT_UVC_CMD_UNPACK_IMG = 13, + BIT_UVC_CMD_VERIFY_IMG = 14, + BIT_UVC_CMD_CPU_RESET = 15, + BIT_UVC_CMD_CPU_RESET_INITIAL = 16, + BIT_UVC_CMD_CPU_SET_STATE = 17, + BIT_UVC_CMD_PREPARE_RESET = 18, + BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19, + BIT_UVC_CMD_UNSHARE_ALL = 20, + BIT_UVC_CMD_PIN_PAGE_SHARED = 21, + BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23, + BIT_UVC_CMD_DUMP_INIT = 24, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25, + BIT_UVC_CMD_DUMP_CPU = 26, + BIT_UVC_CMD_DUMP_COMPLETE = 27, + BIT_UVC_CMD_RETR_ATTEST = 28, + BIT_UVC_CMD_ADD_SECRET = 29, + BIT_UVC_CMD_LIST_SECRETS = 30, + BIT_UVC_CMD_LOCK_SECRETS = 31, +}; + +enum uv_feat_ind { + BIT_UV_FEAT_MISC = 0, + BIT_UV_FEAT_AIV = 1, + BIT_UV_FEAT_AP = 4, + BIT_UV_FEAT_AP_INTR = 5, }; struct uv_cb_header { @@ -40,13 +110,158 @@ struct uv_cb_header { u16 rrc; /* Return Reason Code */ } __packed __aligned(8); +/* Query Ultravisor Information */ struct uv_cb_qui { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08; /* 0x0008 */ + u64 inst_calls_list[4]; /* 0x0010 */ + u64 reserved30[2]; /* 0x0030 */ + u64 uv_base_stor_len; /* 0x0040 */ + u64 reserved48; /* 0x0048 */ + u64 conf_base_phys_stor_len; /* 0x0050 */ + u64 conf_base_virt_stor_len; /* 0x0058 */ + u64 conf_virt_var_stor_len; /* 0x0060 */ + u64 cpu_stor_len; /* 0x0068 */ + u32 reserved70[3]; /* 0x0070 */ + u32 max_num_sec_conf; /* 0x007c */ + u64 
max_guest_stor_addr; /* 0x0080 */ + u8 reserved88[0x9e - 0x88]; /* 0x0088 */ + u16 max_guest_cpu_id; /* 0x009e */ + u64 uv_feature_indications; /* 0x00a0 */ + u64 reserveda8; /* 0x00a8 */ + u64 supp_se_hdr_versions; /* 0x00b0 */ + u64 supp_se_hdr_pcf; /* 0x00b8 */ + u64 reservedc0; /* 0x00c0 */ + u64 conf_dump_storage_state_len; /* 0x00c8 */ + u64 conf_dump_finalize_len; /* 0x00d0 */ + u64 reservedd8; /* 0x00d8 */ + u64 supp_att_req_hdr_ver; /* 0x00e0 */ + u64 supp_att_pflags; /* 0x00e8 */ + u64 reservedf0; /* 0x00f0 */ + u64 supp_add_secret_req_ver; /* 0x00f8 */ + u64 supp_add_secret_pcf; /* 0x0100 */ + u64 supp_secret_types; /* 0x0108 */ + u16 max_secrets; /* 0x0110 */ + u8 reserved112[0x120 - 0x112]; /* 0x0112 */ +} __packed __aligned(8); + +/* Initialize Ultravisor */ +struct uv_cb_init { + struct uv_cb_header header; + u64 reserved08[2]; + u64 stor_origin; + u64 stor_len; + u64 reserved28[4]; +} __packed __aligned(8); + +/* Create Guest Configuration */ +struct uv_cb_cgc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 conf_base_stor_origin; + u64 conf_virt_stor_origin; + u8 reserved30[6]; + union { + struct { + u16 : 14; + u16 ap_instr_intr : 1; + u16 ap_allow_instr : 1; + }; + u16 raw; + } flags; + u64 guest_stor_origin; + u64 guest_stor_len; + u64 guest_sca; + u64 guest_asce; + u64 reserved58[5]; +} __packed __aligned(8); + +/* Create Secure CPU */ +struct uv_cb_csc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 guest_handle; + u64 stor_origin; + u8 reserved30[6]; + u16 num; + u64 state_origin; + u64 reserved40[4]; +} __packed __aligned(8); + +/* Convert to Secure */ +struct uv_cb_cts { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; +} __packed __aligned(8); + +/* Convert from Secure / Pin Page Shared */ +struct uv_cb_cfs { + struct uv_cb_header header; + u64 reserved08[2]; + u64 paddr; +} __packed __aligned(8); + +/* Set Secure Config Parameter */ +struct uv_cb_ssc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 sec_header_origin; + u32 sec_header_len; + u32 reserved2c; + u64 reserved30[4]; +} __packed __aligned(8); + +/* Unpack */ +struct uv_cb_unp { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; + u64 tweak[2]; + u64 reserved38[3]; +} __packed __aligned(8); + +#define PV_CPU_STATE_OPR 1 +#define PV_CPU_STATE_STP 2 +#define PV_CPU_STATE_CHKSTP 3 +#define PV_CPU_STATE_OPR_LOAD 5 + +struct uv_cb_cpu_set_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u8 reserved20[7]; + u8 state; + u64 reserved28[5]; +}; + +/* + * A common UV call struct for calls that take no payload + * Examples: + * Destroy cpu/config + * Verify + */ +struct uv_cb_nodata { struct uv_cb_header header; - u64 reserved08; - u64 inst_calls_list[4]; - u64 reserved30[15]; + u64 reserved08[2]; + u64 handle; + u64 reserved20[4]; } __packed __aligned(8); +/* Destroy Configuration Fast */ +struct uv_cb_destroy_fast { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[5]; +} __packed __aligned(8); + +/* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; u64 reserved08[3]; @@ -54,21 +269,151 @@ struct uv_cb_share { u64 reserved28; } __packed __aligned(8); -static inline int uv_call(unsigned long r1, unsigned long r2) +/* Retrieve Attestation Measurement */ +struct uv_cb_attest { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08[2]; /* 0x0008 */ + u64 arcb_addr; /* 0x0018 */
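+ /* arcb_addr: address of the attestation request control block (ARCB) */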
+ u64 cont_token; /* 0x0020 */ + u8 reserved28[6]; /* 0x0028 */ + u16 user_data_len; /* 0x002e */ + u8 user_data[256]; /* 0x0030 */ + u32 reserved130[3]; /* 0x0130 */ + u32 meas_len; /* 0x013c */ + u64 meas_addr; /* 0x0140 */ + u8 config_uid[16]; /* 0x0148 */ + u32 reserved158; /* 0x0158 */ + u32 add_data_len; /* 0x015c */ + u64 add_data_addr; /* 0x0160 */ + u64 reserved168[4]; /* 0x0168 */ +} __packed __aligned(8); + +struct uv_cb_dump_cpu { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 dump_area_origin; + u64 reserved28[5]; +} __packed __aligned(8); + +struct uv_cb_dump_stor_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 gaddr; + u64 reserved28[4]; +} __packed __aligned(8); + +struct uv_cb_dump_complete { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 reserved30[5]; +} __packed __aligned(8); + +/* + * A common UV call struct for pv guests that contains a single address + * Examples: + * Add Secret + * List Secrets + */ +struct uv_cb_guest_addr { + struct uv_cb_header header; + u64 reserved08[3]; + u64 addr; + u64 reserved28[4]; +} __packed __aligned(8); + +static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; asm volatile( - "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" - " brc 3,0b\n" - " ipm %[cc]\n" - " srl %[cc],28\n" + " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" : [cc] "=d" (cc) : [r1] "a" (r1), [r2] "a" (r2) : "memory", "cc"); return cc; } +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + } while (cc > 1); + return cc; +} + +/* Low level uv_call that avoids stalls for long running busy conditions */ +static inline int uv_call_sched(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + cond_resched(); + } while (cc > 1); + return cc; +} + +/* + * special variant of uv_call that only transports the cpu or guest + * handle and the command, like destroy or verify. + */ +static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc) +{ + struct uv_cb_nodata uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .handle = handle, + }; + int cc; + + WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd); + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc ? 
-EINVAL : 0; +} + +struct uv_info { + unsigned long inst_calls_list[4]; + unsigned long uv_base_stor_len; + unsigned long guest_base_stor_len; + unsigned long guest_virt_base_stor_len; + unsigned long guest_virt_var_stor_len; + unsigned long guest_cpu_stor_len; + unsigned long max_sec_stor_addr; + unsigned int max_num_sec_conf; + unsigned short max_guest_cpu_id; + unsigned long uv_feature_indications; + unsigned long supp_se_hdr_ver; + unsigned long supp_se_hdr_pcf; + unsigned long conf_dump_storage_state_len; + unsigned long conf_dump_finalize_len; + unsigned long supp_att_req_hdr_ver; + unsigned long supp_att_pflags; + unsigned long supp_add_secret_req_ver; + unsigned long supp_add_secret_pcf; + unsigned long supp_secret_types; + unsigned short max_secrets; +}; + +extern struct uv_info uv_info; + +static inline bool uv_has_feature(u8 feature_bit) +{ + if (feature_bit >= sizeof(uv_info.uv_feature_indications) * 8) + return false; + return test_bit_inv(feature_bit, &uv_info.uv_feature_indications); +} + #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST extern int prot_virt_guest; @@ -86,7 +431,7 @@ static inline int share(unsigned long addr, u16 cmd) }; if (!is_prot_virt_guest()) - return -ENOTSUPP; + return -EOPNOTSUPP; /* * Sharing is page wise, if we encounter addresses that are * not page aligned, we assume something went wrong. If @@ -121,12 +466,52 @@ static inline int uv_remove_shared(unsigned long addr) return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); } -void uv_query_info(void); #else #define is_prot_virt_guest() 0 static inline int uv_set_shared(unsigned long addr) { return 0; } static inline int uv_remove_shared(unsigned long addr) { return 0; } -static inline void uv_query_info(void) {} +#endif + +#if IS_ENABLED(CONFIG_KVM) +extern int prot_virt_host; + +static inline int is_prot_virt_host(void) +{ + return prot_virt_host; +} + +int uv_pin_shared(unsigned long paddr); +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); +int uv_destroy_owned_page(unsigned long paddr); +int uv_convert_from_secure(unsigned long paddr); +int uv_convert_owned_from_secure(unsigned long paddr); +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); + +void setup_uv(void); +#else +#define is_prot_virt_host() 0 +static inline void setup_uv(void) {} + +static inline int uv_pin_shared(unsigned long paddr) +{ + return 0; +} + +static inline int uv_destroy_owned_page(unsigned long paddr) +{ + return 0; +} + +static inline int uv_convert_from_secure(unsigned long paddr) +{ + return 0; +} + +static inline int uv_convert_owned_from_secure(unsigned long paddr) +{ + return 0; +} #endif #endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 3bcfdeb01395..53165aa7813a 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -2,64 +2,33 @@ #ifndef __S390_VDSO_H__ #define __S390_VDSO_H__ -/* Default link addresses for the vDSOs */ -#define VDSO32_LBASE 0 -#define VDSO64_LBASE 0 - -#define VDSO_VERSION_STRING LINUX_2.6.29 +#include <vdso/datapage.h> #ifndef __ASSEMBLY__ -/* - * Note about the vdso_data and vdso_per_cpu_data structures: - * - * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the - * structure is supposed to be known only to the function in the vdso - * itself and may change without notice. 
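Stepping back to uv.h: a sketch of how a protected virtualization guest might use the sharing interface. The function name is invented; uv_set_shared() and the page-granularity rule are from the header above.

	/* Illustrative only, not part of the patch. */
	static int example_share_page(unsigned long addr)
	{
		if (!IS_ALIGNED(addr, PAGE_SIZE))
			return -EINVAL;		/* sharing is page granular */
		if (!is_prot_virt_guest())
			return 0;		/* no ultravisor, nothing to share */
		return uv_set_shared(addr);	/* issues UVC_CMD_SET_SHARED_ACCESS */
	}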
- */ +#include <generated/vdso64-offsets.h> +#ifdef CONFIG_COMPAT +#include <generated/vdso32-offsets.h> +#endif -struct vdso_data { - __u64 tb_update_count; /* Timebase atomicity ctr 0x00 */ - __u64 xtime_tod_stamp; /* TOD clock for xtime 0x08 */ - __u64 xtime_clock_sec; /* Kernel time 0x10 */ - __u64 xtime_clock_nsec; /* 0x18 */ - __u64 xtime_coarse_sec; /* Coarse kernel time 0x20 */ - __u64 xtime_coarse_nsec; /* 0x28 */ - __u64 wtom_clock_sec; /* Wall to monotonic clock 0x30 */ - __u64 wtom_clock_nsec; /* 0x38 */ - __u64 wtom_coarse_sec; /* Coarse wall to monotonic 0x40 */ - __u64 wtom_coarse_nsec; /* 0x48 */ - __u32 tz_minuteswest; /* Minutes west of Greenwich 0x50 */ - __u32 tz_dsttime; /* Type of dst correction 0x54 */ - __u32 ectg_available; /* ECTG instruction present 0x58 */ - __u32 tk_mult; /* Mult. used for xtime_nsec 0x5c */ - __u32 tk_shift; /* Shift used for xtime_nsec 0x60 */ - __u32 ts_dir; /* TOD steering direction 0x64 */ - __u64 ts_end; /* TOD steering end 0x68 */ -}; - -struct vdso_per_cpu_data { - __u64 ectg_timer_base; - __u64 ectg_user_time; - /* - * Note: node_id and cpu_nr must be at adjacent memory locations. - * VDSO userspace must read both values with a single instruction. - */ - union { - __u64 getcpu_val; - struct { - __u32 node_id; - __u32 cpu_nr; - }; - }; -}; +#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) +#ifdef CONFIG_COMPAT +#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) +#else +#define VDSO32_SYMBOL(tsk, name) (-1UL) +#endif extern struct vdso_data *vdso_data; -extern struct vdso_data boot_vdso_data; -void vdso_alloc_boot_cpu(struct lowcore *lowcore); -int vdso_alloc_per_cpu(struct lowcore *lowcore); -void vdso_free_per_cpu(struct lowcore *lowcore); +int vdso_getcpu_init(void); #endif /* __ASSEMBLY__ */ + +/* Default link address for the vDSO */ +#define VDSO_LBASE 0 + +#define __VVAR_PAGES 2 + +#define VDSO_VERSION_STRING LINUX_2.6.29 + #endif /* __S390_VDSO_H__ */ diff --git a/arch/s390/include/asm/vdso/clocksource.h b/arch/s390/include/asm/vdso/clocksource.h new file mode 100644 index 000000000000..a93eda0ce7bb --- /dev/null +++ b/arch/s390/include/asm/vdso/clocksource.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_CLOCKSOURCE_H +#define __ASM_VDSO_CLOCKSOURCE_H + +#define VDSO_ARCH_CLOCKMODES \ + VDSO_CLOCKMODE_TOD + +#endif /* __ASM_VDSO_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h new file mode 100644 index 000000000000..73ee89142666 --- /dev/null +++ b/arch/s390/include/asm/vdso/data.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_ASM_VDSO_DATA_H +#define __S390_ASM_VDSO_DATA_H + +#include <linux/types.h> +#include <vdso/datapage.h> + +struct arch_vdso_data { + __s64 tod_steering_delta; + __u64 tod_steering_end; +}; + +#endif /* __S390_ASM_VDSO_DATA_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h new file mode 100644 index 000000000000..db84942eb78f --- /dev/null +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_VDSO_GETTIMEOFDAY_H +#define ASM_VDSO_GETTIMEOFDAY_H + +#define VDSO_HAS_TIME 1 + +#define VDSO_HAS_CLOCK_GETRES 1 + +#include <asm/syscall.h> +#include <asm/timex.h> +#include <asm/unistd.h> +#include <linux/compiler.h> + +#define vdso_calc_delta __arch_vdso_calc_delta +static __always_inline u64 
__arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +{ + return (cycles - last) * mult; +} + +static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +{ + return _vdso_data; +} + +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) +{ + u64 adj, now; + + now = get_tod_clock(); + adj = vd->arch_data.tod_steering_end - now; + if (unlikely((s64) adj > 0)) + now += (vd->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + return now; +} + +static __always_inline +long clock_gettime_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + return syscall2(__NR_clock_gettime, (long)clkid, (long)ts); +} + +static __always_inline +long gettimeofday_fallback(register struct __kernel_old_timeval *tv, + register struct timezone *tz) +{ + return syscall2(__NR_gettimeofday, (long)tv, (long)tz); +} + +static __always_inline +long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + return syscall2(__NR_clock_getres, (long)clkid, (long)ts); +} + +#ifdef CONFIG_TIME_NS +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) +{ + return _timens_data; +} +#endif + +#endif diff --git a/arch/s390/include/asm/vdso/processor.h b/arch/s390/include/asm/vdso/processor.h new file mode 100644 index 000000000000..cfcc3e117c4c --- /dev/null +++ b/arch/s390/include/asm/vdso/processor.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_VDSO_PROCESSOR_H +#define __ASM_VDSO_PROCESSOR_H + +#define cpu_relax() barrier() + +#endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h new file mode 100644 index 000000000000..6c67c08cefdd --- /dev/null +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_VSYSCALL_H +#define __ASM_VDSO_VSYSCALL_H + +#ifndef __ASSEMBLY__ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <asm/vdso.h> +/* + * Update the vDSO data page to keep in sync with kernel timekeeping. 
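Back in gettimeofday.h, the TOD steering in __arch_get_hw_counter() is dense; the same arithmetic restated in plain C, illustrative only and with invented names:

	/* Editorial restatement of the steering applied above; not in the patch. */
	static inline u64 example_steer_tod(u64 now, u64 steering_end, s64 steering_delta)
	{
		s64 remaining = steering_end - now;	/* > 0 while steering is active */

		if (remaining > 0) {
			/* apply 1/2^15 of the remaining interval per read */
			if (steering_delta < 0)
				now += remaining >> 15;
			else
				now -= remaining >> 15;
		}
		return now;
	}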
+ */ + +static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) +{ + return vdso_data; +} +#define __arch_get_k_vdso_data __s390_get_k_vdso_data + +/* The asm-generic header needs to be included after the definitions above */ +#include <asm-generic/vdso/vsyscall.h> + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/s390/include/asm/vga.h b/arch/s390/include/asm/vga.h deleted file mode 100644 index 605dc46bac5e..000000000000 --- a/arch/s390/include/asm/vga.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_VGA_H -#define _ASM_S390_VGA_H - -/* Avoid compile errors due to missing asm/vga.h */ - -#endif /* _ASM_S390_VGA_H */ diff --git a/arch/s390/include/asm/vmalloc.h b/arch/s390/include/asm/vmalloc.h new file mode 100644 index 000000000000..3ba3a6bdca25 --- /dev/null +++ b/arch/s390/include/asm/vmalloc.h @@ -0,0 +1,4 @@ +#ifndef _ASM_S390_VMALLOC_H +#define _ASM_S390_VMALLOC_H + +#endif /* _ASM_S390_VMALLOC_H */ diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index 3622d4ebc73a..fe17e448c0c5 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,7 +2,20 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H -#define __ARCH_HAS_VTIME_ACCOUNT #define __ARCH_HAS_VTIME_TASK_SWITCH +static inline void update_timer_sys(void) +{ + S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; + S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.sys_enter_timer; + S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; +} + +static inline void update_timer_mcck(void) +{ + S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; + S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.mcck_enter_timer; + S390_lowcore.last_update_timer = S390_lowcore.mcck_enter_timer; +} + #endif /* _S390_VTIME_H */ diff --git a/arch/s390/include/asm/vtimer.h b/arch/s390/include/asm/vtimer.h index 42f707d1c1e8..e601adaa6320 100644 --- a/arch/s390/include/asm/vtimer.h +++ b/arch/s390/include/asm/vtimer.h @@ -25,8 +25,6 @@ extern void add_virt_timer_periodic(struct vtimer_list *timer); extern int mod_virt_timer(struct vtimer_list *timer, u64 expires); extern int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires); extern int del_virt_timer(struct vtimer_list *timer); - -extern void init_cpu_vtimer(void); extern void vtime_init(void); #endif /* _ASM_S390_TIMER_H */ diff --git a/arch/s390/include/asm/vx-insn-asm.h b/arch/s390/include/asm/vx-insn-asm.h new file mode 100644 index 000000000000..360f8b36d962 --- /dev/null +++ b/arch/s390/include/asm/vx-insn-asm.h @@ -0,0 +1,681 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for Vector Instructions + * + * Assembler macros to generate .byte/.word code for particular + * vector instructions that are supported by recent binutils (>= 2.26) only. + * + * Copyright IBM Corp. 
2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#ifndef __ASM_S390_VX_INSN_INTERNAL_H +#define __ASM_S390_VX_INSN_INTERNAL_H + +#ifndef __ASM_S390_VX_INSN_H +#error only <asm/vx-insn.h> can be included directly +#endif + +#ifdef __ASSEMBLY__ + +/* Macros to generate vector instruction byte code */ + +/* GR_NUM - Retrieve general-purpose register number + * + * @opd: Operand to store register number + * @gr: String designating the register in the format "%rN" + */ +.macro GR_NUM opd gr + \opd = 255 + .ifc \gr,%r0 + \opd = 0 + .endif + .ifc \gr,%r1 + \opd = 1 + .endif + .ifc \gr,%r2 + \opd = 2 + .endif + .ifc \gr,%r3 + \opd = 3 + .endif + .ifc \gr,%r4 + \opd = 4 + .endif + .ifc \gr,%r5 + \opd = 5 + .endif + .ifc \gr,%r6 + \opd = 6 + .endif + .ifc \gr,%r7 + \opd = 7 + .endif + .ifc \gr,%r8 + \opd = 8 + .endif + .ifc \gr,%r9 + \opd = 9 + .endif + .ifc \gr,%r10 + \opd = 10 + .endif + .ifc \gr,%r11 + \opd = 11 + .endif + .ifc \gr,%r12 + \opd = 12 + .endif + .ifc \gr,%r13 + \opd = 13 + .endif + .ifc \gr,%r14 + \opd = 14 + .endif + .ifc \gr,%r15 + \opd = 15 + .endif + .if \opd == 255 + \opd = \gr + .endif +.endm + +/* VX_NUM - Retrieve vector register number + * + * @opd: Operand to store register number + * @vxr: String designating the register in the format "%vN" + * + * The vector register number is used as input to the + * instruction, as well as to compute the RXB field of the + * instruction. + */ +.macro VX_NUM opd vxr + \opd = 255 + .ifc \vxr,%v0 + \opd = 0 + .endif + .ifc \vxr,%v1 + \opd = 1 + .endif + .ifc \vxr,%v2 + \opd = 2 + .endif + .ifc \vxr,%v3 + \opd = 3 + .endif + .ifc \vxr,%v4 + \opd = 4 + .endif + .ifc \vxr,%v5 + \opd = 5 + .endif + .ifc \vxr,%v6 + \opd = 6 + .endif + .ifc \vxr,%v7 + \opd = 7 + .endif + .ifc \vxr,%v8 + \opd = 8 + .endif + .ifc \vxr,%v9 + \opd = 9 + .endif + .ifc \vxr,%v10 + \opd = 10 + .endif + .ifc \vxr,%v11 + \opd = 11 + .endif + .ifc \vxr,%v12 + \opd = 12 + .endif + .ifc \vxr,%v13 + \opd = 13 + .endif + .ifc \vxr,%v14 + \opd = 14 + .endif + .ifc \vxr,%v15 + \opd = 15 + .endif + .ifc \vxr,%v16 + \opd = 16 + .endif + .ifc \vxr,%v17 + \opd = 17 + .endif + .ifc \vxr,%v18 + \opd = 18 + .endif + .ifc \vxr,%v19 + \opd = 19 + .endif + .ifc \vxr,%v20 + \opd = 20 + .endif + .ifc \vxr,%v21 + \opd = 21 + .endif + .ifc \vxr,%v22 + \opd = 22 + .endif + .ifc \vxr,%v23 + \opd = 23 + .endif + .ifc \vxr,%v24 + \opd = 24 + .endif + .ifc \vxr,%v25 + \opd = 25 + .endif + .ifc \vxr,%v26 + \opd = 26 + .endif + .ifc \vxr,%v27 + \opd = 27 + .endif + .ifc \vxr,%v28 + \opd = 28 + .endif + .ifc \vxr,%v29 + \opd = 29 + .endif + .ifc \vxr,%v30 + \opd = 30 + .endif + .ifc \vxr,%v31 + \opd = 31 + .endif + .if \opd == 255 + \opd = \vxr + .endif +.endm + +/* RXB - Compute RXB value from the most significant bits of the used vector registers + * + * @rxb: Operand to store computed RXB value + * @v1: First vector register designated operand + * @v2: Second vector register designated operand + * @v3: Third vector register designated operand + * @v4: Fourth vector register designated operand + */ +.macro RXB rxb v1 v2=0 v3=0 v4=0 + \rxb = 0 + .if \v1 & 0x10 + \rxb = \rxb | 0x08 + .endif + .if \v2 & 0x10 + \rxb = \rxb | 0x04 + .endif + .if \v3 & 0x10 + \rxb = \rxb | 0x02 + .endif + .if \v4 & 0x10 + \rxb = \rxb | 0x01 + .endif +.endm + +/* MRXB - Generate Element Size Control and RXB value + * + * @m: Element size control + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated
operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXB m v1 v2=0 v3=0 v4=0 + rxb = 0 + RXB rxb, \v1, \v2, \v3, \v4 + .byte (\m << 4) | rxb +.endm + +/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields + * + * @m: Element size control + * @opc: Opcode + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0 + MRXB \m, \v1, \v2, \v3, \v4 + .byte \opc +.endm + +/* Vector support instructions */ + +/* VECTOR GENERATE BYTE MASK */ +.macro VGBM vr imm2 + VX_NUM v1, \vr + .word (0xE700 | ((v1&15) << 4)) + .word \imm2 + MRXBOPC 0, 0x44, v1 +.endm +.macro VZERO vxr + VGBM \vxr, 0 +.endm +.macro VONE vxr + VGBM \vxr, 0xFFFF +.endm + +/* VECTOR LOAD VR ELEMENT FROM GR */ +.macro VLVG v, gr, disp, m + VX_NUM v1, \v + GR_NUM b2, "%r0" + GR_NUM r3, \gr + .word 0xE700 | ((v1&15) << 4) | r3 + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x22, v1 +.endm +.macro VLVGB v, gr, index + VLVG \v, \gr, \index, 0 +.endm +.macro VLVGH v, gr, index + VLVG \v, \gr, \index, 1 +.endm +.macro VLVGF v, gr, index + VLVG \v, \gr, \index, 2 +.endm +.macro VLVGG v, gr, index + VLVG \v, \gr, \index, 3 +.endm + +/* VECTOR LOAD REGISTER */ +.macro VLR v1, v2 + VX_NUM v1, \v1 + VX_NUM v2, \v2 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word 0 + MRXBOPC 0, 0x56, v1, v2 +.endm + +/* VECTOR LOAD */ +.macro VL v, disp, index="%r0", base + VX_NUM v1, \v + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x06, v1 +.endm + +/* VECTOR LOAD ELEMENT */ +.macro VLEx vr1, disp, index="%r0", base, m3, opc + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEB vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x00 +.endm +.macro VLEH vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x01 +.endm +.macro VLEF vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x03 +.endm +.macro VLEG vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x02 +.endm + +/* VECTOR LOAD ELEMENT IMMEDIATE */ +.macro VLEIx vr1, imm2, m3, opc + VX_NUM v1, \vr1 + .word 0xE700 | ((v1&15) << 4) + .word \imm2 + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEIB vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x40 +.endm +.macro VLEIH vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x41 +.endm +.macro VLEIF vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x43 +.endm +.macro VLEIG vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x42 +.endm + +/* VECTOR LOAD GR FROM VR ELEMENT */ +.macro VLGV gr, vr, disp, base="%r0", m + GR_NUM r1, \gr + GR_NUM b2, \base + VX_NUM v3, \vr + .word 0xE700 | (r1 << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x21, v3 +.endm +.macro VLGVB gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 0 +.endm +.macro VLGVH gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 1 +.endm +.macro VLGVF gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 2 +.endm +.macro VLGVG gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 3 +.endm + +/* VECTOR LOAD MULTIPLE */ +.macro VLM vfrom, vto, disp, base, hint=3 + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base + .word 0xE700 | ((v1&15) <<
4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \hint, 0x36, v1, v3 +.endm + +/* VECTOR STORE */ +.macro VST vr1, disp, index="%r0", base + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (x2&15) + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x0E, v1 +.endm + +/* VECTOR STORE MULTIPLE */ +.macro VSTM vfrom, vto, disp, base, hint=3 + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \hint, 0x3E, v1, v3 +.endm + +/* VECTOR PERMUTE */ +.macro VPERM vr1, vr2, vr3, vr4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4 +.endm + +/* VECTOR UNPACK LOGICAL LOW */ +.macro VUPLL vr1, vr2, m3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word 0x0000 + MRXBOPC \m3, 0xD4, v1, v2 +.endm +.macro VUPLLB vr1, vr2 + VUPLL \vr1, \vr2, 0 +.endm +.macro VUPLLH vr1, vr2 + VUPLL \vr1, \vr2, 1 +.endm +.macro VUPLLF vr1, vr2 + VUPLL \vr1, \vr2, 2 +.endm + +/* VECTOR PERMUTE DOUBLEWORD IMMEDIATE */ +.macro VPDI vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x84, v1, v2, v3 +.endm + +/* VECTOR REPLICATE */ +.macro VREP vr1, vr3, imm2, m4 + VX_NUM v1, \vr1 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word \imm2 + MRXBOPC \m4, 0x4D, v1, v3 +.endm +.macro VREPB vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 0 +.endm +.macro VREPH vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 1 +.endm +.macro VREPF vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 2 +.endm +.macro VREPG vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 3 +.endm + +/* VECTOR MERGE HIGH */ +.macro VMRH vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x61, v1, v2, v3 +.endm +.macro VMRHB vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 0 +.endm +.macro VMRHH vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 1 +.endm +.macro VMRHF vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 2 +.endm +.macro VMRHG vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR MERGE LOW */ +.macro VMRL vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x60, v1, v2, v3 +.endm +.macro VMRLB vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 0 +.endm +.macro VMRLH vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 1 +.endm +.macro VMRLF vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 2 +.endm +.macro VMRLG vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 3 +.endm + + +/* Vector integer instructions */ + +/* VECTOR AND */ +.macro VN vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x68, v1, v2, v3 +.endm + +/* VECTOR EXCLUSIVE OR */ +.macro VX vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x6D, v1, v2, v3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM */ +.macro VGFM vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0xB4, v1, v2, v3 +.endm +.macro VGFMB vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 0 +.endm +.macro VGFMH vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 1 +.endm +.macro 
VGFMF vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 2 +.endm +.macro VGFMG vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */ +.macro VGFMA vr1, vr2, vr3, vr4, m5 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) | (\m5 << 8) + MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4 +.endm +.macro VGFMAB vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 0 +.endm +.macro VGFMAH vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 1 +.endm +.macro VGFMAF vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 2 +.endm +.macro VGFMAG vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 3 +.endm + +/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */ +.macro VSRLB vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x7D, v1, v2, v3 +.endm + +/* VECTOR REPLICATE IMMEDIATE */ +.macro VREPI vr1, imm2, m3 + VX_NUM v1, \vr1 + .word 0xE700 | ((v1&15) << 4) + .word \imm2 + MRXBOPC \m3, 0x45, v1 +.endm +.macro VREPIB vr1, imm2 + VREPI \vr1, \imm2, 0 +.endm +.macro VREPIH vr1, imm2 + VREPI \vr1, \imm2, 1 +.endm +.macro VREPIF vr1, imm2 + VREPI \vr1, \imm2, 2 +.endm +.macro VREPIG vr1, imm2 + VREPI \vr1, \imm2, 3 +.endm + +/* VECTOR ADD */ +.macro VA vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0xF3, v1, v2, v3 +.endm +.macro VAB vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 0 +.endm +.macro VAH vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 1 +.endm +.macro VAF vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 2 +.endm +.macro VAG vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 3 +.endm +.macro VAQ vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 4 +.endm + +/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ +.macro VESRAV vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x7A, v1, v2, v3 +.endm + +.macro VESRAVB vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 0 +.endm +.macro VESRAVH vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 1 +.endm +.macro VESRAVF vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 2 +.endm +.macro VESRAVG vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR ELEMENT ROTATE LEFT LOGICAL */ +.macro VERLL vr1, vr3, disp, base="%r0", m4 + VX_NUM v1, \vr1 + VX_NUM v3, \vr3 + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \m4, 0x33, v1, v3 +.endm +.macro VERLLB vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 0 +.endm +.macro VERLLH vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 1 +.endm +.macro VERLLF vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 2 +.endm +.macro VERLLG vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 3 +.endm + +/* VECTOR SHIFT LEFT DOUBLE BY BYTE */ +.macro VSLDB vr1, vr2, vr3, imm4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) | (\imm4) + MRXBOPC 0, 0x77, v1, v2, v3 +.endm + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_VX_INSN_INTERNAL_H */ diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h index 0c05a673811c..8c188f1c6d27 100644 --- a/arch/s390/include/asm/vx-insn.h +++ b/arch/s390/include/asm/vx-insn.h @@ -2,560 +2,18 @@ /* * Support for Vector Instructions * - * Assembler macros to generate .byte/.word code for particular - * vector
instructions that are supported by recent binutils (>= 2.26) only. - * - * Copyright IBM Corp. 2015 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + * This wrapper header file allows to use the vector instruction macros in + * both assembler files as well as in inline assemblies in C files. */ #ifndef __ASM_S390_VX_INSN_H #define __ASM_S390_VX_INSN_H -#ifdef __ASSEMBLY__ - - -/* Macros to generate vector instruction byte code */ - -/* GR_NUM - Retrieve general-purpose register number - * - * @opd: Operand to store register number - * @r64: String designation register in the format "%rN" - */ -.macro GR_NUM opd gr - \opd = 255 - .ifc \gr,%r0 - \opd = 0 - .endif - .ifc \gr,%r1 - \opd = 1 - .endif - .ifc \gr,%r2 - \opd = 2 - .endif - .ifc \gr,%r3 - \opd = 3 - .endif - .ifc \gr,%r4 - \opd = 4 - .endif - .ifc \gr,%r5 - \opd = 5 - .endif - .ifc \gr,%r6 - \opd = 6 - .endif - .ifc \gr,%r7 - \opd = 7 - .endif - .ifc \gr,%r8 - \opd = 8 - .endif - .ifc \gr,%r9 - \opd = 9 - .endif - .ifc \gr,%r10 - \opd = 10 - .endif - .ifc \gr,%r11 - \opd = 11 - .endif - .ifc \gr,%r12 - \opd = 12 - .endif - .ifc \gr,%r13 - \opd = 13 - .endif - .ifc \gr,%r14 - \opd = 14 - .endif - .ifc \gr,%r15 - \opd = 15 - .endif - .if \opd == 255 - \opd = \gr - .endif -.endm - -/* VX_NUM - Retrieve vector register number - * - * @opd: Operand to store register number - * @vxr: String designation register in the format "%vN" - * - * The vector register number is used for as input number to the - * instruction and, as well as, to compute the RXB field of the - * instruction. - */ -.macro VX_NUM opd vxr - \opd = 255 - .ifc \vxr,%v0 - \opd = 0 - .endif - .ifc \vxr,%v1 - \opd = 1 - .endif - .ifc \vxr,%v2 - \opd = 2 - .endif - .ifc \vxr,%v3 - \opd = 3 - .endif - .ifc \vxr,%v4 - \opd = 4 - .endif - .ifc \vxr,%v5 - \opd = 5 - .endif - .ifc \vxr,%v6 - \opd = 6 - .endif - .ifc \vxr,%v7 - \opd = 7 - .endif - .ifc \vxr,%v8 - \opd = 8 - .endif - .ifc \vxr,%v9 - \opd = 9 - .endif - .ifc \vxr,%v10 - \opd = 10 - .endif - .ifc \vxr,%v11 - \opd = 11 - .endif - .ifc \vxr,%v12 - \opd = 12 - .endif - .ifc \vxr,%v13 - \opd = 13 - .endif - .ifc \vxr,%v14 - \opd = 14 - .endif - .ifc \vxr,%v15 - \opd = 15 - .endif - .ifc \vxr,%v16 - \opd = 16 - .endif - .ifc \vxr,%v17 - \opd = 17 - .endif - .ifc \vxr,%v18 - \opd = 18 - .endif - .ifc \vxr,%v19 - \opd = 19 - .endif - .ifc \vxr,%v20 - \opd = 20 - .endif - .ifc \vxr,%v21 - \opd = 21 - .endif - .ifc \vxr,%v22 - \opd = 22 - .endif - .ifc \vxr,%v23 - \opd = 23 - .endif - .ifc \vxr,%v24 - \opd = 24 - .endif - .ifc \vxr,%v25 - \opd = 25 - .endif - .ifc \vxr,%v26 - \opd = 26 - .endif - .ifc \vxr,%v27 - \opd = 27 - .endif - .ifc \vxr,%v28 - \opd = 28 - .endif - .ifc \vxr,%v29 - \opd = 29 - .endif - .ifc \vxr,%v30 - \opd = 30 - .endif - .ifc \vxr,%v31 - \opd = 31 - .endif - .if \opd == 255 - \opd = \vxr - .endif -.endm - -/* RXB - Compute most significant bit used vector registers - * - * @rxb: Operand to store computed RXB value - * @v1: First vector register designated operand - * @v2: Second vector register designated operand - * @v3: Third vector register designated operand - * @v4: Fourth vector register designated operand - */ -.macro RXB rxb v1 v2=0 v3=0 v4=0 - \rxb = 0 - .if \v1 & 0x10 - \rxb = \rxb | 0x08 - .endif - .if \v2 & 0x10 - \rxb = \rxb | 0x04 - .endif - .if \v3 & 0x10 - \rxb = \rxb | 0x02 - .endif - .if \v4 & 0x10 - \rxb = \rxb | 0x01 - .endif -.endm - -/* MRXB - Generate Element Size Control and RXB value - * - * @m: Element size control - * @v1: First vector register designated 
operand (for RXB) - * @v2: Second vector register designated operand (for RXB) - * @v3: Third vector register designated operand (for RXB) - * @v4: Fourth vector register designated operand (for RXB) - */ -.macro MRXB m v1 v2=0 v3=0 v4=0 - rxb = 0 - RXB rxb, \v1, \v2, \v3, \v4 - .byte (\m << 4) | rxb -.endm - -/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields - * - * @m: Element size control - * @opc: Opcode - * @v1: First vector register designated operand (for RXB) - * @v2: Second vector register designated operand (for RXB) - * @v3: Third vector register designated operand (for RXB) - * @v4: Fourth vector register designated operand (for RXB) - */ -.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0 - MRXB \m, \v1, \v2, \v3, \v4 - .byte \opc -.endm - -/* Vector support instructions */ - -/* VECTOR GENERATE BYTE MASK */ -.macro VGBM vr imm2 - VX_NUM v1, \vr - .word (0xE700 | ((v1&15) << 4)) - .word \imm2 - MRXBOPC 0, 0x44, v1 -.endm -.macro VZERO vxr - VGBM \vxr, 0 -.endm -.macro VONE vxr - VGBM \vxr, 0xFFFF -.endm - -/* VECTOR LOAD VR ELEMENT FROM GR */ -.macro VLVG v, gr, disp, m - VX_NUM v1, \v - GR_NUM b2, "%r0" - GR_NUM r3, \gr - .word 0xE700 | ((v1&15) << 4) | r3 - .word (b2 << 12) | (\disp) - MRXBOPC \m, 0x22, v1 -.endm -.macro VLVGB v, gr, index, base - VLVG \v, \gr, \index, \base, 0 -.endm -.macro VLVGH v, gr, index - VLVG \v, \gr, \index, 1 -.endm -.macro VLVGF v, gr, index - VLVG \v, \gr, \index, 2 -.endm -.macro VLVGG v, gr, index - VLVG \v, \gr, \index, 3 -.endm - -/* VECTOR LOAD REGISTER */ -.macro VLR v1, v2 - VX_NUM v1, \v1 - VX_NUM v2, \v2 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word 0 - MRXBOPC 0, 0x56, v1, v2 -.endm - -/* VECTOR LOAD */ -.macro VL v, disp, index="%r0", base - VX_NUM v1, \v - GR_NUM x2, \index - GR_NUM b2, \base - .word 0xE700 | ((v1&15) << 4) | x2 - .word (b2 << 12) | (\disp) - MRXBOPC 0, 0x06, v1 -.endm - -/* VECTOR LOAD ELEMENT */ -.macro VLEx vr1, disp, index="%r0", base, m3, opc - VX_NUM v1, \vr1 - GR_NUM x2, \index - GR_NUM b2, \base - .word 0xE700 | ((v1&15) << 4) | x2 - .word (b2 << 12) | (\disp) - MRXBOPC \m3, \opc, v1 -.endm -.macro VLEB vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x00 -.endm -.macro VLEH vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x01 -.endm -.macro VLEF vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x03 -.endm -.macro VLEG vr1, disp, index="%r0", base, m3 - VLEx \vr1, \disp, \index, \base, \m3, 0x02 -.endm - -/* VECTOR LOAD ELEMENT IMMEDIATE */ -.macro VLEIx vr1, imm2, m3, opc - VX_NUM v1, \vr1 - .word 0xE700 | ((v1&15) << 4) - .word \imm2 - MRXBOPC \m3, \opc, v1 -.endm -.macro VLEIB vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x40 -.endm -.macro VLEIH vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x41 -.endm -.macro VLEIF vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x43 -.endm -.macro VLEIG vr1, imm2, index - VLEIx \vr1, \imm2, \index, 0x42 -.endm - -/* VECTOR LOAD GR FROM VR ELEMENT */ -.macro VLGV gr, vr, disp, base="%r0", m - GR_NUM r1, \gr - GR_NUM b2, \base - VX_NUM v3, \vr - .word 0xE700 | (r1 << 4) | (v3&15) - .word (b2 << 12) | (\disp) - MRXBOPC \m, 0x21, v3 -.endm -.macro VLGVB gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 0 -.endm -.macro VLGVH gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 1 -.endm -.macro VLGVF gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 2 -.endm -.macro VLGVG gr, vr, disp, base="%r0" - VLGV \gr, \vr, \disp, \base, 3 -.endm - -/* VECTOR LOAD MULTIPLE */ -.macro 
VLM vfrom, vto, disp, base, hint=3 - VX_NUM v1, \vfrom - VX_NUM v3, \vto - GR_NUM b2, \base /* Base register */ - .word 0xE700 | ((v1&15) << 4) | (v3&15) - .word (b2 << 12) | (\disp) - MRXBOPC \hint, 0x36, v1, v3 -.endm - -/* VECTOR STORE MULTIPLE */ -.macro VSTM vfrom, vto, disp, base, hint=3 - VX_NUM v1, \vfrom - VX_NUM v3, \vto - GR_NUM b2, \base /* Base register */ - .word 0xE700 | ((v1&15) << 4) | (v3&15) - .word (b2 << 12) | (\disp) - MRXBOPC \hint, 0x3E, v1, v3 -.endm - -/* VECTOR PERMUTE */ -.macro VPERM vr1, vr2, vr3, vr4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - VX_NUM v4, \vr4 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4 -.endm - -/* VECTOR UNPACK LOGICAL LOW */ -.macro VUPLL vr1, vr2, m3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word 0x0000 - MRXBOPC \m3, 0xD4, v1, v2 -.endm -.macro VUPLLB vr1, vr2 - VUPLL \vr1, \vr2, 0 -.endm -.macro VUPLLH vr1, vr2 - VUPLL \vr1, \vr2, 1 -.endm -.macro VUPLLF vr1, vr2 - VUPLL \vr1, \vr2, 2 -.endm - - -/* Vector integer instructions */ - -/* VECTOR AND */ -.macro VN vr1, vr2, vr3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC 0, 0x68, v1, v2, v3 -.endm - -/* VECTOR EXCLUSIVE OR */ -.macro VX vr1, vr2, vr3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC 0, 0x6D, v1, v2, v3 -.endm - -/* VECTOR GALOIS FIELD MULTIPLY SUM */ -.macro VGFM vr1, vr2, vr3, m4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC \m4, 0xB4, v1, v2, v3 -.endm -.macro VGFMB vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 0 -.endm -.macro VGFMH vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 1 -.endm -.macro VGFMF vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 2 -.endm -.macro VGFMG vr1, vr2, vr3 - VGFM \vr1, \vr2, \vr3, 3 -.endm - -/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */ -.macro VGFMA vr1, vr2, vr3, vr4, m5 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - VX_NUM v4, \vr4 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) | (\m5 << 8) - MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4 -.endm -.macro VGFMAB vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 0 -.endm -.macro VGFMAH vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 1 -.endm -.macro VGFMAF vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 2 -.endm -.macro VGFMAG vr1, vr2, vr3, vr4 - VGFMA \vr1, \vr2, \vr3, \vr4, 3 -.endm - -/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */ -.macro VSRLB vr1, vr2, vr3 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC 0, 0x7D, v1, v2, v3 -.endm - -/* VECTOR REPLICATE IMMEDIATE */ -.macro VREPI vr1, imm2, m3 - VX_NUM v1, \vr1 - .word 0xE700 | ((v1&15) << 4) - .word \imm2 - MRXBOPC \m3, 0x45, v1 -.endm -.macro VREPIB vr1, imm2 - VREPI \vr1, \imm2, 0 -.endm -.macro VREPIH vr1, imm2 - VREPI \vr1, \imm2, 1 -.endm -.macro VREPIF vr1, imm2 - VREPI \vr1, \imm2, 2 -.endm -.macro VREPIG vr1, imm2 - VREP \vr1, \imm2, 3 -.endm - -/* VECTOR ADD */ -.macro VA vr1, vr2, vr3, m4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC \m4, 0xF3, v1, v2, v3 -.endm -.macro VAB vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 0 -.endm -.macro VAH vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 1 -.endm -.macro VAF vr1, vr2, vr3 - 
VA \vr1, \vr2, \vr3, 2 -.endm -.macro VAG vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 3 -.endm -.macro VAQ vr1, vr2, vr3 - VA \vr1, \vr2, \vr3, 4 -.endm +#include <asm/vx-insn-asm.h> -/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ -.macro VESRAV vr1, vr2, vr3, m4 - VX_NUM v1, \vr1 - VX_NUM v2, \vr2 - VX_NUM v3, \vr3 - .word 0xE700 | ((v1&15) << 4) | (v2&15) - .word ((v3&15) << 12) - MRXBOPC \m4, 0x7A, v1, v2, v3 -.endm +#ifndef __ASSEMBLY__ -.macro VESRAVB vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 0 -.endm -.macro VESRAVH vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 1 -.endm -.macro VESRAVF vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 2 -.endm -.macro VESRAVG vr1, vr2, vr3 - VESRAV \vr1, \vr2, \vr3, 3 -.endm +asm(".include \"asm/vx-insn-asm.h\"\n"); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLY__ */ #endif /* __ASM_S390_VX_INSN_H */ diff --git a/arch/s390/include/asm/word-at-a-time.h b/arch/s390/include/asm/word-at-a-time.h new file mode 100644 index 000000000000..2579f1694b82 --- /dev/null +++ b/arch/s390/include/asm/word-at-a-time.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_WORD_AT_A_TIME_H +#define _ASM_WORD_AT_A_TIME_H + +#include <linux/kernel.h> +#include <asm/asm-extable.h> +#include <asm/bitsperlong.h> + +struct word_at_a_time { + const unsigned long bits; +}; + +#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x7f) } + +static inline unsigned long prep_zero_mask(unsigned long val, unsigned long data, const struct word_at_a_time *c) +{ + return data; +} + +static inline unsigned long create_zero_mask(unsigned long data) +{ + return __fls(data); +} + +static inline unsigned long find_zero(unsigned long data) +{ + return (data ^ (BITS_PER_LONG - 1)) >> 3; +} + +static inline unsigned long has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) +{ + unsigned long mask = (val & c->bits) + c->bits; + + *data = ~(mask | val | c->bits); + return *data; +} + +static inline unsigned long zero_bytemask(unsigned long data) +{ + return ~1UL << data; +} + +/* + * Load an unaligned word from kernel space. + * + * In the (very unlikely) case of the word being a page-crosser + * and the next page not being mapped, take the exception and + * return zeroes in the non-existing part. + */ +static inline unsigned long load_unaligned_zeropad(const void *addr) +{ + unsigned long data; + + asm volatile( + "0: lg %[data],0(%[addr])\n" + "1: nopr %%r7\n" + EX_TABLE_ZEROPAD(0b, 1b, %[data], %[addr]) + EX_TABLE_ZEROPAD(1b, 1b, %[data], %[addr]) + : [data] "=d" (data) + : [addr] "a" (addr), "m" (*(unsigned long *)addr)); + return data; +} + +#endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/arch/s390/include/uapi/asm/cmb.h b/arch/s390/include/uapi/asm/cmb.h index ecbe94941403..115434ab98fb 100644 --- a/arch/s390/include/uapi/asm/cmb.h +++ b/arch/s390/include/uapi/asm/cmb.h @@ -31,7 +31,7 @@ struct cmbdata { __u64 size; __u64 elapsed_time; - /* basic and exended format: */ + /* basic and extended format: */ __u64 ssch_rsch_count; __u64 sample_count; __u64 device_connect_time; diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 9ec86fae9980..b11d98800458 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -24,7 +24,7 @@ /* * struct dasd_information2_t * represents any data about the device, which is visible to userspace. - * including foramt and featueres. + * including format and featueres. 
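About the word-at-a-time helpers added above: for orientation, the canonical way generic string code chains them, sketched here under the assumption of word-aligned input (function name invented, not part of the patch):

	/* Illustrative only: counting a string's length a word at a time. */
	static inline size_t example_strlen_word(const char *s)
	{
		const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
		const unsigned long *p = (const unsigned long *)s;
		unsigned long val, data;
		size_t len = 0;

		for (;;) {
			val = *p++;
			if (has_zero(val, &data, &constants)) {
				data = prep_zero_mask(val, data, &constants);
				data = create_zero_mask(data);
				return len + find_zero(data);	/* byte index of NUL */
			}
			len += sizeof(unsigned long);
		}
	}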
*/ typedef struct dasd_information2_t { unsigned int devno; /* S/390 devno */ @@ -78,6 +78,7 @@ typedef struct dasd_information2_t { * 0x040: give access to raw eckd data * 0x080: enable discard support * 0x100: enable autodisable for IFCC errors (default) + * 0x200: enable requeue of all requests on autoquiesce */ #define DASD_FEATURE_READONLY 0x001 #define DASD_FEATURE_USEDIAG 0x002 @@ -88,6 +89,7 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_USERAW 0x040 #define DASD_FEATURE_DISCARD 0x080 #define DASD_FEATURE_PATH_AUTODISABLE 0x100 +#define DASD_FEATURE_REQUEUEQUIESCE 0x200 #define DASD_FEATURE_DEFAULT DASD_FEATURE_PATH_AUTODISABLE #define DASD_PARTN_BITS 2 @@ -183,6 +185,18 @@ typedef struct format_data_t { } format_data_t; /* + * struct dasd_copypair_swap_data_t + * represents all data necessary to issue a swap of the copy pair relation + */ +struct dasd_copypair_swap_data_t { + char primary[20]; /* BUSID of primary */ + char secondary[20]; /* BUSID of secondary */ + + /* Reserved for future updates. */ + __u8 reserved[64]; +}; + +/* * values to be used for format_data_t.intensity * 0/8: normal format * 1/9: also write record zero @@ -326,6 +340,8 @@ struct dasd_snid_ioctl_data { #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) /* Release Allocated Space */ #define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t) +/* Swap copy pair relation */ +#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t) /* Get Sense Path Group ID (SNID) data */ #define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) diff --git a/arch/s390/include/uapi/asm/debug.h b/arch/s390/include/uapi/asm/debug.h deleted file mode 100644 index c7c564d9aea4..000000000000 --- a/arch/s390/include/uapi/asm/debug.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S/390 debug facility - * - * Copyright IBM Corp. 1999, 2000 - */ - -#ifndef _UAPIDEBUG_H -#define _UAPIDEBUG_H - -#include <linux/fs.h> - -/* Note: - * struct __debug_entry must be defined outside of #ifdef __KERNEL__ - * in order to allow a user program to analyze the 'raw'-view. - */ - -struct __debug_entry{ - union { - struct { - unsigned long long clock:52; - unsigned long long exception:1; - unsigned long long level:3; - unsigned long long cpuid:8; - } fields; - - unsigned long long stck; - } id; - void* caller; -} __attribute__((packed)); - - -#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */ - -#endif /* _UAPIDEBUG_H */ diff --git a/arch/s390/include/uapi/asm/fs3270.h b/arch/s390/include/uapi/asm/fs3270.h new file mode 100644 index 000000000000..c4bc1108af6a --- /dev/null +++ b/arch/s390/include/uapi/asm/fs3270.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_FS3270_H +#define __ASM_S390_UAPI_FS3270_H + +#include <linux/types.h> +#include <asm/ioctl.h> + +/* ioctls for fullscreen 3270 */ +#define TUBICMD _IO('3', 3) /* set ccw command for fs reads. */ +#define TUBOCMD _IO('3', 4) /* set ccw command for fs writes. */ +#define TUBGETI _IO('3', 7) /* get ccw command for fs reads. */ +#define TUBGETO _IO('3', 8) /* get ccw command for fs writes. 
*/ +#define TUBGETMOD _IO('3', 13) /* get characteristics like model, cols, rows */ + +/* For TUBGETMOD */ +struct raw3270_iocb { + __u16 model; + __u16 line_cnt; + __u16 col_cnt; + __u16 pf_cnt; + __u16 re_cnt; + __u16 map; +}; + +#endif /* __ASM_S390_UAPI_FS3270_H */ diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h new file mode 100644 index 000000000000..e56b9dd23a4b --- /dev/null +++ b/arch/s390/include/uapi/asm/hwctrset.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2021 + * Interface implementation for communication with the CPU Measurement + * counter facility device driver. + * + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + * + * Define for ioctl() commands to communicate with the CPU Measurement + * counter facility device driver. + */ + +#ifndef _PERF_CPUM_CF_DIAG_H +#define _PERF_CPUM_CF_DIAG_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +#define S390_HWCTR_DEVICE "hwctr" +#define S390_HWCTR_START_VERSION 1 + +struct s390_ctrset_start { /* Set CPUs to operate on */ + __u64 version; /* Version of interface */ + __u64 data_bytes; /* # of bytes required */ + __u64 cpumask_len; /* Length of CPU mask in bytes */ + __u64 *cpumask; /* Pointer to CPU mask */ + __u64 counter_sets; /* Bit mask of counter sets to get */ +}; + +struct s390_ctrset_setdata { /* Counter set data */ + __u32 set; /* Counter set number */ + __u32 no_cnts; /* # of counters stored in cv[] */ + __u64 cv[]; /* Counter values (variable length) */ +}; + +struct s390_ctrset_cpudata { /* Counter set data per CPU */ + __u32 cpu_nr; /* CPU number */ + __u32 no_sets; /* # of counters sets in data[] */ + struct s390_ctrset_setdata data[]; +}; + +struct s390_ctrset_read { /* Structure to get all ctr sets */ + __u64 no_cpus; /* Total # of CPUs data taken from */ + struct s390_ctrset_cpudata data[]; +}; + +#define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */ +#define S390_HWCTR_START _IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start) +#define S390_HWCTR_STOP _IO(S390_HWCTR_MAGIC, 2) +#define S390_HWCTR_READ _IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read) +#endif diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index 451ba7d08905..2cd28af50dd4 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -27,6 +27,8 @@ enum ipl_pbt { IPL_PBT_FCP = 0, IPL_PBT_SCP_DATA = 1, IPL_PBT_CCW = 2, + IPL_PBT_ECKD = 3, + IPL_PBT_NVME = 4, }; /* IPL Parameter Block 0 with common fields */ @@ -67,6 +69,30 @@ struct ipl_pb0_fcp { #define IPL_PB0_FCP_OPT_IPL 0x10 #define IPL_PB0_FCP_OPT_DUMP 0x20 +/* IPL Parameter Block 0 for NVMe */ +struct ipl_pb0_nvme { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u32 fid; + __u8 reserved4[12]; + __u32 nsid; + __u8 reserved5[4]; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_NVME_OPT_IPL 0x10 +#define IPL_PB0_NVME_OPT_DUMP 0x20 + /* IPL Parameter Block 0 for CCW */ struct ipl_pb0_ccw { __u32 len; @@ -86,6 +112,34 @@ struct ipl_pb0_ccw { __u8 reserved5[8]; } __packed; +/* IPL Parameter Block 0 for ECKD */ +struct ipl_pb0_eckd { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u32 reserved2[78]; + __u8 opt; + __u8 reserved4[4]; + __u8 reserved5:5; + __u8 ssid:3; + __u16 devno; + __u32 reserved6[5]; + __u32 bootprog; + __u8 
reserved7[12]; + struct { + __u16 cyl; + __u8 head; + __u8 record; + __u32 reserved; + } br_chr __packed; + __u32 scp_data_len; + __u8 reserved8[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_ECKD_OPT_IPL 0x10 +#define IPL_PB0_ECKD_OPT_DUMP 0x20 + #define IPL_PB0_CCW_VM_FLAG_NSS 0x80 #define IPL_PB0_CCW_VM_FLAG_VP 0x40 diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 436ec7636927..abe926d43cbe 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 @@ -158,6 +159,22 @@ struct kvm_s390_vm_cpu_subfunc { __u8 reserved[1728]; }; +#define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST 6 +#define KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST 7 + +#define KVM_S390_VM_CPU_UV_FEAT_NR_BITS 64 +struct kvm_s390_vm_cpu_uv_feat { + union { + struct { + __u64 : 4; + __u64 ap : 1; /* bit 4 */ + __u64 ap_intr : 1; /* bit 5 */ + __u64 : 58; + }; + __u64 feat; + }; +}; + /* kvm attributes for crypto */ #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 @@ -231,11 +248,13 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) +#define KVM_SYNC_DIAG318 (1UL << 12) #define KVM_SYNC_S390_VALID_FIELDS \ (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ - KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ + KVM_SYNC_DIAG318) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 @@ -264,7 +283,8 @@ struct kvm_sync_regs { __u8 reserved2 : 7; __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ - __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + __u64 diag318; /* diagnose 0x318 info */ + __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ union { __u8 sdnx[SDNXL]; /* state description annex */ struct { diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h index e22f0720bbb8..5ad76471e73f 100644 --- a/arch/s390/include/uapi/asm/pkey.h +++ b/arch/s390/include/uapi/asm/pkey.h @@ -2,7 +2,7 @@ /* * Userspace interface to the pkey device driver * - * Copyright IBM Corp. 2017, 2019 + * Copyright IBM Corp. 
2017, 2023 * * Author: Harald Freudenberger <freude@de.ibm.com> * @@ -25,20 +25,31 @@ #define MAXPROTKEYSIZE 64 /* a protected key blob may be up to 64 bytes */ #define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */ #define MAXAESCIPHERKEYSIZE 136 /* our aes cipher keys have always 136 bytes */ +#define MINEP11AESKEYBLOBSIZE 256 /* min EP11 AES key blob size */ +#define MAXEP11AESKEYBLOBSIZE 336 /* max EP11 AES key blob size */ -/* Minimum and maximum size of a key blob */ +/* Minimum size of a key blob */ #define MINKEYBLOBSIZE SECKEYBLOBSIZE -#define MAXKEYBLOBSIZE MAXAESCIPHERKEYSIZE /* defines for the type field within the pkey_protkey struct */ -#define PKEY_KEYTYPE_AES_128 1 -#define PKEY_KEYTYPE_AES_192 2 -#define PKEY_KEYTYPE_AES_256 3 +#define PKEY_KEYTYPE_AES_128 1 +#define PKEY_KEYTYPE_AES_192 2 +#define PKEY_KEYTYPE_AES_256 3 +#define PKEY_KEYTYPE_ECC 4 +#define PKEY_KEYTYPE_ECC_P256 5 +#define PKEY_KEYTYPE_ECC_P384 6 +#define PKEY_KEYTYPE_ECC_P521 7 +#define PKEY_KEYTYPE_ECC_ED25519 8 +#define PKEY_KEYTYPE_ECC_ED448 9 /* the newer ioctls use a pkey_key_type enum for type information */ enum pkey_key_type { PKEY_TYPE_CCA_DATA = (__u32) 1, PKEY_TYPE_CCA_CIPHER = (__u32) 2, + PKEY_TYPE_EP11 = (__u32) 3, + PKEY_TYPE_CCA_ECC = (__u32) 0x1f, + PKEY_TYPE_EP11_AES = (__u32) 6, + PKEY_TYPE_EP11_ECC = (__u32) 7, }; /* the newer ioctls use a pkey_key_size enum for key size information */ @@ -87,6 +98,20 @@ struct pkey_clrkey { }; /* + * EP11 key blobs of type PKEY_TYPE_EP11_AES and PKEY_TYPE_EP11_ECC + * are ep11 blobs prepended by this header: + */ +struct ep11kblob_header { + __u8 type; /* always 0x00 */ + __u8 hver; /* header version, currently needs to be 0x00 */ + __u16 len; /* total length in bytes (including this header) */ + __u8 version; /* PKEY_TYPE_EP11_AES or PKEY_TYPE_EP11_ECC */ + __u8 res0; /* unused */ + __u16 bitlen; /* clear key bit len, 0 for unknown */ + __u8 res1[8]; /* unused */ +} __packed; + +/* * Generate CCA AES secure key. */ struct pkey_genseck { @@ -151,7 +176,7 @@ struct pkey_skey2pkey { #define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey) /* - * Verify the given CCA AES secure key for being able to be useable with + * Verify the given CCA AES secure key for being able to be usable with * the pkey module. Check for correct key type and check for having at * least one crypto card being able to handle this key (master key * or old master key verification pattern matches). @@ -200,7 +225,7 @@ struct pkey_kblob2pkey { /* * Generate secure key, version 2. - * Generate either a CCA AES secure key or a CCA AES cipher key. + * Generate CCA AES secure key, CCA AES cipher key or EP11 AES secure key. * There needs to be a list of apqns given with at least one entry in there. * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain * is not supported. The implementation walks through the list of apqns and @@ -210,10 +235,13 @@ struct pkey_kblob2pkey { * (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to * generate a list of apqns based on the key type to generate. * The keygenflags argument is passed to the low level generation functions - * individual for the key type and has a key type specific meaning. Currently - * only CCA AES cipher keys react to this parameter: Use one or more of the - * PKEY_KEYGEN_* flags to widen the export possibilities. By default a cipher - * key is only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). 
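Pulling this hunk together: a userspace sketch of generating an EP11 AES key through the pkey_genseck2 structure defined just below and its corresponding PKEY_GENSECK2 ioctl. The APQN card/domain values are placeholders, and the structure members not visible in this hunk (type, size, keygenflags, key, keylen) are assumed from the same header:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/pkey.h>

int main(void)
{
	struct pkey_apqn apqn = { .card = 0x0a, .domain = 0x002b };	/* placeholder APQN */
	unsigned char blob[MAXEP11AESKEYBLOBSIZE];
	struct pkey_genseck2 gs;
	int fd;

	memset(&gs, 0, sizeof(gs));
	gs.apqns = &apqn;		/* exact APQN; 0xFFFF wildcards are rejected */
	gs.apqn_entries = 1;
	gs.type = PKEY_TYPE_EP11_AES;
	gs.size = PKEY_SIZE_AES_256;
	gs.keygenflags = 0;		/* 0 = the XCP_BLOB_* defaults described above */
	gs.key = blob;
	gs.keylen = sizeof(blob);	/* in: buffer size, out: actual blob size */

	fd = open("/dev/pkey", O_RDWR);
	if (fd < 0 || ioctl(fd, PKEY_GENSECK2, &gs) < 0) {
		perror("PKEY_GENSECK2");
		return 1;
	}
	printf("generated %u byte EP11 AES-256 key blob\n", gs.keylen);
	close(fd);
	return 0;
}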
+ * individual for the key type and has a key type specific meaning. When + * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_* + * flags to widen the export possibilities. By default a cipher key is + * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * The keygenflag argument for generating an EP11 AES key should either be 0 + * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and + * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags. */ struct pkey_genseck2 { struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets*/ @@ -229,8 +257,8 @@ struct pkey_genseck2 { /* * Generate secure key from clear key value, version 2. - * Construct a CCA AES secure key or CCA AES cipher key from a given clear key - * value. + * Construct an CCA AES secure key, CCA AES cipher key or EP11 AES secure + * key from a given clear key value. * There needs to be a list of apqns given with at least one entry in there. * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain * is not supported. The implementation walks through the list of apqns and @@ -240,10 +268,13 @@ struct pkey_genseck2 { * (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to * generate a list of apqns based on the key type to generate. * The keygenflags argument is passed to the low level generation functions - * individual for the key type and has a key type specific meaning. Currently - * only CCA AES cipher keys react to this parameter: Use one or more of the - * PKEY_KEYGEN_* flags to widen the export possibilities. By default a cipher - * key is only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * individual for the key type and has a key type specific meaning. When + * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_* + * flags to widen the export possibilities. By default a cipher key is + * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * The keygenflag argument for generating an EP11 AES key should either be 0 + * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and + * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags. */ struct pkey_clr2seck2 { struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ @@ -266,14 +297,19 @@ struct pkey_clr2seck2 { * with one apqn able to handle this key. * The function also checks for the master key verification patterns * of the key matching to the current or alternate mkvp of the apqn. - * Currently CCA AES secure keys and CCA AES cipher keys are supported. - * The flags field is updated with some additional info about the apqn mkvp + * For CCA AES secure keys and CCA AES cipher keys this means to check + * the key's mkvp against the current or old mkvp of the apqns. The flags + * field is updated with some additional info about the apqn mkvp * match: If the current mkvp matches to the key's mkvp then the * PKEY_FLAGS_MATCH_CUR_MKVP bit is set, if the alternate mkvp matches to * the key's mkvp the PKEY_FLAGS_MATCH_ALT_MKVP is set. For CCA keys the * alternate mkvp is the old master key verification pattern. * CCA AES secure keys are also checked to have the CPACF export allowed * bit enabled (XPRTCPAC) in the kmf1 field. + * EP11 keys are also supported and the wkvp of the key is checked against + * the current wkvp of the apqns. There is no alternate for this type of + * key and so on a match the flag PKEY_FLAGS_MATCH_CUR_MKVP always is set. 
+ * EP11 keys are also checked to have XCP_BLOB_PROTKEY_EXTRACTABLE set. * The ioctl returns 0 as long as the given or found apqn matches to * matches with the current or alternate mkvp to the key's mkvp. If the given * apqn does not match or there is no such apqn found, -1 with errno @@ -291,7 +327,7 @@ struct pkey_verifykey2 { #define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2) /* - * Transform a key blob (of any type) into a protected key, version 2. + * Transform a key blob into a protected key, version 2. * There needs to be a list of apqns given with at least one entry in there. * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain * is not supported. The implementation walks through the list of apqns and @@ -300,6 +336,8 @@ struct pkey_verifykey2 { * list is tried until success (return 0) or the end of the list is reached * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to * generate a list of apqns based on the key. + * Deriving ECC protected keys from ECC secure keys is not supported with + * this ioctl, use PKEY_KBLOB2PROTK3 for this purpose. */ struct pkey_kblob2pkey2 { __u8 __user *key; /* in: pointer to key blob */ @@ -313,22 +351,26 @@ struct pkey_kblob2pkey2 { /* * Build a list of APQNs based on a key blob given. * Is able to find out which type of secure key is given (CCA AES secure - * key or CCA AES cipher key) and tries to find all matching crypto cards - * based on the MKVP and maybe other criterias (like CCA AES cipher keys - * need a CEX5C or higher). The list of APQNs is further filtered by the key's - * mkvp which needs to match to either the current mkvp or the alternate mkvp - * (which is the old mkvp on CCA adapters) of the apqns. The flags argument may - * be used to limit the matching apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is - * given, only the current mkvp of each apqn is compared. Likewise with the - * PKEY_FLAGS_MATCH_ALT_MKVP. If both are given, it is assumed to - * return apqns where either the current or the alternate mkvp - * matches. At least one of the matching flags needs to be given. + * key, CCA AES cipher key, CCA ECC private key, EP11 AES key, EP11 ECC private + * key) and tries to find all matching crypto cards based on the MKVP and maybe + * other criteria (like CCA AES cipher keys need a CEX5C or higher, EP11 keys + * with BLOB_PKEY_EXTRACTABLE need a CEX7 and EP11 api version 4). The list of + * APQNs is further filtered by the key's mkvp which needs to match to either + * the current mkvp (CCA and EP11) or the alternate mkvp (old mkvp, CCA adapters + * only) of the apqns. The flags argument may be used to limit the matching + * apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of + * each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both + * are given, it is assumed to return apqns where either the current or the + * alternate mkvp matches. At least one of the matching flags needs to be given. + * The flags argument for EP11 keys has no further action and is currently + * ignored (but needs to be given as PKEY_FLAGS_MATCH_CUR_MKVP) as there is only + * the wkvp from the key to match against the apqn's wkvp. * The list of matching apqns is stored into the space given by the apqns * argument and the number of stored entries goes into apqn_entries. If the list * is empty (apqn_entries is 0) the apqn_entries field is updated to the number * of apqn targets found and the ioctl returns with 0. 
If apqn_entries is > 0 * but the number of apqn targets does not fit into the list, the apqn_targets - * field is updatedd with the number of reqired entries but there are no apqn + * field is updated with the number of required entries but there are no apqn * values stored in the list and the ioctl returns with ENOSPC. If no matching * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. */ @@ -348,20 +390,25 @@ struct pkey_apqns4key { * restrict the list by given master key verification patterns. * For different key types there may be different ways to match the * master key verification patterns. For CCA keys (CCA data key and CCA - * cipher key) the first 8 bytes of cur_mkvp refer to the current mkvp value - * of the apqn and the first 8 bytes of the alt_mkvp refer to the old mkvp. - * The flags argument controls if the apqns current and/or alternate mkvp + * cipher key) the first 8 bytes of cur_mkvp refer to the current AES mkvp value + * of the apqn and the first 8 bytes of the alt_mkvp refer to the old AES mkvp. + * For CCA ECC keys it is similar but the match is against the APKA current/old + * mkvp. The flags argument controls if the apqns current and/or alternate mkvp * should match. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current * mkvp of each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. * If both are given, it is assumed to return apqns where either the * current or the alternate mkvp matches. If no match flag is given * (flags is 0) the mkvp values are ignored for the match process. + * For EP11 keys there is only the current wkvp. So if the apqns should also + * match to a given wkvp, then the PKEY_FLAGS_MATCH_CUR_MKVP flag should be + * set. The wkvp value is 32 bytes but only the leftmost 16 bytes are compared + * against the leftmost 16 byte of the wkvp of the apqn. * The list of matching apqns is stored into the space given by the apqns * argument and the number of stored entries goes into apqn_entries. If the list * is empty (apqn_entries is 0) the apqn_entries field is updated to the number * of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0 * but the number of apqn targets does not fit into the list, the apqn_targets - * field is updatedd with the number of reqired entries but there are no apqn + * field is updated with the number of required entries but there are no apqn * values stored in the list and the ioctl returns with ENOSPC. If no matching * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. */ @@ -376,4 +423,30 @@ struct pkey_apqns4keytype { }; #define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype) +/* + * Transform a key blob into a protected key, version 3. + * The difference to version 2 of this ioctl is that the protected key + * buffer is now explicitly and not within a struct pkey_protkey any more. + * So this ioctl is also able to handle EP11 and CCA ECC secure keys and + * provide ECC protected keys. + * There needs to be a list of apqns given with at least one entry in there. + * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain + * is not supported. The implementation walks through the list of apqns and + * tries to send the request to each apqn without any further checking (like + * card type or online state). If the apqn fails, simple the next one in the + * list is tried until success (return 0) or the end of the list is reached + * (return -1 with errno ENODEV). 
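The apqn_entries convention described above (a first call with an empty list just reports the required count) leads to the usual two-call pattern. A sketch against the PKEY_APQNS4K ioctl; the pkey_apqns4key member layout is assumed from this header, and fd is an already-open /dev/pkey descriptor:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <asm/pkey.h>

struct pkey_apqn *apqns_for_key(int fd, unsigned char *key,
				unsigned int keylen, unsigned int *nr)
{
	struct pkey_apqns4key a4k = {
		.key = key,
		.keylen = keylen,
		.flags = PKEY_FLAGS_MATCH_CUR_MKVP | PKEY_FLAGS_MATCH_ALT_MKVP,
		.apqns = NULL,
		.apqn_entries = 0,	/* probe call: just report the count */
	};
	struct pkey_apqn *list;

	if (ioctl(fd, PKEY_APQNS4K, &a4k) < 0 || !a4k.apqn_entries)
		return NULL;		/* error, or no matching APQN at all */
	list = calloc(a4k.apqn_entries, sizeof(*list));
	if (!list)
		return NULL;
	a4k.apqns = list;		/* second call fills the list */
	if (ioctl(fd, PKEY_APQNS4K, &a4k) < 0) {
		free(list);
		return NULL;
	}
	*nr = a4k.apqn_entries;
	return list;
}

Between the two calls the set of matching APQNs can of course change, so callers should be prepared for the second call to return ENOSPC and retry.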
You may use the PKEY_APQNS4K ioctl to + * generate a list of apqns based on the key. + */ +struct pkey_kblob2pkey3 { + __u8 __user *key; /* in: pointer to key blob */ + __u32 keylen; /* in: key blob size */ + struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ + __u32 apqn_entries; /* in: # of apqn target list entries */ + __u32 pkeytype; /* out: prot key type (enum pkey_key_type) */ + __u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */ + __u8 __user *pkey; /* in: pkey blob buffer space ptr */ +}; +#define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3) + #endif /* _UAPI_PKEY_H */ diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h index 543dd70e12c8..bb0826024bb9 100644 --- a/arch/s390/include/uapi/asm/ptrace.h +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -8,6 +8,8 @@ #ifndef _UAPI_S390_PTRACE_H #define _UAPI_S390_PTRACE_H +#include <linux/const.h> + /* * Offsets in the user_regs_struct. They are used for the ptrace * system call and in entry.S @@ -166,6 +168,64 @@ #endif /* __s390x__ */ +#ifndef __s390x__ + +#define PSW_MASK_PER _AC(0x40000000, UL) +#define PSW_MASK_DAT _AC(0x04000000, UL) +#define PSW_MASK_IO _AC(0x02000000, UL) +#define PSW_MASK_EXT _AC(0x01000000, UL) +#define PSW_MASK_KEY _AC(0x00F00000, UL) +#define PSW_MASK_BASE _AC(0x00080000, UL) /* always one */ +#define PSW_MASK_MCHECK _AC(0x00040000, UL) +#define PSW_MASK_WAIT _AC(0x00020000, UL) +#define PSW_MASK_PSTATE _AC(0x00010000, UL) +#define PSW_MASK_ASC _AC(0x0000C000, UL) +#define PSW_MASK_CC _AC(0x00003000, UL) +#define PSW_MASK_PM _AC(0x00000F00, UL) +#define PSW_MASK_RI _AC(0x00000000, UL) +#define PSW_MASK_EA _AC(0x00000000, UL) +#define PSW_MASK_BA _AC(0x00000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF00, UL) + +#define PSW_ADDR_AMODE _AC(0x80000000, UL) +#define PSW_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW_ASC_ACCREG _AC(0x00004000, UL) +#define PSW_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW_ASC_HOME _AC(0x0000C000, UL) + +#else /* __s390x__ */ + +#define PSW_MASK_PER _AC(0x4000000000000000, UL) +#define PSW_MASK_DAT _AC(0x0400000000000000, UL) +#define PSW_MASK_IO _AC(0x0200000000000000, UL) +#define PSW_MASK_EXT _AC(0x0100000000000000, UL) +#define PSW_MASK_BASE _AC(0x0000000000000000, UL) +#define PSW_MASK_KEY _AC(0x00F0000000000000, UL) +#define PSW_MASK_MCHECK _AC(0x0004000000000000, UL) +#define PSW_MASK_WAIT _AC(0x0002000000000000, UL) +#define PSW_MASK_PSTATE _AC(0x0001000000000000, UL) +#define PSW_MASK_ASC _AC(0x0000C00000000000, UL) +#define PSW_MASK_CC _AC(0x0000300000000000, UL) +#define PSW_MASK_PM _AC(0x00000F0000000000, UL) +#define PSW_MASK_RI _AC(0x0000008000000000, UL) +#define PSW_MASK_EA _AC(0x0000000100000000, UL) +#define PSW_MASK_BA _AC(0x0000000080000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF0180000000, UL) + +#define PSW_ADDR_AMODE _AC(0x0000000000000000, UL) +#define PSW_ADDR_INSN _AC(0xFFFFFFFFFFFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x0000000000000000, UL) +#define PSW_ASC_ACCREG _AC(0x0000400000000000, UL) +#define PSW_ASC_SECONDARY _AC(0x0000800000000000, UL) +#define PSW_ASC_HOME _AC(0x0000C00000000000, UL) + +#endif /* __s390x__ */ + #define NUM_GPRS 16 #define NUM_FPRS 16 #define NUM_CRS 16 @@ -179,8 +239,9 @@ #define ACR_SIZE 4 -#define PTRACE_OLDSETOPTIONS 21 - +#define PTRACE_OLDSETOPTIONS 21 +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 #ifndef __ASSEMBLY__ #include <linux/stddef.h> #include 
<linux/types.h> @@ -213,69 +274,6 @@ typedef struct { unsigned long addr; } __attribute__ ((aligned(8))) psw_t; -#ifndef __s390x__ - -#define PSW_MASK_PER 0x40000000UL -#define PSW_MASK_DAT 0x04000000UL -#define PSW_MASK_IO 0x02000000UL -#define PSW_MASK_EXT 0x01000000UL -#define PSW_MASK_KEY 0x00F00000UL -#define PSW_MASK_BASE 0x00080000UL /* always one */ -#define PSW_MASK_MCHECK 0x00040000UL -#define PSW_MASK_WAIT 0x00020000UL -#define PSW_MASK_PSTATE 0x00010000UL -#define PSW_MASK_ASC 0x0000C000UL -#define PSW_MASK_CC 0x00003000UL -#define PSW_MASK_PM 0x00000F00UL -#define PSW_MASK_RI 0x00000000UL -#define PSW_MASK_EA 0x00000000UL -#define PSW_MASK_BA 0x00000000UL - -#define PSW_MASK_USER 0x0000FF00UL - -#define PSW_ADDR_AMODE 0x80000000UL -#define PSW_ADDR_INSN 0x7FFFFFFFUL - -#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 20) - -#define PSW_ASC_PRIMARY 0x00000000UL -#define PSW_ASC_ACCREG 0x00004000UL -#define PSW_ASC_SECONDARY 0x00008000UL -#define PSW_ASC_HOME 0x0000C000UL - -#else /* __s390x__ */ - -#define PSW_MASK_PER 0x4000000000000000UL -#define PSW_MASK_DAT 0x0400000000000000UL -#define PSW_MASK_IO 0x0200000000000000UL -#define PSW_MASK_EXT 0x0100000000000000UL -#define PSW_MASK_BASE 0x0000000000000000UL -#define PSW_MASK_KEY 0x00F0000000000000UL -#define PSW_MASK_MCHECK 0x0004000000000000UL -#define PSW_MASK_WAIT 0x0002000000000000UL -#define PSW_MASK_PSTATE 0x0001000000000000UL -#define PSW_MASK_ASC 0x0000C00000000000UL -#define PSW_MASK_CC 0x0000300000000000UL -#define PSW_MASK_PM 0x00000F0000000000UL -#define PSW_MASK_RI 0x0000008000000000UL -#define PSW_MASK_EA 0x0000000100000000UL -#define PSW_MASK_BA 0x0000000080000000UL - -#define PSW_MASK_USER 0x0000FF0180000000UL - -#define PSW_ADDR_AMODE 0x0000000000000000UL -#define PSW_ADDR_INSN 0xFFFFFFFFFFFFFFFFUL - -#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 52) - -#define PSW_ASC_PRIMARY 0x0000000000000000UL -#define PSW_ASC_ACCREG 0x0000400000000000UL -#define PSW_ASC_SECONDARY 0x0000800000000000UL -#define PSW_ASC_HOME 0x0000C00000000000UL - -#endif /* __s390x__ */ - - /* * The s390_regs structure is used to define the elf_gregset_t. 
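The switch from plain 0x...UL literals to _AC() in the block above is what allows these PSW constants to be shared between C and assembler sources: with __ASSEMBLY__ defined, the UL suffix (a syntax error for the assembler) is dropped. Simplified from <uapi/linux/const.h>:

#ifdef __ASSEMBLY__
#define _AC(X, Y)	X
#else
#define __AC(X, Y)	(X##Y)
#define _AC(X, Y)	__AC(X, Y)
#endif

#define PSW_MASK_DAT	_AC(0x0400000000000000, UL)

/* C:         PSW_MASK_DAT -> (0x0400000000000000UL)
 * assembler: PSW_MASK_DAT -> 0x0400000000000000
 */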
*/ diff --git a/arch/s390/include/uapi/asm/raw3270.h b/arch/s390/include/uapi/asm/raw3270.h new file mode 100644 index 000000000000..6676f102bd50 --- /dev/null +++ b/arch/s390/include/uapi/asm/raw3270.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_RAW3270_H +#define __ASM_S390_UAPI_RAW3270_H + +/* Local Channel Commands */ +#define TC_WRITE 0x01 /* Write */ +#define TC_RDBUF 0x02 /* Read Buffer */ +#define TC_EWRITE 0x05 /* Erase write */ +#define TC_READMOD 0x06 /* Read modified */ +#define TC_EWRITEA 0x0d /* Erase write alternate */ +#define TC_WRITESF 0x11 /* Write structured field */ + +/* Buffer Control Orders */ +#define TO_GE 0x08 /* Graphics Escape */ +#define TO_SF 0x1d /* Start field */ +#define TO_SBA 0x11 /* Set buffer address */ +#define TO_IC 0x13 /* Insert cursor */ +#define TO_PT 0x05 /* Program tab */ +#define TO_RA 0x3c /* Repeat to address */ +#define TO_SFE 0x29 /* Start field extended */ +#define TO_EUA 0x12 /* Erase unprotected to address */ +#define TO_MF 0x2c /* Modify field */ +#define TO_SA 0x28 /* Set attribute */ + +/* Field Attribute Bytes */ +#define TF_INPUT 0x40 /* Visible input */ +#define TF_INPUTN 0x4c /* Invisible input */ +#define TF_INMDT 0xc1 /* Visible, Set-MDT */ +#define TF_LOG 0x60 + +/* Character Attribute Bytes */ +#define TAT_RESET 0x00 +#define TAT_FIELD 0xc0 +#define TAT_EXTHI 0x41 +#define TAT_FGCOLOR 0x42 +#define TAT_CHARS 0x43 +#define TAT_BGCOLOR 0x45 +#define TAT_TRANS 0x46 + +/* Extended-Highlighting Bytes */ +#define TAX_RESET 0x00 +#define TAX_BLINK 0xf1 +#define TAX_REVER 0xf2 +#define TAX_UNDER 0xf4 + +/* Reset value */ +#define TAR_RESET 0x00 + +/* Color values */ +#define TAC_RESET 0x00 +#define TAC_BLUE 0xf1 +#define TAC_RED 0xf2 +#define TAC_PINK 0xf3 +#define TAC_GREEN 0xf4 +#define TAC_TURQ 0xf5 +#define TAC_YELLOW 0xf6 +#define TAC_WHITE 0xf7 +#define TAC_DEFAULT 0x00 + +/* Write Control Characters */ +#define TW_NONE 0x40 /* No particular action */ +#define TW_KR 0xc2 /* Keyboard restore */ +#define TW_PLUSALARM 0x04 /* Add this bit for alarm */ + +#define RAW3270_FIRSTMINOR 1 /* First minor number */ +#define RAW3270_MAXDEVS 255 /* Max number of 3270 devices */ + +#define AID_CLEAR 0x6d +#define AID_ENTER 0x7d +#define AID_PF3 0xf3 +#define AID_PF7 0xf7 +#define AID_PF8 0xf8 +#define AID_READ_PARTITION 0x88 + +#endif /* __ASM_S390_UAPI_RAW3270_H */ diff --git a/arch/s390/include/uapi/asm/schid.h b/arch/s390/include/uapi/asm/schid.h index 58fca6f48410..a3e1cf168553 100644 --- a/arch/s390/include/uapi/asm/schid.h +++ b/arch/s390/include/uapi/asm/schid.h @@ -4,6 +4,8 @@ #include <linux/types.h> +#ifndef __ASSEMBLY__ + struct subchannel_id { __u32 cssid : 8; __u32 : 4; @@ -13,5 +15,6 @@ struct subchannel_id { __u32 sch_no : 16; } __attribute__ ((packed, aligned(4))); +#endif /* __ASSEMBLY__ */ #endif /* _UAPIASM_SCHID_H */ diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h index 1f8803a31079..598d769e76df 100644 --- a/arch/s390/include/uapi/asm/setup.h +++ b/arch/s390/include/uapi/asm/setup.h @@ -1,14 +1 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * Copyright IBM Corp. 
1999, 2010 - */ - -#ifndef _UAPI_ASM_S390_SETUP_H -#define _UAPI_ASM_S390_SETUP_H - -#define COMMAND_LINE_SIZE 4096 - -#define ARCH_COMMAND_LINE_SIZE 896 - -#endif /* _UAPI_ASM_S390_SETUP_H */ diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h index 6ca1e68d7103..ede318653c87 100644 --- a/arch/s390/include/uapi/asm/sie.h +++ b/arch/s390/include/uapi/asm/sie.h @@ -29,7 +29,7 @@ { 0x13, "SIGP conditional emergency signal" }, \ { 0x15, "SIGP sense running" }, \ { 0x16, "SIGP set multithreading"}, \ - { 0x17, "SIGP store additional status ait address"} + { 0x17, "SIGP store additional status at address"} #define icpt_prog_codes \ { 0x0001, "Prog Operation" }, \ diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h index 9a14a611ed82..e74d6ba1bd3b 100644 --- a/arch/s390/include/uapi/asm/signal.h +++ b/arch/s390/include/uapi/asm/signal.h @@ -65,30 +65,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 @@ -132,7 +108,7 @@ struct sigaction { typedef struct sigaltstack { void __user *ss_sp; int ss_flags; - size_t ss_size; + __kernel_size_t ss_size; } stack_t; diff --git a/arch/s390/include/uapi/asm/statfs.h b/arch/s390/include/uapi/asm/statfs.h index 72604f7792c3..f85b50723dd3 100644 --- a/arch/s390/include/uapi/asm/statfs.h +++ b/arch/s390/include/uapi/asm/statfs.h @@ -30,7 +30,7 @@ struct statfs { unsigned int f_namelen; unsigned int f_frsize; unsigned int f_flags; - unsigned int f_spare[4]; + unsigned int f_spare[5]; }; struct statfs64 { @@ -45,7 +45,7 @@ struct statfs64 { unsigned int f_namelen; unsigned int f_frsize; unsigned int f_flags; - unsigned int f_spare[4]; + unsigned int f_spare[5]; }; #endif diff --git a/arch/s390/include/uapi/asm/termios.h b/arch/s390/include/uapi/asm/termios.h deleted file mode 100644 index 54223169c806..000000000000 --- a/arch/s390/include/uapi/asm/termios.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * - * Derived from "include/asm-i386/termios.h" - */ - -#ifndef _UAPI_S390_TERMIOS_H -#define _UAPI_S390_TERMIOS_H - -#include <asm/termbits.h> -#include <asm/ioctls.h> - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control 
characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - - -#endif /* _UAPI_S390_TERMIOS_H */ diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h index da034c606314..84457dbb26b4 100644 --- a/arch/s390/include/uapi/asm/types.h +++ b/arch/s390/include/uapi/asm/types.h @@ -12,15 +12,18 @@ #ifndef __ASSEMBLY__ -/* A address type so that arithmetic can be done on it & it can be upgraded to - 64 bit when necessary -*/ -typedef unsigned long addr_t; +typedef unsigned long addr_t; typedef __signed__ long saddr_t; typedef struct { - __u32 u[4]; -} __vector128; + union { + struct { + __u64 high; + __u64 low; + }; + __u32 u[4]; + }; +} __attribute__((packed, aligned(4))) __vector128; #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h new file mode 100644 index 000000000000..b9c2f14a6af3 --- /dev/null +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2022 + * Author(s): Steffen Eiden <seiden@linux.ibm.com> + */ +#ifndef __S390_ASM_UVDEVICE_H +#define __S390_ASM_UVDEVICE_H + +#include <linux/types.h> + +struct uvio_ioctl_cb { + __u32 flags; + __u16 uv_rc; /* UV header rc value */ + __u16 uv_rrc; /* UV header rrc value */ + __u64 argument_addr; /* Userspace address of uvio argument */ + __u32 argument_len; + __u8 reserved14[0x40 - 0x14]; /* must be zero */ +}; + +#define UVIO_ATT_USER_DATA_LEN 0x100 +#define UVIO_ATT_UID_LEN 0x10 +struct uvio_attest { + __u64 arcb_addr; /* 0x0000 */ + __u64 meas_addr; /* 0x0008 */ + __u64 add_data_addr; /* 0x0010 */ + __u8 user_data[UVIO_ATT_USER_DATA_LEN]; /* 0x0018 */ + __u8 config_uid[UVIO_ATT_UID_LEN]; /* 0x0118 */ + __u32 arcb_len; /* 0x0128 */ + __u32 meas_len; /* 0x012c */ + __u32 add_data_len; /* 0x0130 */ + __u16 user_data_len; /* 0x0134 */ + __u16 reserved136; /* 0x0136 */ +}; + +/** + * uvio_uvdev_info - Information of supported functions + * @supp_uvio_cmds - supported IOCTLs by this device + * @supp_uv_cmds - supported UVCs corresponding to the IOCTL + * + * UVIO request to get information about supported request types by this + * uvdevice and the Ultravisor. Everything is output. Bits are in LSB0 + * ordering. If the bit is set in both, @supp_uvio_cmds and @supp_uv_cmds, the + * uvdevice and the Ultravisor support that call. + * + * Note that bit 0 (UVIO_IOCTL_UVDEV_INFO_NR) is always zero for `supp_uv_cmds` + * as there is no corresponding UV-call. + */ +struct uvio_uvdev_info { + /* + * If bit `n` is set, this device supports the IOCTL with nr `n`. + */ + __u64 supp_uvio_cmds; + /* + * If bit `n` is set, the Ultravisor(UV) supports the UV-call + * corresponding to the IOCTL with nr `n` in the calling contextx (host + * or guest). The value is only valid if the corresponding bit in + * @supp_uvio_cmds is set as well. + */ + __u64 supp_uv_cmds; +}; + +/* + * The following max values define an upper length for the IOCTL in/out buffers. + * However, they do not represent the maximum the Ultravisor allows which is + * often way smaller. 
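A sketch of how userspace drives the uvio_ioctl_cb / uvio_uvdev_info pair documented above, using the UVIO_IOCTL_UVDEV_INFO request defined further below. The /dev/uv node name is an assumption following UVIO_DEVICE_NAME, and error handling is trimmed:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/uvdevice.h>

int main(void)
{
	struct uvio_uvdev_info info;
	struct uvio_ioctl_cb cb;
	int fd = open("/dev/uv", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&cb, 0, sizeof(cb));	/* reserved fields must be zero */
	cb.argument_addr = (__u64)(unsigned long)&info;
	cb.argument_len = sizeof(info);
	if (ioctl(fd, UVIO_IOCTL_UVDEV_INFO, &cb) < 0) {
		perror("UVIO_IOCTL_UVDEV_INFO");
		return 1;
	}
	printf("uvio cmds 0x%016llx, uv cmds 0x%016llx (uv_rc 0x%04x)\n",
	       (unsigned long long)info.supp_uvio_cmds,
	       (unsigned long long)info.supp_uv_cmds, cb.uv_rc);
	close(fd);
	return 0;
}

A call is only worth issuing when its bit is set in both supp_uvio_cmds and supp_uv_cmds, as the structure's documentation above spells out.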
By allowing larger buffer sizes we hopefully do not need + * to update the code with every machine update. It is therefore possible for + * userspace to request more memory than actually used by kernel/UV. + */ +#define UVIO_ATT_ARCB_MAX_LEN 0x100000 +#define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000 +#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000 +#define UVIO_ADD_SECRET_MAX_LEN 0x100000 +#define UVIO_LIST_SECRETS_LEN 0x1000 + +#define UVIO_DEVICE_NAME "uv" +#define UVIO_TYPE_UVC 'u' + +enum UVIO_IOCTL_NR { + UVIO_IOCTL_UVDEV_INFO_NR = 0x00, + UVIO_IOCTL_ATT_NR, + UVIO_IOCTL_ADD_SECRET_NR, + UVIO_IOCTL_LIST_SECRETS_NR, + UVIO_IOCTL_LOCK_SECRETS_NR, + /* must be the last entry */ + UVIO_IOCTL_NUM_IOCTLS +}; + +#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb) +#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR) +#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) +#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR) + +#define UVIO_SUPP_CALL(nr) (1ULL << (nr)) +#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) +#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) +#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR) + +#endif /* __S390_ASM_UVDEVICE_H */ diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index f9e5e1f0821d..f4785abe1b9f 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -4,7 +4,7 @@ * * zcrypt 2.2.1 (user-visible header) * - * Copyright IBM Corp. 2001, 2019 + * Copyright IBM Corp. 2001, 2022 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -36,12 +36,12 @@ * - length(n_modulus) = inputdatalength */ struct ica_rsa_modexpo { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *b_key; - char __user *n_modulus; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *b_key; + __u8 __user *n_modulus; }; /** @@ -59,15 +59,15 @@ struct ica_rsa_modexpo { * - length(u_mult_inv) = inputdatalength/2 + 8 */ struct ica_rsa_modexpo_crt { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *bp_key; - char __user *bq_key; - char __user *np_prime; - char __user *nq_prime; - char __user *u_mult_inv; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *bp_key; + __u8 __user *bq_key; + __u8 __user *np_prime; + __u8 __user *nq_prime; + __u8 __user *u_mult_inv; }; /** @@ -83,67 +83,66 @@ struct ica_rsa_modexpo_crt { * key block */ struct CPRBX { - unsigned short cprb_len; /* CPRB length 220 */ - unsigned char cprb_ver_id; /* CPRB version id. 
0x02 */ - unsigned char pad_000[3]; /* Alignment pad bytes */ - unsigned char func_id[2]; /* function id 0x5432 */ - unsigned char cprb_flags[4]; /* Flags */ - unsigned int req_parml; /* request parameter buffer len */ - unsigned int req_datal; /* request data buffer */ - unsigned int rpl_msgbl; /* reply message block length */ - unsigned int rpld_parml; /* replied parameter block len */ - unsigned int rpl_datal; /* reply data block len */ - unsigned int rpld_datal; /* replied data block len */ - unsigned int req_extbl; /* request extension block len */ - unsigned char pad_001[4]; /* reserved */ - unsigned int rpld_extbl; /* replied extension block len */ - unsigned char padx000[16 - sizeof(char *)]; - unsigned char *req_parmb; /* request parm block 'address' */ - unsigned char padx001[16 - sizeof(char *)]; - unsigned char *req_datab; /* request data block 'address' */ - unsigned char padx002[16 - sizeof(char *)]; - unsigned char *rpl_parmb; /* reply parm block 'address' */ - unsigned char padx003[16 - sizeof(char *)]; - unsigned char *rpl_datab; /* reply data block 'address' */ - unsigned char padx004[16 - sizeof(char *)]; - unsigned char *req_extb; /* request extension block 'addr'*/ - unsigned char padx005[16 - sizeof(char *)]; - unsigned char *rpl_extb; /* reply extension block 'address'*/ - unsigned short ccp_rtcode; /* server return code */ - unsigned short ccp_rscode; /* server reason code */ - unsigned int mac_data_len; /* Mac Data Length */ - unsigned char logon_id[8]; /* Logon Identifier */ - unsigned char mac_value[8]; /* Mac Value */ - unsigned char mac_content_flgs;/* Mac content flag byte */ - unsigned char pad_002; /* Alignment */ - unsigned short domain; /* Domain */ - unsigned char usage_domain[4];/* Usage domain */ - unsigned char cntrl_domain[4];/* Control domain */ - unsigned char S390enf_mask[4];/* S/390 enforcement mask */ - unsigned char pad_004[36]; /* reserved */ + __u16 cprb_len; /* CPRB length 220 */ + __u8 cprb_ver_id; /* CPRB version id. 
0x02 */ + __u8 ctfm; /* Command Type Filtering Mask */ + __u8 pad_000[2]; /* Alignment pad bytes */ + __u8 func_id[2]; /* function id 0x5432 */ + __u8 cprb_flags[4]; /* Flags */ + __u32 req_parml; /* request parameter buffer len */ + __u32 req_datal; /* request data buffer */ + __u32 rpl_msgbl; /* reply message block length */ + __u32 rpld_parml; /* replied parameter block len */ + __u32 rpl_datal; /* reply data block len */ + __u32 rpld_datal; /* replied data block len */ + __u32 req_extbl; /* request extension block len */ + __u8 _pad_001[4]; /* reserved */ + __u32 rpld_extbl; /* replied extension block len */ + __u8 _pad_002[16 - sizeof(__u8 *)]; + __u8 __user *req_parmb; /* request parm block 'address' */ + __u8 _pad_003[16 - sizeof(__u8 *)]; + __u8 __user *req_datab; /* request data block 'address' */ + __u8 _pad_004[16 - sizeof(__u8 *)]; + __u8 __user *rpl_parmb; /* reply parm block 'address' */ + __u8 _pad_005[16 - sizeof(__u8 *)]; + __u8 __user *rpl_datab; /* reply data block 'address' */ + __u8 _pad_006[16 - sizeof(__u8 *)]; + __u8 __user *req_extb; /* request extension block 'addr'*/ + __u8 _pad_007[16 - sizeof(__u8 *)]; + __u8 __user *rpl_extb; /* reply extension block 'address'*/ + __u16 ccp_rtcode; /* server return code */ + __u16 ccp_rscode; /* server reason code */ + __u32 mac_data_len; /* Mac Data Length */ + __u8 logon_id[8]; /* Logon Identifier */ + __u8 mac_value[8]; /* Mac Value */ + __u8 mac_content_flgs; /* Mac content flag byte */ + __u8 _pad_008; /* Alignment */ + __u16 domain; /* Domain */ + __u8 _pad_009[12]; /* reserved, checked for zeros */ + __u8 _pad_010[36]; /* reserved */ } __attribute__((packed)); /** * xcRB */ struct ica_xcRB { - unsigned short agent_ID; - unsigned int user_defined; - unsigned short request_ID; - unsigned int request_control_blk_length; - unsigned char padding1[16 - sizeof(char *)]; - char __user *request_control_blk_addr; - unsigned int request_data_length; - char padding2[16 - sizeof(char *)]; - char __user *request_data_address; - unsigned int reply_control_blk_length; - char padding3[16 - sizeof(char *)]; - char __user *reply_control_blk_addr; - unsigned int reply_data_length; - char padding4[16 - sizeof(char *)]; - char __user *reply_data_addr; - unsigned short priority_window; - unsigned int status; + __u16 agent_ID; + __u32 user_defined; + __u16 request_ID; + __u32 request_control_blk_length; + __u8 _padding1[16 - sizeof(__u8 *)]; + __u8 __user *request_control_blk_addr; + __u32 request_data_length; + __u8 _padding2[16 - sizeof(__u8 *)]; + __u8 __user *request_data_address; + __u32 reply_control_blk_length; + __u8 _padding3[16 - sizeof(__u8 *)]; + __u8 __user *reply_control_blk_addr; + __u32 reply_data_length; + __u8 __padding4[16 - sizeof(__u8 *)]; + __u8 __user *reply_data_addr; + __u16 priority_window; + __u32 status; } __attribute__((packed)); /** @@ -161,17 +160,17 @@ struct ica_xcRB { * @payload_len: Payload length */ struct ep11_cprb { - __u16 cprb_len; - unsigned char cprb_ver_id; - unsigned char pad_000[2]; - unsigned char flags; - unsigned char func_id[2]; - __u32 source_id; - __u32 target_id; - __u32 ret_code; - __u32 reserved1; - __u32 reserved2; - __u32 payload_len; + __u16 cprb_len; + __u8 cprb_ver_id; + __u8 pad_000[2]; + __u8 flags; + __u8 func_id[2]; + __u32 source_id; + __u32 target_id; + __u32 ret_code; + __u32 reserved1; + __u32 reserved2; + __u32 payload_len; } __attribute__((packed)); /** @@ -197,13 +196,13 @@ struct ep11_target_dev { */ struct ep11_urb { __u16 targets_num; - __u64 targets; + __u8 __user 
*targets; __u64 weight; __u64 req_no; __u64 req_len; - __u64 req; + __u8 __user *req; __u64 resp_len; - __u64 resp; + __u8 __user *resp; } __attribute__((packed)); /** @@ -237,7 +236,9 @@ struct zcrypt_device_matrix_ext { struct zcrypt_device_status_ext device[MAX_ZDEV_ENTRIES_EXT]; }; -#define AUTOSELECT 0xFFFFFFFF +#define AUTOSELECT 0xFFFFFFFF +#define AUTOSEL_AP ((__u16)0xFFFF) +#define AUTOSEL_DOM ((__u16)0xFFFF) #define ZCRYPT_IOCTL_MAGIC 'z' @@ -286,7 +287,7 @@ struct zcrypt_device_matrix_ext { * 0x08: CEX3A * 0x0a: CEX4 * 0x0b: CEX5 - * 0x0c: CEX6 and CEX7 + * 0x0c: CEX6, CEX7 or CEX8 * 0x0d: device is disabled * * ZCRYPT_QDEPTH_MASK @@ -303,12 +304,12 @@ struct zcrypt_device_matrix_ext { /** * Supported ioctl calls */ -#define ICARSAMODEXPO _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) -#define ICARSACRT _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) -#define ZSECSENDCPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) -#define ZSENDEP11CPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) +#define ICARSAMODEXPO _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) +#define ICARSACRT _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) +#define ZSECSENDCPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) +#define ZSENDEP11CPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) -#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0) +#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0) #define ZCRYPT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x58, char[MAX_ZDEV_CARDIDS_EXT]) #define ZCRYPT_QDEPTH_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x59, char[MAX_ZDEV_CARDIDS_EXT]) #define ZCRYPT_PERDEV_REQCNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x5a, int[MAX_ZDEV_CARDIDS_EXT]) @@ -350,7 +351,7 @@ struct zcrypt_device_matrix { }; /* Deprecated: use ZCRYPT_DEVICE_STATUS */ -#define ZDEVICESTATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0) +#define ZDEVICESTATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0) /* Deprecated: use ZCRYPT_STATUS_MASK */ #define Z90STAT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x48, char[64]) /* Deprecated: use ZCRYPT_QDEPTH_MASK */ diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore index c5f676c3c224..bbb90f92d051 100644 --- a/arch/s390/kernel/.gitignore +++ b/arch/s390/kernel/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vmlinux.lds diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 2b1203cf7be6..7a562b4199c8 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -10,6 +10,7 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) # Do not trace early setup code CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) endif @@ -33,51 +34,52 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -# -# Pass UTS_MACHINE for user_regset definition -# -CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' - -obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o -obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o -obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o -obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o +obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o +obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o +obj-y += debug.o irq.o ipl.o dis.o diag.o 
vdso.o cpufeature.o +obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o +obj-y += entry.o reipl.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o -extra-y += head64.o vmlinux.lds +extra-y += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o -obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o compat-obj-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o +obj-$(CONFIG_KPROBES) += kprobes_insn_page.o +obj-$(CONFIG_KPROBES) += mcount.o +obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o +obj-$(CONFIG_CERT_STORE) += cert_store.o +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o -obj-$(CONFIG_IMA) += ima_arch.o - -obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o obj-$(CONFIG_TRACEPOINTS) += trace.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ +obj-$(CONFIG_COMPAT) += vdso32/ diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c new file mode 100644 index 000000000000..f9efc54ec4b7 --- /dev/null +++ b/arch/s390/kernel/abs_lowcore.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/pgtable.h> +#include <asm/abs_lowcore.h> + +unsigned long __bootdata_preserved(__abs_lowcore); + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + unsigned long phys = __pa(lc); + int rc, i; + + for (i = 0; i < LC_PAGES; i++) { + rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc); + if (rc) { + /* + * Do not unmap allocated page tables in case the + * allocation was not requested. In such a case the + * request is expected coming from an atomic context, + * while the unmap attempt might sleep. 
+ */ + if (alloc) { + for (--i; i >= 0; i--) { + addr -= PAGE_SIZE; + vmem_unmap_4k_page(addr); + } + } + return rc; + } + addr += PAGE_SIZE; + phys += PAGE_SIZE; + } + return 0; +} + +void abs_lowcore_unmap(int cpu) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + int i; + + for (i = 0; i < LC_PAGES; i++) { + vmem_unmap_4k_page(addr); + addr += PAGE_SIZE; + } +} diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 8e1f2aee85ef..e7bca29f9c34 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/facility.h> #include <asm/nospec-branch.h> -#define MAX_PATCH_LEN (255 - 1) - static int __initdata_or_module alt_instr_disabled; static int __init disable_alternative_instructions(char *str) @@ -16,86 +17,30 @@ static int __init disable_alternative_instructions(char *str) early_param("noaltinstr", disable_alternative_instructions); -struct brcl_insn { - u16 opc; - s32 disp; -} __packed; - -static u16 __initdata_or_module nop16 = 0x0700; -static u32 __initdata_or_module nop32 = 0x47000000; -static struct brcl_insn __initdata_or_module nop48 = { - 0xc004, 0 -}; - -static const void *nops[] __initdata_or_module = { - &nop16, - &nop32, - &nop48 -}; - -static void __init_or_module add_jump_padding(void *insns, unsigned int len) -{ - struct brcl_insn brcl = { - 0xc0f4, - len / 2 - }; - - memcpy(insns, &brcl, sizeof(brcl)); - insns += sizeof(brcl); - len -= sizeof(brcl); - - while (len > 0) { - memcpy(insns, &nop16, 2); - insns += 2; - len -= 2; - } -} - -static void __init_or_module add_padding(void *insns, unsigned int len) -{ - if (len > 6) - add_jump_padding(insns, len); - else if (len >= 2) - memcpy(insns, nops[len / 2 - 1], len); -} - static void __init_or_module __apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. 
*/ for (a = start; a < end; a++) { - int insnbuf_sz = 0; - instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - if (!__test_facility(a->facility, - S390_lowcore.alt_stfle_fac_list)) + if (!__test_facility(a->facility, alt_stfle_fac_list)) continue; - if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { + if (unlikely(a->instrlen % 2)) { WARN_ONCE(1, "cpu alternatives instructions length is " "odd, skipping patching\n"); continue; } - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; - - if (a->instrlen > a->replacementlen) { - add_padding(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; - } - - s390_kernel_write(instr, insnbuf, insnbuf_sz); + s390_kernel_write(instr, replacement, a->instrlen); } } @@ -111,3 +56,20 @@ void __init apply_alternative_instructions(void) { apply_alternatives(__alt_instructions, __alt_instructions_end); } + +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index ce33406cfe83..fa5f6885c74a 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -11,11 +11,10 @@ #include <linux/kvm_host.h> #include <linux/sched.h> #include <linux/purgatory.h> +#include <linux/pgtable.h> +#include <linux/ftrace.h> #include <asm/idle.h> -#include <asm/vdso.h> -#include <asm/pgtable.h> #include <asm/gmap.h> -#include <asm/nmi.h> #include <asm/stacktrace.h> int main(void) @@ -27,80 +26,55 @@ int main(void) BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); - OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table); - OFFSET(__THREAD_last_break, thread_struct, last_break); - OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc); - OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs); - OFFSET(__THREAD_per_cause, thread_struct, per_event.cause); - OFFSET(__THREAD_per_address, thread_struct, per_event.address); - OFFSET(__THREAD_per_paid, thread_struct, per_event.paid); - OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb); BLANK(); /* thread info offsets */ OFFSET(__TI_flags, task_struct, thread_info.flags); BLANK(); /* pt_regs offsets */ - OFFSET(__PT_ARGS, pt_regs, args); OFFSET(__PT_PSW, pt_regs, psw); OFFSET(__PT_GPRS, pt_regs, gprs); + OFFSET(__PT_R0, pt_regs, gprs[0]); + OFFSET(__PT_R1, pt_regs, gprs[1]); + OFFSET(__PT_R2, pt_regs, gprs[2]); + OFFSET(__PT_R3, pt_regs, gprs[3]); + OFFSET(__PT_R4, pt_regs, gprs[4]); + OFFSET(__PT_R5, pt_regs, gprs[5]); + OFFSET(__PT_R6, pt_regs, gprs[6]); + OFFSET(__PT_R7, pt_regs, gprs[7]); + OFFSET(__PT_R8, pt_regs, gprs[8]); + OFFSET(__PT_R9, pt_regs, gprs[9]); + OFFSET(__PT_R10, pt_regs, gprs[10]); + OFFSET(__PT_R11, pt_regs, gprs[11]); + OFFSET(__PT_R12, pt_regs, gprs[12]); + OFFSET(__PT_R13, pt_regs, gprs[13]); + OFFSET(__PT_R14, pt_regs, gprs[14]); + OFFSET(__PT_R15, pt_regs, gprs[15]); OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); - OFFSET(__PT_INT_CODE, pt_regs, int_code); - OFFSET(__PT_INT_PARM, pt_regs, int_parm); - OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long); OFFSET(__PT_FLAGS, pt_regs, flags); + OFFSET(__PT_CR1, pt_regs, cr1); + OFFSET(__PT_LAST_BREAK, pt_regs, last_break); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); BLANK(); /* stack_frame offsets */ OFFSET(__SF_BACKCHAIN, 
stack_frame, back_chain); OFFSET(__SF_GPRS, stack_frame, gprs); - OFFSET(__SF_EMPTY, stack_frame, empty1); - OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]); - OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]); - OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]); - OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[3]); - BLANK(); - /* timeval/timezone offsets for use by vdso */ - OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count); - OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp); - OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec); - OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec); - OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec); - OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec); - OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec); - OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec); - OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec); - OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec); - OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest); - OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available); - OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult); - OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift); - OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir); - OFFSET(__VDSO_TS_END, vdso_data, ts_end); - OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base); - OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time); - OFFSET(__VDSO_GETCPU_VAL, vdso_per_cpu_data, getcpu_val); - BLANK(); - /* constants used by the vdso */ - DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); - DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE); - DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID); - DEFINE(__CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); - DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC); + OFFSET(__SF_EMPTY, stack_frame, empty[0]); + OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); + OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); + OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); + OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); + DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); /* idle data offsets */ OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); - OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit); OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); - OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit); + OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter); BLANK(); /* hardware defined lowcore locations 0x000 - 0x1ff */ OFFSET(__LC_EXT_PARAMS, lowcore, ext_params); OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr); OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code); - OFFSET(__LC_SVC_ILC, lowcore, svc_ilc); - OFFSET(__LC_SVC_INT_CODE, lowcore, svc_code); OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc); OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code); OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code); @@ -118,12 +92,12 @@ int main(void) OFFSET(__LC_SUBCHANNEL_NR, lowcore, subchannel_nr); OFFSET(__LC_IO_INT_PARM, lowcore, io_int_parm); OFFSET(__LC_IO_INT_WORD, lowcore, io_int_word); - OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list); - OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list); OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); - OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); 
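As background for readers new to asm-offsets.c: none of the OFFSET()/DEFINE() entries in this file ever execute. The file is only compiled to assembly, and the generic kbuild macros emit marker strings into the .s output that the build then turns into #defines usable from assembler sources such as entry.S. The following is a minimal standalone sketch of that mechanism with simplified stand-in macros and a toy struct, not the kernel's own definitions:

#include <stddef.h>

struct toy_regs {
	unsigned long psw[2];
	unsigned long gprs[16];
};

/* Emit "->SYM value" into the generated assembly; never meant to run. */
#define DEFINE_TOY(sym, val) \
	__asm__ volatile("\n.ascii \"->" #sym " %0\"" : : "i" (val))
#define OFFSET_TOY(sym, str, mem) \
	DEFINE_TOY(sym, offsetof(struct str, mem))

int main(void)
{
	OFFSET_TOY(__TOY_GPRS, toy_regs, gprs);
	DEFINE_TOY(__TOY_SIZE, sizeof(struct toy_regs));
	return 0;
}

Compiling this with gcc -S and grepping the output for "->" shows the markers that the kbuild post-processing step converts into a generated header of #defines.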
+ OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break); + OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); + OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw); OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw); @@ -143,39 +117,33 @@ int main(void) OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); OFFSET(__LC_RETURN_PSW, lowcore, return_psw); OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); - OFFSET(__LC_SYNC_ENTER_TIMER, lowcore, sync_enter_timer); - OFFSET(__LC_ASYNC_ENTER_TIMER, lowcore, async_enter_timer); + OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer); OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer); OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer); - OFFSET(__LC_USER_TIMER, lowcore, user_timer); - OFFSET(__LC_SYSTEM_TIMER, lowcore, system_timer); - OFFSET(__LC_STEAL_TIMER, lowcore, steal_timer); OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); - OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); - OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator); OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack); OFFSET(__LC_RESTART_STACK, lowcore, restart_stack); + OFFSET(__LC_MCCK_STACK, lowcore, mcck_stack); OFFSET(__LC_RESTART_FN, lowcore, restart_fn); OFFSET(__LC_RESTART_DATA, lowcore, restart_data); OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source); + OFFSET(__LC_RESTART_FLAGS, lowcore, restart_flags); + OFFSET(__LC_KERNEL_ASCE, lowcore, kernel_asce); OFFSET(__LC_USER_ASCE, lowcore, user_asce); - OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce); OFFSET(__LC_LPP, lowcore, lpp); OFFSET(__LC_CURRENT_PID, lowcore, current_pid); - OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); - OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data); - OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); - OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); - OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); + OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); + OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info); + OFFSET(__LC_OS_INFO, lowcore, os_info); /* hardware defined lowcore locations 0x1000 - 0x18ff */ OFFSET(__LC_MCESAD, lowcore, mcesad); OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2); @@ -187,13 +155,11 @@ int main(void) OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area); OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area); OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area); + OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area); OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area); OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); - /* extended machine check save area */ - OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area); - BLANK(); /* gmap/sie offsets */ OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); @@ -202,5 +168,23 @@ int main(void) OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start); OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len); DEFINE(__KEXEC_SHA_REGION_SIZE, 
sizeof(struct kexec_sha_region)); + /* sizeof kernel parameter area */ + DEFINE(__PARMAREA_SIZE, sizeof(struct parmarea)); + /* kernel parameter area offsets */ + DEFINE(IPL_DEVICE, PARMAREA + offsetof(struct parmarea, ipl_device)); + DEFINE(INITRD_START, PARMAREA + offsetof(struct parmarea, initrd_start)); + DEFINE(INITRD_SIZE, PARMAREA + offsetof(struct parmarea, initrd_size)); + DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base)); + DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); + DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); + DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* function graph return value tracing */ + OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2); + OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp); + DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs)); +#endif + OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs); + DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs)); return 0; } diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c index d395c6c9944c..02051a596b87 100644 --- a/arch/s390/kernel/audit.c +++ b/arch/s390/kernel/audit.c @@ -47,15 +47,17 @@ int audit_classify_syscall(int abi, unsigned syscall) #endif switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 0; + return AUDITSC_NATIVE; } } diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S deleted file mode 100644 index b79e0fd571f8..000000000000 --- a/arch/s390/kernel/base.S +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * arch/s390/kernel/base.S - * - * Copyright IBM Corp. 2006, 2007 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> - * Michael Holzheu <holzheu@de.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/nospec-insn.h> -#include <asm/ptrace.h> -#include <asm/sigp.h> - - GEN_BR_THUNK %r9 - GEN_BR_THUNK %r14 - -ENTRY(s390_base_ext_handler) - stmg %r0,%r15,__LC_SAVE_AREA_ASYNC - basr %r13,0 -0: aghi %r15,-STACK_FRAME_OVERHEAD - larl %r1,s390_base_ext_handler_fn - lg %r9,0(%r1) - ltgr %r9,%r9 - jz 1f - BASR_EX %r14,%r9 -1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC - ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit - lpswe __LC_EXT_OLD_PSW -ENDPROC(s390_base_ext_handler) - - .section .bss - .align 8 - .globl s390_base_ext_handler_fn -s390_base_ext_handler_fn: - .quad 0 - .previous - -ENTRY(s390_base_pgm_handler) - stmg %r0,%r15,__LC_SAVE_AREA_SYNC - basr %r13,0 -0: aghi %r15,-STACK_FRAME_OVERHEAD - larl %r1,s390_base_pgm_handler_fn - lg %r9,0(%r1) - ltgr %r9,%r9 - jz 1f - BASR_EX %r14,%r9 - lmg %r0,%r15,__LC_SAVE_AREA_SYNC - lpswe __LC_PGM_OLD_PSW -1: lpswe disabled_wait_psw-0b(%r13) -ENDPROC(s390_base_pgm_handler) - - .align 8 -disabled_wait_psw: - .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler - - .section .bss - .align 8 - .globl s390_base_pgm_handler_fn -s390_base_pgm_handler_fn: - .quad 0 - .previous diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index d66825e53fce..56254fa06f99 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -3,7 +3,6 @@ * Extract CPU cache information and expose them via sysfs. * * Copyright IBM Corp. 
2012 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/seq_file.h> @@ -47,7 +46,7 @@ struct cache_info { #define CACHE_MAX_LEVEL 8 union cache_topology { struct cache_info ci[CACHE_MAX_LEVEL]; - unsigned long long raw; + unsigned long raw; }; static const char * const cache_type_string[] = { @@ -71,8 +70,6 @@ void show_cacheinfo(struct seq_file *m) struct cacheinfo *cache; int idx; - if (!test_facility(34)) - return; this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask)); for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { cache = this_cpu_ci->info_list + idx; @@ -132,8 +129,6 @@ int init_cache_level(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; if (!this_cpu_ci) return -EINVAL; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); @@ -157,8 +152,6 @@ int populate_cache_leaves(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); for (idx = 0, level = 0; level < this_cpu_ci->num_levels && idx < this_cpu_ci->num_leaves; idx++, level++) { diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c new file mode 100644 index 000000000000..554447768bdd --- /dev/null +++ b/arch/s390/kernel/cert_store.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIAG 0x320 support and certificate store handling + * + * Copyright IBM Corp. 2023 + * Author(s): Anastasia Eskova <anastasia.eskova@ibm.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/key-type.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/kobject.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <crypto/sha2.h> +#include <keys/user-type.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/sclp.h> + +#define DIAG_MAX_RETRIES 10 + +#define VCE_FLAGS_VALID_MASK 0x80 + +#define ISM_LEN_DWORDS 4 +#define VCSSB_LEN_BYTES 128 +#define VCSSB_LEN_NO_CERTS 4 +#define VCB_LEN_NO_CERTS 64 +#define VC_NAME_LEN_BYTES 64 + +#define CERT_STORE_KEY_TYPE_NAME "cert_store_key" +#define CERT_STORE_KEYRING_NAME "cert_store" + +static debug_info_t *cert_store_dbf; +static debug_info_t *cert_store_hexdump; + +#define pr_dbf_msg(fmt, ...) \ + debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__) + +enum diag320_subcode { + DIAG320_SUBCODES = 0, + DIAG320_STORAGE = 1, + DIAG320_CERT_BLOCK = 2, +}; + +enum diag320_rc { + DIAG320_RC_OK = 0x0001, + DIAG320_RC_CS_NOMATCH = 0x0306, +}; + +/* Verification Certificates Store Support Block (VCSSB). */ +struct vcssb { + u32 vcssb_length; + u8 pad_0x04[3]; + u8 version; + u8 pad_0x08[8]; + u32 cs_token; + u8 pad_0x14[12]; + u16 total_vc_index_count; + u16 max_vc_index_count; + u8 pad_0x24[28]; + u32 max_vce_length; + u32 max_vcxe_length; + u8 pad_0x48[8]; + u32 max_single_vcb_length; + u32 total_vcb_length; + u32 max_single_vcxb_length; + u32 total_vcxb_length; + u8 pad_0x60[32]; +} __packed __aligned(8); + +/* Verification Certificate Entry (VCE) Header. 
*/ +struct vce_header { + u32 vce_length; + u8 flags; + u8 key_type; + u16 vc_index; + u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */ + u8 vc_format; + u8 pad_0x49; + u16 key_id_length; + u8 pad_0x4c; + u8 vc_hash_type; + u16 vc_hash_length; + u8 pad_0x50[4]; + u32 vc_length; + u8 pad_0x58[8]; + u16 vc_hash_offset; + u16 vc_offset; + u8 pad_0x64[28]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB) Header. */ +struct vcb_header { + u32 vcb_input_length; + u8 pad_0x04[4]; + u16 first_vc_index; + u16 last_vc_index; + u32 pad_0x0c; + u32 cs_token; + u8 pad_0x14[12]; + u32 vcb_output_length; + u8 pad_0x24[3]; + u8 version; + u16 stored_vc_count; + u16 remaining_vc_count; + u8 pad_0x2c[20]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB). */ +struct vcb { + struct vcb_header vcb_hdr; + u8 vcb_buf[]; +} __packed __aligned(4); + +/* Verification Certificate Entry (VCE). */ +struct vce { + struct vce_header vce_hdr; + u8 cert_data_buf[]; +} __packed __aligned(4); + +static void cert_store_key_describe(const struct key *key, struct seq_file *m) +{ + char ascii[VC_NAME_LEN_BYTES + 1]; + + /* + * First 64 bytes of the key description is key name in EBCDIC CP 500. + * Convert it to ASCII for displaying in /proc/keys. + */ + strscpy(ascii, key->description, sizeof(ascii)); + EBCASC_500(ascii, VC_NAME_LEN_BYTES); + seq_puts(m, ascii); + + seq_puts(m, &key->description[VC_NAME_LEN_BYTES]); + if (key_is_positive(key)) + seq_printf(m, ": %u", key->datalen); +} + +/* + * Certificate store key type takes over properties of + * user key but cannot be updated. + */ +static struct key_type key_type_cert_store_key = { + .name = CERT_STORE_KEY_TYPE_NAME, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = cert_store_key_describe, + .read = user_read, +}; + +/* Logging functions. */ +static void pr_dbf_vcb(const struct vcb *b) +{ + pr_dbf_msg("VCB Header:"); + pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length); + pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index); + pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index); + pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token); + pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length); + pr_dbf_msg("version: %d", b->vcb_hdr.version); + pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count); + pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count); +} + +static void pr_dbf_vce(const struct vce *e) +{ + unsigned char vc_name[VC_NAME_LEN_BYTES + 1]; + char log_string[VC_NAME_LEN_BYTES + 40]; + + pr_dbf_msg("VCE Header:"); + pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length); + pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags); + pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type); + pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index); + pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format); + pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length); + pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type); + pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length); + pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset); + pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length); + pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset); + + /* Certificate name in ASCII. 
*/ + memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES); + EBCASC_500(vc_name, VC_NAME_LEN_BYTES); + vc_name[VC_NAME_LEN_BYTES] = '\0'; + + snprintf(log_string, sizeof(log_string), + "index: %d vce_hdr.vc_name (ASCII): %s", + e->vce_hdr.vc_index, vc_name); + debug_text_event(cert_store_hexdump, 3, log_string); + + /* Certificate data. */ + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start"); + debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128); + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end"); + debug_event(cert_store_hexdump, 3, + (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128); +} + +static void pr_dbf_vcssb(const struct vcssb *s) +{ + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1"); + debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES); + + pr_dbf_msg("VCSSB:"); + pr_dbf_msg("vcssb_length: %u", s->vcssb_length); + pr_dbf_msg("version: %u", s->version); + pr_dbf_msg("cs_token: %u", s->cs_token); + pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count); + pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count); + pr_dbf_msg("max_vce_length: %u", s->max_vce_length); + pr_dbf_msg("max_vcxe_length: %u", s->max_vcxe_length); + pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length); + pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length); + pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length); + pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length); +} + +static int __diag320(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, }; + + asm volatile( + " diag %[rp],%[subcode],0x320\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + + return rp.odd; +} + +static int diag320(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X320); + + return __diag320(subcode, addr); +} + +/* + * Calculate the SHA256 hash of the VCE certificate and compare it to the + * hash stored in the VCE. Return -EINVAL if the hashes don't match.
+ */ +static int check_certificate_hash(const struct vce *vce) +{ + u8 hash[SHA256_DIGEST_SIZE]; + u16 vc_hash_length; + u8 *vce_hash; + + vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset; + vc_hash_length = vce->vce_hdr.vc_hash_length; + sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash); + if (memcmp(vce_hash, hash, vc_hash_length) == 0) + return 0; + + pr_dbf_msg("SHA256 hash of received certificate does not match"); + debug_text_event(cert_store_hexdump, 3, "VCE hash:"); + debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE); + debug_text_event(cert_store_hexdump, 3, "Calculated hash:"); + debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE); + + return -EINVAL; +} + +static int check_certificate_valid(const struct vce *vce) +{ + if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) { + pr_dbf_msg("Certificate entry is invalid"); + return -EINVAL; + } + if (vce->vce_hdr.vc_format != 1) { + pr_dbf_msg("Certificate format is not supported"); + return -EINVAL; + } + if (vce->vce_hdr.vc_hash_type != 1) { + pr_dbf_msg("Hash type is not supported"); + return -EINVAL; + } + + return check_certificate_hash(vce); +} + +static struct key *get_user_session_keyring(void) +{ + key_ref_t us_keyring_ref; + + us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING, + KEY_LOOKUP_CREATE, KEY_NEED_LINK); + if (IS_ERR(us_keyring_ref)) { + pr_dbf_msg("Couldn't get user session keyring: %ld", + PTR_ERR(us_keyring_ref)); + return ERR_PTR(-ENOKEY); + } + key_ref_put(us_keyring_ref); + return key_ref_to_ptr(us_keyring_ref); +} + +/* Invalidate all keys from cert_store keyring. */ +static int invalidate_keyring_keys(struct key *keyring) +{ + unsigned long num_keys, key_index; + size_t keyring_payload_len; + key_serial_t *key_array; + struct key *current_key; + int rc; + + keyring_payload_len = key_type_keyring.read(keyring, NULL, 0); + num_keys = keyring_payload_len / sizeof(key_serial_t); + key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL); + if (!key_array) + return -ENOMEM; + + rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len); + if (rc != keyring_payload_len) { + pr_dbf_msg("Couldn't read keyring payload"); + goto out; + } + + for (key_index = 0; key_index < num_keys; key_index++) { + current_key = key_lookup(key_array[key_index]); + pr_dbf_msg("Invalidating key %08x", current_key->serial); + + key_invalidate(current_key); + key_put(current_key); + rc = key_unlink(keyring, current_key); + if (rc) { + pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc); + break; + } + } +out: + kfree(key_array); + return rc; +} + +static struct key *find_cs_keyring(void) +{ + key_ref_t cs_keyring_ref; + struct key *cs_keyring; + + cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true), + &key_type_keyring, CERT_STORE_KEYRING_NAME, + false); + if (!IS_ERR(cs_keyring_ref)) { + cs_keyring = key_ref_to_ptr(cs_keyring_ref); + key_ref_put(cs_keyring_ref); + goto found; + } + /* Search default locations: thread, process, session keyrings */ + cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL); + if (IS_ERR(cs_keyring)) + return NULL; + key_put(cs_keyring); +found: + return cs_keyring; +} + +static void cleanup_cs_keys(void) +{ + struct key *cs_keyring; + + cs_keyring = find_cs_keyring(); + if (!cs_keyring) + return; + + pr_dbf_msg("Found cert_store keyring. Purging..."); + /* + * Remove cert_store_key_type in case invalidation + * of old cert_store keys failed (= severe error). 
+ */ + if (invalidate_keyring_keys(cs_keyring)) + unregister_key_type(&key_type_cert_store_key); + + keyring_clear(cs_keyring); + key_invalidate(cs_keyring); + key_put(cs_keyring); + key_unlink(get_user_session_keyring(), cs_keyring); +} + +static struct key *create_cs_keyring(void) +{ + static struct key *cs_keyring; + + /* Cleanup previous cs_keyring and all associated keys if any. */ + cleanup_cs_keys(); + cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID, + GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP, + NULL, get_user_session_keyring()); + if (IS_ERR(cs_keyring)) { + pr_dbf_msg("Can't allocate cert_store keyring"); + return NULL; + } + + pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial); + + /* + * In case a previous clean-up ran into an + * error and unregistered the key type. + */ + register_key_type(&key_type_cert_store_key); + + return cs_keyring; +} + +/* + * Allocate memory and create the key description in the format + * [key name in EBCDIC]:[VCE index]:[CS token]. + * Return a pointer to the key description or NULL if memory + * allocation failed. Memory should be freed by the caller. + */ +static char *get_key_description(struct vcssb *vcssb, const struct vce *vce) +{ + size_t len, name_len; + u32 cs_token; + char *desc; + + cs_token = vcssb->cs_token; + /* Description string contains "%64s:%05u:%010u\0". */ + name_len = sizeof(vce->vce_hdr.vc_name); + len = name_len + 1 + 5 + 1 + 10 + 1; + desc = kmalloc(len, GFP_KERNEL); + if (!desc) + return NULL; + + memcpy(desc, vce->vce_hdr.vc_name, name_len); + snprintf(desc + name_len, len - name_len, ":%05u:%010u", + vce->vce_hdr.vc_index, cs_token); + + return desc; +} + +/* + * Create a key of type "cert_store_key" using the data from the VCE for the + * key payload and key description. Link the key to the "cert_store" keyring. + */ +static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce, + struct key *keyring) +{ + key_ref_t newkey; + char *desc; + int rc; + + desc = get_key_description(vcssb, vce); + if (!desc) + return -ENOMEM; + + newkey = key_create_or_update( + make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME, + desc, (u8 *)vce + vce->vce_hdr.vc_offset, + vce->vce_hdr.vc_length, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + + rc = PTR_ERR_OR_ZERO(newkey); + if (rc) { + pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc); + rc = -ENOKEY; + goto out; + } + + key_ref_put(newkey); +out: + kfree(desc); + return rc; +} + +/* Get Verification Certificate Storage Size block with DIAG320 subcode 1. */ +static int get_vcssb(struct vcssb *vcssb) +{ + int diag320_rc; + + memset(vcssb, 0, sizeof(*vcssb)); + vcssb->vcssb_length = VCSSB_LEN_BYTES; + diag320_rc = diag320(DIAG320_STORAGE, vcssb); + pr_dbf_vcssb(vcssb); + + if (diag320_rc != DIAG320_RC_OK) { + pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc); + return -EIO; + } + if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) { + pr_dbf_msg("No certificates available for current configuration"); + return -ENOKEY; + } + + return 0; +} + +static u32 get_4k_mult_vcb_size(struct vcssb *vcssb) +{ + return round_up(vcssb->max_single_vcb_length, PAGE_SIZE); +} + +/* Fill input fields of single-entry VCB that will be read by LPAR.
*/ +static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index) +{ + memset(vcb, 0, sizeof(*vcb)); + vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb); + vcb->vcb_hdr.cs_token = vcssb->cs_token; + + /* Request single entry. */ + vcb->vcb_hdr.first_vc_index = index; + vcb->vcb_hdr.last_vc_index = index; +} + +static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce) +{ + struct vce *extracted_vce; + + extracted_vce = (struct vce *)vcb->vcb_buf; + memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length); + pr_dbf_vce(vce); +} + +static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb) +{ + int rc, diag320_rc; + + fill_vcb_input(vcssb, vcb, index); + + diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb); + pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc); + pr_dbf_vcb(vcb); + + switch (diag320_rc) { + case DIAG320_RC_OK: + rc = 0; + if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) { + pr_dbf_msg("No certificate entry for index %u", index); + rc = -ENOKEY; + } else if (vcb->vcb_hdr.remaining_vc_count != 0) { + /* Retry on insufficient space. */ + pr_dbf_msg("Couldn't get all requested certificates"); + rc = -EAGAIN; + } + break; + case DIAG320_RC_CS_NOMATCH: + pr_dbf_msg("Certificate Store token mismatch"); + rc = -EAGAIN; + break; + default: + pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc); + rc = -EINVAL; + break; + } + + return rc; +} + +/* + * Allocate memory for a single-entry VCB, get the VCB via a DIAG320 subcode 2 + * call, extract the VCE and create a key from its certificate. + */ +static int create_key_from_sevcb(struct vcssb *vcssb, u16 index, + struct key *keyring) +{ + struct vcb *vcb; + struct vce *vce; + int rc; + + rc = -ENOMEM; + vcb = vmalloc(get_4k_mult_vcb_size(vcssb)); + vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr)); + if (!vcb || !vce) + goto out; + + rc = get_sevcb(vcssb, index, vcb); + if (rc) + goto out; + + extract_vce_from_sevcb(vcb, vce); + rc = check_certificate_valid(vce); + if (rc) + goto out; + + rc = create_key_from_vce(vcssb, vce, keyring); + if (rc) + goto out; + + pr_dbf_msg("Successfully created key from Certificate Entry %d", index); +out: + vfree(vce); + vfree(vcb); + return rc; +} + +/* + * Request a single-entry VCB for each VCE available for the partition. + * Create a key from it and link it to the cert_store keyring. If no keys + * could be created (i.e. VCEs were invalid) return -ENOKEY. + */ +static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring) +{ + int rc, index, count, added; + + count = 0; + added = 0; + /* Certificate Store entry indices start at 1 and have no gaps. */ + for (index = 1; index < vcssb->total_vc_index_count + 1; index++) { + pr_dbf_msg("Creating key from VCE %u", index); + rc = create_key_from_sevcb(vcssb, index, keyring); + count++; + + if (rc == -EAGAIN) + return rc; + + if (rc) + pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc); + else + added++; + } + + if (added == 0) { + pr_dbf_msg("Processed %d entries. No keys created", count); + return -ENOKEY; + } + + pr_info("Added %d of %d keys to cert_store keyring", added, count); + + /* + * Do not allow linking more keys to the certificate store keyring + * after all the VCEs have been processed. + */ + rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL); + if (rc) + pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc); + + return 0; +} + +/* + * Check which DIAG320 subcodes are installed.
+ * Return -ENOENT if subcodes 1 or 2 are not available. + */ +static int query_diag320_subcodes(void) +{ + unsigned long ism[ISM_LEN_DWORDS]; + int rc; + + rc = diag320(0, ism); + if (rc != DIAG320_RC_OK) { + pr_dbf_msg("DIAG320 subcode query returned %04x", rc); + return -ENOENT; + } + + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0"); + debug_event(cert_store_hexdump, 3, ism, sizeof(ism)); + + if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) { + pr_dbf_msg("Not all required DIAG320 subcodes are installed"); + return -ENOENT; + } + + return 0; +} + +/* + * Check if Certificate Store is supported by the firmware and DIAG320 subcodes + * 1 and 2 are installed. Create cert_store keyring and link all certificates + * available for the current partition to it as "cert_store_key" type + * keys. On refresh or error invalidate cert_store keyring and destroy + * all keys of "cert_store_key" type. + */ +static int fill_cs_keyring(void) +{ + struct key *cs_keyring; + struct vcssb *vcssb; + int rc; + + rc = -ENOMEM; + vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL); + if (!vcssb) + goto cleanup_keys; + + rc = -ENOENT; + if (!sclp.has_diag320) { + pr_dbf_msg("Certificate Store is not supported"); + goto cleanup_keys; + } + + rc = query_diag320_subcodes(); + if (rc) + goto cleanup_keys; + + rc = get_vcssb(vcssb); + if (rc) + goto cleanup_keys; + + rc = -ENOMEM; + cs_keyring = create_cs_keyring(); + if (!cs_keyring) + goto cleanup_keys; + + rc = add_certificates_to_keyring(vcssb, cs_keyring); + if (rc) + goto cleanup_cs_keyring; + + goto out; + +cleanup_cs_keyring: + key_put(cs_keyring); +cleanup_keys: + cleanup_cs_keys(); +out: + kfree(vcssb); + return rc; +} + +static DEFINE_MUTEX(cs_refresh_lock); +static int cs_status_val = -1; + +static ssize_t cs_status_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (cs_status_val == -1) + return sysfs_emit(buf, "uninitialized\n"); + else if (cs_status_val == 0) + return sysfs_emit(buf, "ok\n"); + + return sysfs_emit(buf, "failed (%d)\n", cs_status_val); +} + +static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status); + +static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc, retries; + + pr_dbf_msg("Refresh certificate store information requested"); + rc = mutex_lock_interruptible(&cs_refresh_lock); + if (rc) + return rc; + + for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) { + /* Request certificates from certificate store. */ + rc = fill_cs_keyring(); + if (rc) + pr_dbf_msg("Failed to refresh certificate store information (%d)", rc); + if (rc != -EAGAIN) + break; + } + cs_status_val = rc; + mutex_unlock(&cs_refresh_lock); + + return rc ?: count; +} + +static struct kobj_attribute refresh_attr = __ATTR_WO(refresh); + +static const struct attribute *cert_store_attrs[] __initconst = { + &cs_status_attr.attr, + &refresh_attr.attr, + NULL, +}; + +static struct kobject *cert_store_kobj; + +static int __init cert_store_init(void) +{ + int rc = -ENOMEM; + + cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64); + if (!cert_store_dbf) + goto cleanup_dbf; + + cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128); + if (!cert_store_hexdump) + goto cleanup_dbf; + + debug_register_view(cert_store_hexdump, &debug_hex_ascii_view); + debug_register_view(cert_store_dbf, &debug_sprintf_view); + + /* Create directory /sys/firmware/cert_store. 
*/ + cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj); + if (!cert_store_kobj) + goto cleanup_dbf; + + rc = sysfs_create_files(cert_store_kobj, cert_store_attrs); + if (rc) + goto cleanup_kobj; + + register_key_type(&key_type_cert_store_key); + + return rc; + +cleanup_kobj: + kobject_put(cert_store_kobj); +cleanup_dbf: + debug_unregister(cert_store_dbf); + debug_unregister(cert_store_hexdump); + + return rc; +} +device_initcall(cert_store_init); diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c index 444fb1f66944..a7c46e8310f0 100644 --- a/arch/s390/kernel/compat_audit.c +++ b/arch/s390/kernel/compat_audit.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #undef __s390x__ +#include <linux/audit_arch.h> #include <asm/unistd.h> #include "audit.h" @@ -32,14 +33,16 @@ int s390_classify_syscall(unsigned syscall) { switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 1; + return AUDITSC_COMPAT; } } diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 64509e7dbd3b..ef23739b277c 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -5,69 +5,59 @@ #include <linux/compat.h> #include <linux/socket.h> #include <linux/syscalls.h> +#include <asm/ptrace.h> -/* Macro that masks the high order bit of an 32 bit pointer and converts it*/ -/* to a 64 bit pointer */ -#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) -#define AA(__x) \ - ((unsigned long)(__x)) +/* + * Macro that masks the high order bit of a 32 bit pointer and + * converts it to a 64 bit pointer. 
+ */ +#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) +#define AA(__x) ((unsigned long)(__x)) /* Now 32bit compatibility types */ struct ipc_kludge_32 { - __u32 msgp; /* pointer */ - __s32 msgtyp; + __u32 msgp; /* pointer */ + __s32 msgtyp; }; /* asm/sigcontext.h */ -typedef union -{ - __u64 d; - __u32 f; +typedef union { + __u64 d; + __u32 f; } freg_t32; -typedef struct -{ +typedef struct { unsigned int fpc; unsigned int pad; - freg_t32 fprs[__NUM_FPRS]; + freg_t32 fprs[__NUM_FPRS]; } _s390_fp_regs32; -typedef struct -{ - __u32 mask; - __u32 addr; -} _psw_t32 __attribute__ ((aligned(8))); - -typedef struct -{ - _psw_t32 psw; +typedef struct { + psw_t32 psw; __u32 gprs[__NUM_GPRS]; __u32 acrs[__NUM_ACRS]; } _s390_regs_common32; -typedef struct -{ +typedef struct { _s390_regs_common32 regs; - _s390_fp_regs32 fpregs; + _s390_fp_regs32 fpregs; } _sigregs32; -typedef struct -{ - __u32 gprs_high[__NUM_GPRS]; - __u64 vxrs_low[__NUM_VXRS_LOW]; - __vector128 vxrs_high[__NUM_VXRS_HIGH]; - __u8 __reserved[128]; +typedef struct { + __u32 gprs_high[__NUM_GPRS]; + __u64 vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + __u8 __reserved[128]; } _sigregs_ext32; #define _SIGCONTEXT_NSIG32 64 #define _SIGCONTEXT_NSIG_BPW32 32 #define __SIGNAL_FRAMESIZE32 96 -#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2) +#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2) -struct sigcontext32 -{ +struct sigcontext32 { __u32 oldmask[_COMPAT_NSIG_WORDS]; - __u32 sregs; /* pointer */ + __u32 sregs; /* pointer */ }; /* asm/signal.h */ @@ -75,11 +65,11 @@ struct sigcontext32 /* asm/ucontext.h */ struct ucontext32 { __u32 uc_flags; - __u32 uc_link; /* pointer */ + __u32 uc_link; /* pointer */ compat_stack_t uc_stack; _sigregs32 uc_mcontext; compat_sigset_t uc_sigmask; - /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. 
*/ unsigned char __unused[128 - sizeof(compat_sigset_t)]; _sigregs_ext32 uc_mcontext_ext; }; @@ -88,25 +78,6 @@ struct stat64_emu31; struct mmap_arg_struct_emu31; struct fadvise64_64_args; -long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group); -long compat_sys_s390_setregid16(u16 rgid, u16 egid); -long compat_sys_s390_setgid16(u16 gid); -long compat_sys_s390_setreuid16(u16 ruid, u16 euid); -long compat_sys_s390_setuid16(u16 uid); -long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid); -long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid); -long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid); -long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid); -long compat_sys_s390_setfsuid16(u16 uid); -long compat_sys_s390_setfsgid16(u16 gid); -long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_getuid16(void); -long compat_sys_s390_geteuid16(void); -long compat_sys_s390_getgid16(void); -long compat_sys_s390_getegid16(void); long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); @@ -118,8 +89,8 @@ long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbu long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count); -long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count); +long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count); +long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count); long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 38d4bdbc34b9..f8fc6c25d051 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -28,6 +28,8 @@ #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> +#include <asm/fpu/api.h> #include "compat_linux.h" #include "compat_ptrace.h" #include "entry.h" @@ -88,7 +90,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) _sigregs32 user_sregs; int i; - /* Alwys make any pending restarted system call return -EINTR */ + /* Always make any pending restarted system call return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) @@ -97,10 +99,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) return -EINVAL; - /* Test the 
floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | @@ -136,9 +134,9 @@ static int save_sigregs_ext32(struct pt_regs *regs, return -EFAULT; /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.fpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, @@ -164,7 +162,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs, *(__u32 *)®s->gprs[i] = gprs_high[i]; /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, @@ -172,7 +170,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.fpu.vxrs[i].low = vxrs[i]; } return 0; } @@ -264,7 +262,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, * the machine supports it */ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) frame_size -= sizeof(frame->sregs_ext.vxrs_low) + sizeof(frame->sregs_ext.vxrs_high); frame = get_sigframe(&ksig->ka, regs, frame_size); @@ -303,11 +301,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - /* Signal frames without vectors registers are short ! 
*/ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, sigreturn); } /* Set up registers for signal handler */ @@ -351,11 +345,12 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, * the machine supports it */ uc_flags = UC_GPRS_HIGH; - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { uc_flags |= UC_VXRS; - } else + } else { frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + sizeof(frame->uc.uc_mcontext_ext.vxrs_high); + } frame = get_sigframe(&ksig->ka, regs, frame_size); if (frame == (void __user *) -1UL) return -EFAULT; @@ -370,10 +365,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, rt_sigreturn); } /* Create siginfo on the signal stack */ diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index af013b4244d3..b210a29d3ee9 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -16,41 +16,45 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/io.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> -#include <asm/io.h> static DEFINE_SPINLOCK(cpcmd_lock); static char cpcmd_buf[241]; static int diag8_noresponse(int cmdlen) { - register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; - register unsigned long reg3 asm ("3") = cmdlen; - asm volatile( - " diag %1,%0,0x8\n" - : "+d" (reg3) : "d" (reg2) : "cc"); - return reg3; + " diag %[rx],%[ry],0x8\n" + : [ry] "+&d" (cmdlen) + : [rx] "d" (__pa(cpcmd_buf)) + : "cc"); + return cmdlen; } static int diag8_response(int cmdlen, char *response, int *rlen) { - register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; - register unsigned long reg3 asm ("3") = (addr_t) response; - register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L; - register unsigned long reg5 asm ("5") = *rlen; + union register_pair rx, ry; + int cc; + rx.even = __pa(cpcmd_buf); + rx.odd = __pa(response); + ry.even = cmdlen | 0x40000000L; + ry.odd = *rlen; asm volatile( - " diag %2,%0,0x8\n" - " brc 8,1f\n" - " agr %1,%4\n" - "1:\n" - : "+d" (reg4), "+d" (reg5) - : "d" (reg2), "d" (reg3), "d" (*rlen) : "cc"); - *rlen = reg5; - return reg4; + " diag %[rx],%[ry],0x8\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [ry] "+&d" (ry.pair) + : [rx] "d" (rx.pair) + : "cc"); + if (cc) + *rlen += ry.odd; + else + *rlen = ry.odd; + return ry.even; } /* diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c new file mode 100644 index 000000000000..1b2ae42a0c15 --- /dev/null +++ b/arch/s390/kernel/cpufeature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 
2022 + */ + +#include <linux/cpufeature.h> +#include <linux/bug.h> +#include <asm/elf.h> + +enum { + TYPE_HWCAP, + TYPE_FACILITY, +}; + +struct s390_cpu_feature { + unsigned int type : 4; + unsigned int num : 28; +}; + +static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = { + [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA}, + [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS}, + [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158}, +}; + +/* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + struct s390_cpu_feature *feature; + + if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES)) + return 0; + feature = &s390_cpu_features[num]; + switch (feature->type) { + case TYPE_HWCAP: + return !!(elf_hwcap & BIT(feature->num)); + case TYPE_FACILITY: + return test_facility(feature->num); + default: + WARN_ON_ONCE(1); + return 0; + } +} +EXPORT_SYMBOL(cpu_have_feature); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index f96a5857bbfd..5c46c2659305 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -15,11 +15,14 @@ #include <linux/slab.h> #include <linux/memblock.h> #include <linux/elf.h> +#include <linux/uio.h> #include <asm/asm-offsets.h> #include <asm/os_info.h> #include <asm/elf.h> #include <asm/ipl.h> #include <asm/sclp.h> +#include <asm/maccess.h> +#include <asm/fpu/api.h> #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -44,7 +47,7 @@ struct save_area { u64 fprs[16]; u32 fpc; u32 prefix; - u64 todpreg; + u32 todpreg; u64 timer; u64 todcmp; u64 vxrs_low[16]; @@ -60,9 +63,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = (void *) memblock_phys_alloc(sizeof(*sa), 8); + sa = memblock_alloc(sizeof(*sa), 8); if (!sa) - panic("Failed to allocate save area\n"); + return NULL; if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); @@ -108,126 +111,65 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) /* Copy lower halves of vector registers 0-15 */ for (i = 0; i < 16; i++) - memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8); + sa->vxrs_low[i] = vxrs[i].low; /* Copy vector registers 16-31 */ memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } -/* - * Return physical address for virtual address - */ -static inline void *load_real_addr(void *addr) -{ - unsigned long real_addr; - - asm volatile( - " lra %0,0(%1)\n" - " jz 0f\n" - " la %0,0\n" - "0:" - : "=a" (real_addr) : "a" (addr) : "cc"); - return (void *)real_addr; -} - -/* - * Copy memory of the old, dumped system to a kernel space virtual address - */ -int copy_oldmem_kernel(void *dst, void *src, size_t count) +static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long from, len; - void *ra; - int rc; + size_t len, copied, res = 0; while (count) { - from = __pa(src); - if (!OLDMEM_BASE && from < sclp.hsa_size) { - /* Copy from zfcpdump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_kernel(dst, from, len); - if (rc) - return rc; + if (!oldmem_data.start && src < sclp.hsa_size) { + /* Copy from zfcp/nvme dump HSA area */ + len = min(count, sclp.hsa_size - src); + copied = memcpy_hsa_iter(iter, src, len); } else { /* Check for swapped kdump oldmem areas */ - if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { - from -= OLDMEM_BASE; - len = min(count, 
OLDMEM_SIZE - from); - } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { - len = min(count, OLDMEM_SIZE - from); - from += OLDMEM_BASE; + if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) { + src -= oldmem_data.start; + len = min(count, oldmem_data.size - src); + } else if (oldmem_data.start && src < oldmem_data.size) { + len = min(count, oldmem_data.size - src); + src += oldmem_data.start; } else { len = count; } - if (is_vmalloc_or_module_addr(dst)) { - ra = load_real_addr(dst); - len = min(PAGE_SIZE - offset_in_page(ra), len); - } else { - ra = dst; - } - if (memcpy_real(ra, (void *) from, len)) - return -EFAULT; + copied = memcpy_real_iter(iter, src, len); } - dst += len; - src += len; - count -= len; + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - return 0; + return res; } -/* - * Copy memory of the old, dumped system to a user space virtual address - */ -static int copy_oldmem_user(void __user *dst, void *src, size_t count) +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) { - unsigned long from, len; - int rc; + struct iov_iter iter; + struct kvec kvec; - while (count) { - from = __pa(src); - if (!OLDMEM_BASE && from < sclp.hsa_size) { - /* Copy from zfcpdump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_user(dst, from, len); - if (rc) - return rc; - } else { - /* Check for swapped kdump oldmem areas */ - if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { - from -= OLDMEM_BASE; - len = min(count, OLDMEM_SIZE - from); - } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { - len = min(count, OLDMEM_SIZE - from); - from += OLDMEM_BASE; - } else { - len = count; - } - rc = copy_to_user_real(dst, (void *) from, count); - if (rc) - return rc; - } - dst += len; - src += len; - count -= len; - } + kvec.iov_base = dst; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (copy_oldmem_iter(&iter, src, count) < count) + return -EFAULT; return 0; } /* * Copy one page from "oldmem" */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - void *src; - int rc; + unsigned long src; - if (!csize) - return 0; - src = (void *) (pfn << PAGE_SHIFT) + offset; - if (userbuf) - rc = copy_oldmem_user((void __force __user *) buf, src, csize); - else - rc = copy_oldmem_kernel((void *) buf, src, csize); - return rc; + src = pfn_to_phys(pfn) + offset; + return copy_oldmem_iter(iter, src, csize); } /* @@ -243,10 +185,10 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, unsigned long size_old; int rc; - if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) { - size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT)); + if (pfn < oldmem_data.size >> PAGE_SHIFT) { + size_old = min(size, oldmem_data.size - (pfn << PAGE_SHIFT)); rc = remap_pfn_range(vma, from, - pfn + (OLDMEM_BASE >> PAGE_SHIFT), + pfn + (oldmem_data.start >> PAGE_SHIFT), size_old, prot); if (rc || size == size_old) return rc; @@ -258,7 +200,7 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, } /* - * Remap "oldmem" for zfcpdump + * Remap "oldmem" for zfcp/nvme dump * * We only map available memory above HSA size. Memory below HSA size * is read on demand using the copy_oldmem_page() function. 
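The chunk loop in copy_oldmem_iter() above decides, for every piece of the old kernel's memory, whether to read from the HSA (zfcp/nvme dump) or from real memory, undoing the kdump region swap on the way. The following is a hedged standalone sketch of just that routing step; oldmem_base, oldmem_size and hsa_size stand in for oldmem_data.start, oldmem_data.size and sclp.hsa_size, and the helper is an illustration, not the kernel implementation:

#include <stddef.h>

enum oldmem_backend { COPY_FROM_HSA, COPY_FROM_REAL };

struct oldmem_route {
	enum oldmem_backend backend; /* which copy routine serves this chunk */
	unsigned long addr;          /* translated source address */
	size_t len;                  /* largest contiguous copy possible */
};

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

static struct oldmem_route route_oldmem(unsigned long src, size_t count,
					unsigned long oldmem_base,
					unsigned long oldmem_size,
					unsigned long hsa_size)
{
	struct oldmem_route r = { COPY_FROM_REAL, src, count };

	if (!oldmem_base && src < hsa_size) {
		/* zfcp/nvme dump: low storage was saved into the HSA */
		r.backend = COPY_FROM_HSA;
		r.len = min_sz(count, hsa_size - src);
	} else if (oldmem_base && src - oldmem_base < oldmem_size) {
		/* kdump swap: oldmem address inside the crashkernel area */
		r.addr = src - oldmem_base;
		r.len = min_sz(count, oldmem_size - r.addr);
	} else if (oldmem_base && src < oldmem_size) {
		/* kdump swap: oldmem address inside the swapped low area */
		r.addr = src + oldmem_base;
		r.len = min_sz(count, oldmem_size - src);
	}
	return r;
}

The caller then copies r.len bytes from r.addr via the selected backend and advances src, exactly as the loop in copy_oldmem_iter() does with memcpy_hsa_iter() and memcpy_real_iter().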
@@ -283,12 +225,12 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, } /* - * Remap "oldmem" for kdump or zfcpdump + * Remap "oldmem" for kdump or zfcp/nvme dump */ int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - if (OLDMEM_BASE) + if (oldmem_data.start) return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); else return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, @@ -365,7 +307,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs)); memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs)); - nt_prstatus.pr_pid = cpu; + nt_prstatus.common.pr_pid = cpu; /* Prepare fpregset (floating point) note */ memset(&nt_fpregset, 0, sizeof(nt_fpregset)); memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); @@ -378,7 +320,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { ptr = nt_init(ptr, NT_S390_VXRS_HIGH, &sa->vxrs_high, sizeof(sa->vxrs_high)); ptr = nt_init(ptr, NT_S390_VXRS_LOW, @@ -402,7 +344,7 @@ static size_t get_cpu_elf_notes_size(void) size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); } @@ -429,10 +371,10 @@ static void *nt_prpsinfo(void *ptr) static void *get_vmcoreinfo_old(unsigned long *size) { char nt_name[11], *vmcoreinfo; + unsigned long addr; Elf64_Nhdr note; - void *addr; - if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr))) return NULL; memset(nt_name, 0, sizeof(nt_name)); if (copy_oldmem_kernel(¬e, addr, sizeof(note))) @@ -549,8 +491,7 @@ static int get_mem_chunk_cnt(void) int cnt = 0; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, NULL, NULL, NULL) + for_each_physmem_range(idx, &oldmem_type, NULL, NULL) cnt++; return cnt; } @@ -558,17 +499,16 @@ static int get_mem_chunk_cnt(void) /* * Initialize ELF loads (new kernel) */ -static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) +static void loads_init(Elf64_Phdr *phdr) { phys_addr_t start, end; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_physmem_range(idx, &oldmem_type, &start, &end) { phdr->p_filesz = end - start; phdr->p_type = PT_LOAD; phdr->p_offset = start; - phdr->p_vaddr = start; + phdr->p_vaddr = (unsigned long)__va(start); phdr->p_paddr = start; phdr->p_memsz = end - start; phdr->p_flags = PF_R | PF_W | PF_X; @@ -629,23 +569,23 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads; + size_t alloc_size; int mem_chunk_cnt; void *ptr, *hdr; - u32 alloc_size; u64 hdr_off; - /* If we are not in kdump or zfcpdump mode return */ - if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP) + 
/* If we are not in kdump or zfcp/nvme dump mode return */ + if (!oldmem_data.start && !is_ipl_type_dump()) return 0; - /* If we cannot get HSA size for zfcpdump return error */ - if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size) + /* If we cannot get HSA size for zfcp/nvme dump return error */ + if (is_ipl_type_dump() && !sclp.hsa_size) return -ENODEV; /* For kdump, exclude previous crashkernel memory */ - if (OLDMEM_BASE) { - oldmem_region.base = OLDMEM_BASE; - oldmem_region.size = OLDMEM_SIZE; - oldmem_type.total_size = OLDMEM_SIZE; + if (oldmem_data.start) { + oldmem_region.base = oldmem_data.start; + oldmem_region.size = oldmem_data.size; + oldmem_type.total_size = oldmem_data.size; } mem_chunk_cnt = get_mem_chunk_cnt(); @@ -673,7 +613,7 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); /* Init loads */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, hdr_off); + loads_init(phdr_loads); *addr = (unsigned long long) hdr; *size = (unsigned long long) hdr_off; BUG_ON(elfcorehdr_size > alloc_size); diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c new file mode 100644 index 000000000000..8cc26cf2c64a --- /dev/null +++ b/arch/s390/kernel/ctlreg.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/irqflags.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cache.h> +#include <asm/abs_lowcore.h> +#include <asm/ctlreg.h> + +/* + * ctl_lock guards access to global control register contents which + * are kept in the control register save area within absolute lowcore + * at physical address zero. + */ +static DEFINE_SPINLOCK(system_ctl_lock); + +void system_ctlreg_lock(void) + __acquires(&system_ctl_lock) +{ + spin_lock(&system_ctl_lock); +} + +void system_ctlreg_unlock(void) + __releases(&system_ctl_lock) +{ + spin_unlock(&system_ctl_lock); +} + +static bool system_ctlreg_area_init __ro_after_init; + +void __init system_ctlreg_init_save_area(struct lowcore *lc) +{ + struct lowcore *abs_lc; + + abs_lc = get_abs_lowcore(); + __local_ctl_store(0, 15, lc->cregs_save_area); + __local_ctl_store(0, 15, abs_lc->cregs_save_area); + put_abs_lowcore(abs_lc); + system_ctlreg_area_init = true; +} + +struct ctlreg_parms { + unsigned long andval; + unsigned long orval; + unsigned long val; + int request; + int cr; +}; + +static void ctlreg_callback(void *info) +{ + struct ctlreg_parms *pp = info; + struct ctlreg regs[16]; + + __local_ctl_store(0, 15, regs); + if (pp->request == CTLREG_LOAD) { + regs[pp->cr].val = pp->val; + } else { + regs[pp->cr].val &= pp->andval; + regs[pp->cr].val |= pp->orval; + } + __local_ctl_load(0, 15, regs); +} + +static void system_ctlreg_update(void *info) +{ + unsigned long flags; + + if (system_state == SYSTEM_BOOTING) { + /* + * For very early calls do not call on_each_cpu() + * since not everything might be setup. 
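system_ctlreg_update() above picks between a local, IRQs-off update during early boot and an on_each_cpu() broadcast once SMP is up; the per-CPU callback itself reduces to a small read-modify-write. A standalone sketch of that rule, with the request constants redeclared here for illustration (the kernel defines them in asm/ctlreg.h):

enum { CTLREG_SET_BIT, CTLREG_CLEAR_BIT, CTLREG_LOAD };

/*
 * Apply one update request to a control register value, as
 * ctlreg_callback() does: either a full load, or a masked AND
 * followed by OR. system_ctlreg_modify() encodes set/clear into
 * andval/orval (set: orval = bit, andval = -1UL; clear: orval = 0,
 * andval = ~bit), so one callback serves all three requests.
 */
static unsigned long ctlreg_apply(unsigned long val, int request,
				  unsigned long andval, unsigned long orval,
				  unsigned long loadval)
{
	if (request == CTLREG_LOAD)
		return loadval;
	return (val & andval) | orval;
}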
+ */ + local_irq_save(flags); + ctlreg_callback(info); + local_irq_restore(flags); + } else { + on_each_cpu(ctlreg_callback, info, 1); + } +} + +void system_ctlreg_modify(unsigned int cr, unsigned long data, int request) +{ + struct ctlreg_parms pp = { .cr = cr, .request = request, }; + struct lowcore *abs_lc; + + switch (request) { + case CTLREG_SET_BIT: + pp.orval = 1UL << data; + pp.andval = -1UL; + break; + case CTLREG_CLEAR_BIT: + pp.orval = 0; + pp.andval = ~(1UL << data); + break; + case CTLREG_LOAD: + pp.val = data; + break; + } + if (system_ctlreg_area_init) { + system_ctlreg_lock(); + abs_lc = get_abs_lowcore(); + if (request == CTLREG_LOAD) { + abs_lc->cregs_save_area[cr].val = pp.val; + } else { + abs_lc->cregs_save_area[cr].val &= pp.andval; + abs_lc->cregs_save_area[cr].val |= pp.orval; + } + put_abs_lowcore(abs_lc); + system_ctlreg_update(&pp); + system_ctlreg_unlock(); + } else { + system_ctlreg_update(&pp); + } +} +EXPORT_SYMBOL(system_ctlreg_modify); diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..85328a0ef3b6 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -2,7 +2,7 @@ /* * S/390 debug facility * - * Copyright IBM Corp. 1999, 2012 + * Copyright IBM Corp. 1999, 2020 * * Author(s): Michael Holzheu (holzheu@de.ibm.com), * Holger Smolinski (Holger.Smolinski@de.ibm.com) @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/minmax.h> #include <linux/debugfs.h> #include <asm/debug.h> @@ -59,7 +60,7 @@ typedef struct { * except of floats, and long long (32 bit) * */ - long args[0]; + long args[]; } debug_sprintf_entry_t; /* internal function prototyes */ @@ -90,27 +91,13 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, size_t user_buf_size, loff_t *offset); static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, char *out_buf, const char *in_buf); -static int debug_raw_format_fn(debug_info_t *id, - struct debug_view *view, char *out_buf, - const char *in_buf); -static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf); - static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event); + char *out_buf, const char *inbuf); +static void debug_areas_swap(debug_info_t *a, debug_info_t *b); +static void debug_events_append(debug_info_t *dest, debug_info_t *src); /* globals */ -struct debug_view debug_raw_view = { - "raw", - NULL, - &debug_raw_header_fn, - &debug_raw_format_fn, - NULL, - NULL -}; -EXPORT_SYMBOL(debug_raw_view); - struct debug_view debug_hex_ascii_view = { "hex_ascii", NULL, @@ -152,7 +139,7 @@ struct debug_view debug_sprintf_view = { "sprintf", NULL, &debug_dflt_header_fn, - (debug_format_proc_t *)&debug_sprintf_format_fn, + &debug_sprintf_format_fn, NULL, NULL }; @@ -198,9 +185,10 @@ static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas) if (!areas) goto fail_malloc_areas; for (i = 0; i < nr_areas; i++) { + /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */ areas[i] = kmalloc_array(pages_per_area, sizeof(debug_entry_t *), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!areas[i]) goto fail_malloc_areas2; for (j = 0; j < pages_per_area; j++) { @@ -262,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - 
strlcpy(rc->name, name, sizeof(rc->name)); + strscpy(rc->name, name, sizeof(rc->name)); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); @@ -326,24 +314,6 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area, goto out; rc->mode = mode & ~S_IFMT; - - /* create root directory */ - rc->debugfs_root_entry = debugfs_create_dir(rc->name, - debug_debugfs_root_entry); - - /* append new element to linked list */ - if (!debug_area_first) { - /* first element in list */ - debug_area_first = rc; - rc->prev = NULL; - } else { - /* append element to end of list */ - debug_area_last->next = rc; - rc->prev = debug_area_last; - } - debug_area_last = rc; - rc->next = NULL; - refcount_set(&rc->ref_count, 1); out: return rc; @@ -403,27 +373,10 @@ static void debug_info_get(debug_info_t *db_info) */ static void debug_info_put(debug_info_t *db_info) { - int i; - if (!db_info) return; - if (refcount_dec_and_test(&db_info->ref_count)) { - for (i = 0; i < DEBUG_MAX_VIEWS; i++) { - if (!db_info->views[i]) - continue; - debugfs_remove(db_info->debugfs_entries[i]); - } - debugfs_remove(db_info->debugfs_root_entry); - if (db_info == debug_area_first) - debug_area_first = db_info->next; - if (db_info == debug_area_last) - debug_area_last = db_info->prev; - if (db_info->prev) - db_info->prev->next = db_info->next; - if (db_info->next) - db_info->next->prev = db_info->prev; + if (refcount_dec_and_test(&db_info->ref_count)) debug_info_free(db_info); - } } /* @@ -448,7 +401,7 @@ static int debug_format_entry(file_private_info_t *p_info) act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area] [p_info->act_page] + p_info->act_entry); - if (act_entry->id.stck == 0LL) + if (act_entry->clock == 0LL) goto out; /* empty entry */ if (view->header_proc) len += view->header_proc(id_snap, view, p_info->act_area, @@ -647,6 +600,31 @@ static int debug_close(struct inode *inode, struct file *file) return 0; /* success */ } +/* Create debugfs entries and add to internal list. */ +static void _debug_register(debug_info_t *id) +{ + /* create root directory */ + id->debugfs_root_entry = debugfs_create_dir(id->name, + debug_debugfs_root_entry); + + /* append new element to linked list */ + if (!debug_area_first) { + /* first element in list */ + debug_area_first = id; + id->prev = NULL; + } else { + /* append element to end of list */ + debug_area_last->next = id; + id->prev = debug_area_last; + } + debug_area_last = id; + id->next = NULL; + + debug_register_view(id, &debug_level_view); + debug_register_view(id, &debug_flush_view); + debug_register_view(id, &debug_pages_view); +} + /** * debug_register_mode() - creates and initializes debug area. 
* @@ -676,19 +654,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area, if ((uid != 0) || (gid != 0)) pr_warn("Root becomes the owner of all s390dbf files in sysfs\n"); BUG_ON(!initialized); - mutex_lock(&debug_mutex); /* create new debug_info */ rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode); - if (!rc) - goto out; - debug_register_view(rc, &debug_level_view); - debug_register_view(rc, &debug_flush_view); - debug_register_view(rc, &debug_pages_view); -out: - if (!rc) + if (rc) { + mutex_lock(&debug_mutex); + _debug_register(rc); + mutex_unlock(&debug_mutex); + } else { pr_err("Registering debug feature %s failed\n", name); - mutex_unlock(&debug_mutex); + } return rc; } EXPORT_SYMBOL(debug_register_mode); @@ -718,6 +693,82 @@ debug_info_t *debug_register(const char *name, int pages_per_area, EXPORT_SYMBOL(debug_register); /** + * debug_register_static() - registers a static debug area + * + * @id: Handle for static debug area + * @pages_per_area: Number of pages per area + * @nr_areas: Number of debug areas + * + * Register debug_info_t defined using DEFINE_STATIC_DEBUG_INFO. + * + * Note: This function is called automatically via an initcall generated by + * DEFINE_STATIC_DEBUG_INFO. + */ +void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas) +{ + unsigned long flags; + debug_info_t *copy; + + if (!initialized) { + pr_err("Tried to register debug feature %s too early\n", + id->name); + return; + } + + copy = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!copy) { + pr_err("Registering debug feature %s failed\n", id->name); + + /* Clear pointers to prevent tracing into released initdata. */ + spin_lock_irqsave(&id->lock, flags); + id->areas = NULL; + id->active_pages = NULL; + id->active_entries = NULL; + spin_unlock_irqrestore(&id->lock, flags); + + return; + } + + /* Replace static trace area with dynamic copy. */ + spin_lock_irqsave(&id->lock, flags); + debug_events_append(copy, id); + debug_areas_swap(id, copy); + spin_unlock_irqrestore(&id->lock, flags); + + /* Clear pointers to initdata and discard copy. */ + copy->areas = NULL; + copy->active_pages = NULL; + copy->active_entries = NULL; + debug_info_free(copy); + + mutex_lock(&debug_mutex); + _debug_register(id); + mutex_unlock(&debug_mutex); +} + +/* Remove debugfs entries and remove from internal list. */ +static void _debug_unregister(debug_info_t *id) +{ + int i; + + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!id->views[i]) + continue; + debugfs_remove(id->debugfs_entries[i]); + } + debugfs_remove(id->debugfs_root_entry); + if (id == debug_area_first) + debug_area_first = id->next; + if (id == debug_area_last) + debug_area_last = id->prev; + if (id->prev) + id->prev->next = id->next; + if (id->next) + id->next->prev = id->prev; +} + +/** * debug_unregister() - give back debug area. 
* * @id: handle for debug log @@ -730,8 +781,10 @@ void debug_unregister(debug_info_t *id) if (!id) return; mutex_lock(&debug_mutex); - debug_info_put(id); + _debug_unregister(id); mutex_unlock(&debug_mutex); + + debug_info_put(id); } EXPORT_SYMBOL(debug_unregister); @@ -741,35 +794,28 @@ EXPORT_SYMBOL(debug_unregister); */ static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) { - debug_entry_t ***new_areas; + debug_info_t *new_id; unsigned long flags; - int rc = 0; if (!id || (nr_areas <= 0) || (pages_per_area < 0)) return -EINVAL; - if (pages_per_area > 0) { - new_areas = debug_areas_alloc(pages_per_area, nr_areas); - if (!new_areas) { - pr_info("Allocating memory for %i pages failed\n", - pages_per_area); - rc = -ENOMEM; - goto out; - } - } else { - new_areas = NULL; + + new_id = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!new_id) { + pr_info("Allocating memory for %i pages failed\n", + pages_per_area); + return -ENOMEM; } + spin_lock_irqsave(&id->lock, flags); - debug_areas_free(id); - id->areas = new_areas; - id->nr_areas = nr_areas; - id->pages_per_area = pages_per_area; - id->active_area = 0; - memset(id->active_entries, 0, sizeof(int)*id->nr_areas); - memset(id->active_pages, 0, sizeof(int)*id->nr_areas); + debug_events_append(new_id, id); + debug_areas_swap(new_id, id); + debug_info_free(new_id); spin_unlock_irqrestore(&id->lock, flags); pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area); -out: - return rc; + + return 0; } /** @@ -787,16 +833,17 @@ void debug_set_level(debug_info_t *id, int new_level) if (!id) return; - spin_lock_irqsave(&id->lock, flags); + if (new_level == DEBUG_OFF_LEVEL) { - id->level = DEBUG_OFF_LEVEL; pr_info("%s: switched off\n", id->name); } else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) { pr_info("%s: level %i is out of range (%i - %i)\n", id->name, new_level, 0, DEBUG_MAX_LEVEL); - } else { - id->level = new_level; + return; } + + spin_lock_irqsave(&id->lock, flags); + id->level = new_level; spin_unlock_irqrestore(&id->lock, flags); } EXPORT_SYMBOL(debug_set_level); @@ -836,6 +883,42 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id) id->active_entries[id->active_area]); } +/* Swap debug areas of a and b. */ +static void debug_areas_swap(debug_info_t *a, debug_info_t *b) +{ + swap(a->nr_areas, b->nr_areas); + swap(a->pages_per_area, b->pages_per_area); + swap(a->areas, b->areas); + swap(a->active_area, b->active_area); + swap(a->active_pages, b->active_pages); + swap(a->active_entries, b->active_entries); +} + +/* Append all debug events in active area from source to destination log. */ +static void debug_events_append(debug_info_t *dest, debug_info_t *src) +{ + debug_entry_t *from, *to, *last; + + if (!src->areas || !dest->areas) + return; + + /* Loop over all entries in src, starting with oldest. */ + from = get_active_entry(src); + last = from; + do { + if (from->clock != 0LL) { + to = get_active_entry(dest); + memset(to, 0, dest->entry_size); + memcpy(to, from, min(src->entry_size, + dest->entry_size)); + proceed_active_entry(dest); + } + + proceed_active_entry(src); + from = get_active_entry(src); + } while (from != last); +} + /* * debug_finish_entry: * - set timestamp, caller address, cpu number etc. 
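debug_events_append() above walks the source ring starting at the current write slot (the oldest entry) and copies every used entry until the walk wraps back around. A self-contained sketch of the same walk over a simplified single-area ring of fixed-size entries; struct ring and its helpers are illustrative stand-ins for debug_info_t with get_active_entry()/proceed_active_entry(), and the multi-area/multi-page layout is elided:

#include <stddef.h>
#include <string.h>

struct ring {
	unsigned char *buf;	/* nr * entry_size bytes */
	int nr, pos;		/* slot count and current write slot */
	size_t entry_size;
};

static unsigned char *ring_entry(struct ring *r)
{
	return r->buf + (size_t)r->pos * r->entry_size;
}

static void ring_next(struct ring *r)
{
	r->pos = (r->pos + 1) % r->nr;
}

static void ring_append(struct ring *dst, struct ring *src)
{
	int last = src->pos;	/* oldest slot: stop when we return here */

	do {
		unsigned char *from = ring_entry(src);

		if (from[0]) {	/* used slot; the kernel tests entry->clock != 0 */
			size_t n = src->entry_size < dst->entry_size ?
				   src->entry_size : dst->entry_size;

			/* zero first so a shorter source leaves no stale tail */
			memset(ring_entry(dst), 0, dst->entry_size);
			memcpy(ring_entry(dst), from, n);
			ring_next(dst);
		}
		ring_next(src);
	} while (src->pos != last);
}

As in the kernel code, copying min(src, dst) entry sizes lets the destination ring have a different buffer size than the source.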
@@ -844,12 +927,17 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id) static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active, int level, int exception) { - active->id.stck = get_tod_clock_fast() - - *(unsigned long long *) &tod_clock_base[1]; - active->id.fields.cpuid = smp_processor_id(); + unsigned long timestamp; + union tod_clock clk; + + store_tod_clock_ext(&clk); + timestamp = clk.us; + timestamp -= TOD_UNIX_EPOCH >> 12; + active->clock = timestamp; + active->cpu = smp_processor_id(); active->caller = __builtin_return_address(0); - active->id.fields.exception = exception; - active->id.fields.level = level; + active->exception = exception; + active->level = level; proceed_active_entry(id); if (exception) proceed_active_area(id); @@ -867,7 +955,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); @@ -890,17 +978,6 @@ static struct ctl_table s390dbf_table[] = { .mode = S_IRUGO | S_IWUSR, .proc_handler = s390dbf_procactive, }, - { } -}; - -static struct ctl_table s390dbf_dir_table[] = { - { - .procname = "s390dbf", - .maxlen = 0, - .mode = S_IRUGO | S_IXUGO, - .child = s390dbf_table, - }, - { } }; static struct ctl_table_header *s390dbf_sysctl_header; @@ -1121,16 +1198,17 @@ int debug_register_view(debug_info_t *id, struct debug_view *view) break; } if (i == DEBUG_MAX_VIEWS) { - pr_err("Registering view %s/%s would exceed the maximum " - "number of views %i\n", id->name, view->name, i); rc = -1; } else { id->views[i] = view; id->debugfs_entries[i] = pde; } spin_unlock_irqrestore(&id->lock, flags); - if (rc) + if (rc) { + pr_err("Registering view %s/%s would exceed the maximum " + "number of views %i\n", id->name, view->name, i); debugfs_remove(pde); + } out: return rc; } @@ -1385,32 +1463,6 @@ out: } /* - * prints debug header in raw format - */ -static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf) -{ - int rc; - - rc = sizeof(debug_entry_t); - memcpy(out_buf, entry, sizeof(debug_entry_t)); - return rc; -} - -/* - * prints debug data in raw format - */ -static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf) -{ - int rc; - - rc = id->buf_size; - memcpy(out_buf, in_buf, id->buf_size); - return rc; -} - -/* * prints debug data in hex/ascii format */ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, @@ -1439,25 +1491,24 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, int area, debug_entry_t *entry, char *out_buf) { - unsigned long base, sec, usec; + unsigned long sec, usec; unsigned long caller; unsigned int level; char *except_str; int rc = 0; - level = entry->id.fields.level; - base = (*(unsigned long *) &tod_clock_base[0]) >> 4; - sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12); + level = entry->level; + sec = entry->clock; usec = do_div(sec, USEC_PER_SEC); - if (entry->id.fields.exception) + if (entry->exception) except_str = "*"; else except_str = "-"; caller = (unsigned long) entry->caller; - rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %pK ", + rc += sprintf(out_buf, "%02i %011ld:%06lu %1u 
%1s %04u %px ", area, sec, usec, level, except_str, - entry->id.fields.cpuid, (void *)caller); + entry->cpu, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); @@ -1470,8 +1521,9 @@ EXPORT_SYMBOL(debug_dflt_header_fn); #define DEBUG_SPRINTF_MAX_ARGS 10 static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event) + char *out_buf, const char *inbuf) { + debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf; int num_longs, num_used_args = 0, i, rc = 0; int index[DEBUG_SPRINTF_MAX_ARGS]; @@ -1511,7 +1563,7 @@ out: */ static int __init debug_init(void) { - s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table); + s390dbf_sysctl_header = register_sysctl("s390dbf", s390dbf_table); mutex_lock(&debug_mutex); debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL); initialized = 1; diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index e9dac9a24d3f..92fdc35f028c 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -11,9 +11,12 @@ #include <linux/cpu.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/vmalloc.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/trace/diag.h> #include <asm/sections.h> +#include "entry.h" struct diag_stat { unsigned int counter[NR_DIAG_STAT]; @@ -33,6 +36,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" }, [DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" }, [DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" }, + [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" }, [DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" }, [DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" }, [DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" }, @@ -47,11 +51,24 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" }, [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, [DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" }, + [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" }, [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; -struct diag_ops __bootdata_preserved(diag_dma_ops); -struct diag210 *__bootdata_preserved(__diag210_tmp_dma); +struct diag_ops __amode31_ref diag_amode31_ops = { + .diag210 = _diag210_amode31, + .diag26c = _diag26c_amode31, + .diag14 = _diag14_amode31, + .diag0c = _diag0c_amode31, + .diag8c = _diag8c_amode31, + .diag308_reset = _diag308_reset_amode31 +}; + +static struct diag210 _diag210_tmp_amode31 __section(".amode31.data"); +struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31; + +static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data"); +static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31; static int show_diag_stat(struct seq_file *m, void *v) { @@ -59,7 +76,7 @@ static int show_diag_stat(struct seq_file *m, void *v) unsigned long n = (unsigned long) v - 1; int cpu, prec, tmp; - get_online_cpus(); + cpus_read_lock(); if (n == 0) { seq_puts(m, " "); @@ -78,13 +95,13 @@ static int show_diag_stat(struct seq_file *m, void *v) } seq_printf(m, " %s\n", diag_map[n-1].name); } - put_online_cpus(); + cpus_read_unlock(); return 0; } static void *show_diag_stat_start(struct seq_file *m, loff_t 
*pos) { - return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL; + return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL; } static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos) @@ -104,18 +121,7 @@ static const struct seq_operations show_diag_stat_sops = { .show = show_diag_stat, }; -static int show_diag_stat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &show_diag_stat_sops); -} - -static const struct file_operations show_diag_stat_fops = { - .open = show_diag_stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - +DEFINE_SEQ_ATTRIBUTE(show_diag_stat); static int __init show_diag_stat_init(void) { @@ -133,7 +139,7 @@ void diag_stat_inc(enum diag_stat_enum nr) } EXPORT_SYMBOL(diag_stat_inc); -void diag_stat_inc_norecursion(enum diag_stat_enum nr) +void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr) { this_cpu_inc(diag_stat.counter[nr]); trace_s390_diagnose_norecursion(diag_map[nr].code); @@ -146,26 +152,46 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion); int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) { diag_stat_inc(DIAG_STAT_X014); - return diag_dma_ops.diag14(rx, ry1, subcode); + return diag_amode31_ops.diag14(rx, ry1, subcode); } EXPORT_SYMBOL(diag14); static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) { - register unsigned long _subcode asm("0") = *subcode; - register unsigned long _size asm("1") = size; + union register_pair rp = { .even = *subcode, .odd = size }; asm volatile( - " diag %2,%0,0x204\n" + " diag %[addr],%[rp],0x204\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) - : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); - *subcode = _subcode; - return _size; + : [rp] "+&d" (rp.pair) : [addr] "d" (addr) : "memory"); + *subcode = rp.even; + return rp.odd; } +/** + * diag204() - Issue diagnose 204 call. + * @subcode: Subcode of diagnose 204 to be executed. + * @size: Size of area in pages which @area points to, if given. + * @addr: Vmalloc'ed memory area where the result is written to. + * + * Execute diagnose 204 with the given subcode and write the result to the + * memory area specified with @addr. For subcodes which do not write a + * result to memory both @size and @addr must be zero. If @addr is + * specified it must be page aligned and must have been allocated with + * vmalloc(). Conversion to real / physical addresses will be handled by + * this function if required. 
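Given the preconditions spelled out in this kernel-doc, a hedged usage sketch for a result-storing subcode follows; the helper name and error handling are chosen here for illustration, not taken from the patch:

#include <linux/errno.h>
#include <linux/vmalloc.h>
#include <asm/diag.h>

/*
 * Query a result-storing diagnose 204 subcode into a freshly allocated
 * buffer. vmalloc() memory is page aligned by construction, which
 * satisfies both WARN_ON_ONCE() checks in diag204(); @pages matches
 * the @size parameter ("size of area in pages"). Subcodes that store
 * no result must instead be called with size and addr both zero.
 */
static int diag204_query(unsigned long subcode, unsigned long pages)
{
	void *buf;
	int rc;

	buf = vmalloc(pages * PAGE_SIZE);
	if (!buf)
		return -ENOMEM;
	rc = diag204(subcode, pages, buf);
	vfree(buf);
	return rc;
}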
+ */ int diag204(unsigned long subcode, unsigned long size, void *addr) { + if (addr) { + if (WARN_ON_ONCE(!is_vmalloc_addr(addr))) + return -1; + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE))) + return -1; + } + if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) + addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr)); diag_stat_inc(DIAG_STAT_X204); size = __diag204(&subcode, size, addr); if (subcode) @@ -184,20 +210,42 @@ int diag210(struct diag210 *addr) int ccode; spin_lock_irqsave(&diag210_lock, flags); - *__diag210_tmp_dma = *addr; + *__diag210_tmp_amode31 = *addr; diag_stat_inc(DIAG_STAT_X210); - ccode = diag_dma_ops.diag210(__diag210_tmp_dma); + ccode = diag_amode31_ops.diag210(__diag210_tmp_amode31); - *addr = *__diag210_tmp_dma; + *addr = *__diag210_tmp_amode31; spin_unlock_irqrestore(&diag210_lock, flags); return ccode; } EXPORT_SYMBOL(diag210); +/* + * Diagnose 8C: Access 3270 Display Device Information + */ +int diag8c(struct diag8c *addr, struct ccw_dev_id *devno) +{ + static DEFINE_SPINLOCK(diag8c_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag8c_lock, flags); + + diag_stat_inc(DIAG_STAT_X08C); + ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr)); + + *addr = *__diag8c_tmp_amode31; + spin_unlock_irqrestore(&diag8c_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag8c); + int diag224(void *ptr) { + unsigned long addr = __pa(ptr); int rc = -EOPNOTSUPP; diag_stat_inc(DIAG_STAT_X224); @@ -206,7 +254,7 @@ int diag224(void *ptr) "0: lhi %0,0x0\n" "1:\n" EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + : "+d" (rc) :"d" (0), "d" (addr) : "memory"); return rc; } EXPORT_SYMBOL(diag224); @@ -217,6 +265,6 @@ EXPORT_SYMBOL(diag224); int diag26c(void *req, void *resp, enum diag26c_sc subcode) { diag_stat_inc(DIAG_STAT_X26C); - return diag_dma_ops.diag26c(req, resp, subcode); + return diag_amode31_ops.diag26c(req, resp, subcode); } EXPORT_SYMBOL(diag26c); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index f304802ecf7b..89dc826a8d2e 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -24,8 +24,8 @@ #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/atomic.h> +#include <linux/io.h> #include <asm/dis.h> -#include <asm/io.h> #include <asm/cpcmd.h> #include <asm/lowcore.h> #include <asm/debug.h> @@ -278,6 +278,7 @@ static const unsigned char formats[][6] = { [INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 }, [INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 }, [INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 }, + [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 }, [INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 }, [INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 }, [INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 }, @@ -312,10 +313,12 @@ static const unsigned char formats[][6] = { [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, [INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 }, + [INSTR_VRR_VV0U2] = { V_8, V_12, U4_24, 0, 0, 0 }, [INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 }, [INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 }, [INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 }, [INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 }, + [INSTR_VRR_VVV0U0] = { V_8, V_12, V_16, U4_24, 0, 0 }, [INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 }, [INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 }, [INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 }, @@ -482,32 +485,38 @@ static int print_insn(char 
*buffer, unsigned char *code, unsigned long addr) return (int) (ptr - buffer); } +static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len) +{ + if (user_mode(regs)) { + if (copy_from_user(dst, (char __user *)src, len)) + return -EFAULT; + } else { + if (copy_from_kernel_nofault(dst, src, len)) + return -EFAULT; + } + return 0; +} + void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; unsigned char code[64]; char buffer[128], *ptr; - mm_segment_t old_fs; unsigned long addr; int start, end, opsize, hops, i; /* Get a snapshot of the 64 bytes surrounding the fault address. */ - old_fs = get_fs(); - set_fs(user_mode(regs) ? USER_DS : KERNEL_DS); for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { addr = regs->psw.addr - 34 + start; - if (__copy_from_user(code + start - 2, - (char __user *) addr, 2)) + if (copy_from_regs(regs, code + start - 2, (void *)addr, 2)) break; } for (end = 32; end < 64; end += 2) { addr = regs->psw.addr + end - 32; - if (__copy_from_user(code + end, - (char __user *) addr, 2)) + if (copy_from_regs(regs, code + end, (void *)addr, 2)) break; } - set_fs(old_fs); - /* Code snapshot useable ? */ + /* Code snapshot usable ? */ if ((regs->psw.addr & 1) || start >= end) { printk("%s Code: Bad PSW.\n", mode); return; @@ -557,7 +566,7 @@ void show_code(struct pt_regs *regs) void print_fn_code(unsigned char *code, unsigned long len) { - char buffer[64], *ptr; + char buffer[128], *ptr; int opsize, i; while (len) { diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index d306fe04489a..d2012635b093 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -41,51 +41,50 @@ const char *stack_type_name(enum stack_type type) EXPORT_SYMBOL_GPL(stack_type_name); static inline bool in_stack(unsigned long sp, struct stack_info *info, - enum stack_type type, unsigned long low, - unsigned long high) + enum stack_type type, unsigned long stack) { - if (sp < low || sp >= high) + if (sp < stack || sp >= stack + THREAD_SIZE) return false; info->type = type; - info->begin = low; - info->end = high; + info->begin = stack; + info->end = stack + THREAD_SIZE; return true; } static bool in_task_stack(unsigned long sp, struct task_struct *task, struct stack_info *info) { - unsigned long stack; + unsigned long stack = (unsigned long)task_stack_page(task); - stack = (unsigned long) task_stack_page(task); - return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE); + return in_stack(sp, info, STACK_TYPE_TASK, stack); } static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.async_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_IRQ, stack); } static bool in_nodat_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.nodat_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_NODAT, stack); +} + +static bool in_mcck_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET; + + return 
in_stack(sp, info, STACK_TYPE_MCCK, stack); } static bool in_restart_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.restart_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_RESTART, stack); } int get_stack_info(unsigned long sp, struct task_struct *task, @@ -108,7 +107,8 @@ int get_stack_info(unsigned long sp, struct task_struct *task, /* Check per-cpu stacks */ if (!in_irq_stack(sp, info) && !in_nodat_stack(sp, info) && - !in_restart_stack(sp, info)) + !in_restart_stack(sp, info) && + !in_mcck_stack(sp, info)) goto unknown; recursion_check: @@ -126,22 +126,29 @@ unknown: return -EINVAL; } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { struct unwind_state state; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) - printk(state.reliable ? " [<%016lx>] %pSR \n" : - "([<%016lx>] %pSR)\n", - state.ip, (void *) state.ip); + printk(state.reliable ? "%s [<%016lx>] %pSR \n" : + "%s([<%016lx>] %pSR)\n", + loglvl, state.ip, (void *) state.ip); debug_show_held_locks(task ? : current); } static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); - printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]); + printk(" [<%016lx>] ", regs->last_break); + if (user_mode(regs)) { + print_vma_addr(KERN_CONT, regs->last_break); + pr_cont("\n"); + } else { + pr_cont("%pSR\n", (void *)regs->last_break); + } } void show_registers(struct pt_regs *regs) @@ -175,13 +182,13 @@ void show_regs(struct pt_regs *regs) show_registers(regs); /* Show stack backtrace if pt_regs is from kernel mode */ if (!user_mode(regs)) - show_stack(NULL, (unsigned long *) regs->gprs[15]); + show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT); show_last_breaking_event(regs); } static DEFINE_SPINLOCK(die_lock); -void die(struct pt_regs *regs, const char *str) +void __noreturn die(struct pt_regs *regs, const char *str) { static int die_counter; @@ -195,6 +202,8 @@ void die(struct pt_regs *regs, const char *str) regs->int_code >> 17, ++die_counter); #ifdef CONFIG_PREEMPT pr_cont("PREEMPT "); +#elif defined(CONFIG_PREEMPT_RT) + pr_cont("PREEMPT_RT "); #endif pr_cont("SMP "); if (debug_pagealloc_enabled()) @@ -211,5 +220,5 @@ void die(struct pt_regs *regs, const char *str) if (panic_on_oops) panic("Fatal exception: panic_on_oops"); oops_exit(); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index db32a55daaec..2345ea332b97 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -2,7 +2,6 @@ /* * Copyright IBM Corp. 
2007, 2009 * Author(s): Hongjie Yang <hongjie@us.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com> */ #define KMSG_COMPONENT "setup" @@ -18,6 +17,8 @@ #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/kernel.h> +#include <asm/asm-extable.h> +#include <linux/memblock.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/ipl.h> @@ -33,18 +34,43 @@ #include <asm/switch_to.h> #include "entry.h" +#define decompressor_handled_param(param) \ +static int __init ignore_decompressor_param_##param(char *s) \ +{ \ + return 0; \ +} \ +early_param(#param, ignore_decompressor_param_##param) + +decompressor_handled_param(mem); +decompressor_handled_param(vmalloc); +decompressor_handled_param(dfltcc); +decompressor_handled_param(facilities); +decompressor_handled_param(nokaslr); +decompressor_handled_param(cmma); +#if IS_ENABLED(CONFIG_KVM) +decompressor_handled_param(prot_virt); +#endif + +static void __init kasan_early_init(void) +{ +#ifdef CONFIG_KASAN + init_task.kasan_depth = 0; + sclp_early_printk("KernelAddressSanitizer initialized\n"); +#endif +} + static void __init reset_tod_clock(void) { - u64 time; + union tod_clock clk; - if (store_tod_clock(&time) == 0) + if (store_tod_clock_ext_cc(&clk) == 0) return; /* TOD clock not running. Set the clock to Unix Epoch. */ - if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) + if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) disabled_wait(); - memset(tod_clock_base, 0, 16); - *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; + memset(&tod_clock_base, 0, sizeof(tod_clock_base)); + tod_clock_base.tod = TOD_UNIX_EPOCH; S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; } @@ -147,44 +173,27 @@ static __init void setup_topology(void) topology_max_mnest = max_mnest; } -static void early_pgm_check_handler(void) +void __do_early_pgm_check(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - unsigned long cr0, cr0_new; - unsigned long addr; - - addr = S390_lowcore.program_old_psw.addr; - fixup = s390_search_extables(addr); - if (!fixup) + if (!fixup_exception(regs)) disabled_wait(); - /* Disable low address protection before storing into lowcore. 
*/ - __ctl_store(cr0, 0, 0); - cr0_new = cr0 & ~(1UL << 28); - __ctl_load(cr0_new, 0, 0); - S390_lowcore.program_old_psw.addr = extable_fixup(fixup); - __ctl_load(cr0, 0, 0); } static noinline __init void setup_lowcore_early(void) { psw_t psw; - psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; - psw.addr = (unsigned long) s390_base_ext_handler; - S390_lowcore.external_new_psw = psw; - psw.addr = (unsigned long) s390_base_pgm_handler; + psw.addr = (unsigned long)early_pgm_check_handler; + psw.mask = PSW_KERNEL_BITS; S390_lowcore.program_new_psw = psw; - s390_base_pgm_handler_fn = early_pgm_check_handler; S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } static noinline __init void setup_facility_list(void) { - memcpy(S390_lowcore.alt_stfle_fac_list, - S390_lowcore.stfle_fac_list, - sizeof(S390_lowcore.alt_stfle_fac_list)); + memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } static __init void detect_diag9c(void) @@ -204,26 +213,11 @@ static __init void detect_diag9c(void) S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; } -static __init void detect_diag44(void) -{ - int rc; - - diag_stat_inc(DIAG_STAT_X044); - asm volatile( - " diag 0,0,0x44\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc"); - if (!rc) - S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG44; -} - static __init void detect_machine_facilities(void) { if (test_facility(8)) { S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; - __ctl_set_bit(0, 23); + system_ctl_set_bit(0, CR0_EDAT_BIT); } if (test_facility(78)) S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; @@ -231,26 +225,28 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; if (test_facility(50) && test_facility(73)) { S390_lowcore.machine_flags |= MACHINE_FLAG_TE; - __ctl_set_bit(0, 55); + system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT); } if (test_facility(51)) S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; - if (test_facility(129)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_VX; - __ctl_set_bit(0, 17); - } - if (test_facility(130) && !noexec_disabled) { + if (test_facility(129)) + system_ctl_set_bit(0, CR0_VECTOR_BIT); + if (test_facility(130)) S390_lowcore.machine_flags |= MACHINE_FLAG_NX; - __ctl_set_bit(0, 20); - } if (test_facility(133)) S390_lowcore.machine_flags |= MACHINE_FLAG_GS; - if (test_facility(139) && (tod_clock_base[1] & 0x80)) { + if (test_facility(139) && (tod_clock_base.tod >> 63)) { /* Enabled signed clock comparator comparisons */ S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; clock_comparator_max = -1ULL >> 1; - __ctl_set_bit(0, 53); + system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT); } + if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; + /* the control bit is set during PCI initialization */ + } + if (test_facility(194)) + S390_lowcore.machine_flags |= MACHINE_FLAG_RDP; } static inline void save_vector_registers(void) @@ -261,15 +257,9 @@ static inline void save_vector_registers(void) #endif } -static inline void setup_control_registers(void) +static inline void setup_low_address_protection(void) { - unsigned long reg; - - __ctl_store(reg, 0, 0); - reg |= CR0_LOW_ADDRESS_PROTECTION; - reg |= CR0_EMERGENCY_SIGNAL_SUBMASK; - reg |= CR0_EXTERNAL_CALL_SUBMASK; - __ctl_load(reg, 0, 0); + system_ctl_set_bit(0, 
CR0_LOW_ADDRESS_PROTECTION_BIT); } static inline void setup_access_registers(void) @@ -279,64 +269,37 @@ static inline void setup_access_registers(void) restore_access_regs(acrs); } -static int __init disable_vector_extension(char *str) -{ - S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; - __ctl_clear_bit(0, 17); - return 0; -} -early_param("novx", disable_vector_extension); - -static int __init cad_setup(char *str) -{ - bool enabled; - int rc; - - rc = kstrtobool(str, &enabled); - if (!rc && enabled && test_facility(128)) - /* Enable problem state CAD. */ - __ctl_set_bit(2, 3); - return rc; -} -early_param("cad", cad_setup); - char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE); + strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } -static void __init check_image_bootable(void) +static void __init sort_amode31_extable(void) { - if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING))) - return; - - sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); - sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n"); - sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); - disabled_wait(); + sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table); } void __init startup_init(void) { + kasan_early_init(); reset_tod_clock(); - check_image_bootable(); time_early_init(); init_kernel_storage_key(); lockdep_off(); + sort_amode31_extable(); setup_lowcore_early(); setup_facility_list(); detect_machine_type(); setup_arch_string(); setup_boot_command_line(); detect_diag9c(); - detect_diag44(); detect_machine_facilities(); save_vector_registers(); setup_topology(); sclp_early_detect(); - setup_control_registers(); + setup_low_address_protection(); setup_access_registers(); lockdep_on(); } diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c index 6f24d83bc5dc..d9d53f44008a 100644 --- a/arch/s390/kernel/early_printk.c +++ b/arch/s390/kernel/early_printk.c @@ -10,7 +10,7 @@ static void sclp_early_write(struct console *con, const char *s, unsigned int len) { - __sclp_early_printk(s, len, 0); + __sclp_early_printk(s, len); } static struct console sclp_early_console = { diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S new file mode 100644 index 000000000000..c634871f0d90 --- /dev/null +++ b/arch/s390/kernel/earlypgm.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
2006, 2007 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/linkage.h> +#include <asm/asm-offsets.h> + +SYM_CODE_START(early_pgm_check_handler) + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + lpswe __LC_RETURN_PSW +SYM_CODE_END(early_pgm_check_handler) diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c index 7f8246c9be08..0e51fa537262 100644 --- a/arch/s390/kernel/ebcdic.c +++ b/arch/s390/kernel/ebcdic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * ECBDIC -> ASCII, ASCII -> ECBDIC, + * EBCDIC -> ASCII, ASCII -> EBCDIC, * upper to lower case (EBCDIC) conversion tables. * * S390 version diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 270d1d145761..49a11f6dd7ae 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -6,15 +6,15 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), * Hartmut Penner (hp@de.ibm.com), * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), - * Heiko Carstens <heiko.carstens@de.ibm.com> */ +#include <linux/export.h> #include <linux/init.h> #include <linux/linkage.h> +#include <asm/asm-extable.h> #include <asm/alternative-asm.h> #include <asm/processor.h> #include <asm/cache.h> -#include <asm/ctl_reg.h> #include <asm/dwarf.h> #include <asm/errno.h> #include <asm/ptrace.h> @@ -27,67 +27,29 @@ #include <asm/vx-insn.h> #include <asm/setup.h> #include <asm/nmi.h> -#include <asm/export.h> #include <asm/nospec-insn.h> -__PT_R0 = __PT_GPRS -__PT_R1 = __PT_GPRS + 8 -__PT_R2 = __PT_GPRS + 16 -__PT_R3 = __PT_GPRS + 24 -__PT_R4 = __PT_GPRS + 32 -__PT_R5 = __PT_GPRS + 40 -__PT_R6 = __PT_GPRS + 48 -__PT_R7 = __PT_GPRS + 56 -__PT_R8 = __PT_GPRS + 64 -__PT_R9 = __PT_GPRS + 72 -__PT_R10 = __PT_GPRS + 80 -__PT_R11 = __PT_GPRS + 88 -__PT_R12 = __PT_GPRS + 96 -__PT_R13 = __PT_GPRS + 104 -__PT_R14 = __PT_GPRS + 112 -__PT_R15 = __PT_GPRS + 120 - -STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER -STACK_SIZE = 1 << STACK_SHIFT -STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE - -_TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ - _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING) -_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ - _TIF_SYSCALL_TRACEPOINT) -_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ - _CIF_ASCE_SECONDARY | _CIF_FPU) -_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) - _LPP_OFFSET = __LC_LPP -#define BASED(name) name-cleanup_critical(%r13) + .macro STBEAR address + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 + .endm - .macro TRACE_IRQS_ON -#ifdef CONFIG_TRACE_IRQFLAGS - basr %r2,%r0 - brasl %r14,trace_hardirqs_on_caller -#endif + .macro LBEAR address + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 .endm - .macro TRACE_IRQS_OFF -#ifdef CONFIG_TRACE_IRQFLAGS - basr %r2,%r0 - brasl %r14,trace_hardirqs_off_caller -#endif + .macro LPSWEY address,lpswe + ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 .endm - .macro LOCKDEP_SYS_EXIT -#ifdef CONFIG_LOCKDEP - tm __PT_PSW+1(%r11),0x01 # returning to user ? 
- jz .+10 - brasl %r14,lockdep_sys_exit -#endif + .macro MBEAR reg + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 .endm .macro CHECK_STACK savearea #ifdef CONFIG_CHECK_STACK - tml %r15,STACK_SIZE - CONFIG_STACK_GUARD + tml %r15,THREAD_SIZE - CONFIG_STACK_GUARD lghi %r14,\savearea jz stack_overflow #endif @@ -96,12 +58,14 @@ _LPP_OFFSET = __LC_LPP .macro CHECK_VMAP_STACK savearea,oklabel #ifdef CONFIG_VMAP_STACK lgr %r14,%r15 - nill %r14,0x10000 - STACK_SIZE - oill %r14,STACK_INIT + nill %r14,0x10000 - THREAD_SIZE + oill %r14,STACK_INIT_OFFSET clg %r14,__LC_KERNEL_STACK je \oklabel clg %r14,__LC_ASYNC_STACK je \oklabel + clg %r14,__LC_MCCK_STACK + je \oklabel clg %r14,__LC_NODAT_STACK je \oklabel clg %r14,__LC_RESTART_STACK @@ -113,56 +77,6 @@ _LPP_OFFSET = __LC_LPP #endif .endm - .macro SWITCH_ASYNC savearea,timer - tmhh %r8,0x0001 # interrupting from user ? - jnz 1f - lgr %r14,%r9 - slg %r14,BASED(.Lcritical_start) - clg %r14,BASED(.Lcritical_length) - jhe 0f - lghi %r11,\savearea # inside critical section, do cleanup - brasl %r14,cleanup_critical - tmhh %r8,0x0001 # retest problem state after cleanup - jnz 1f -0: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? - slgr %r14,%r15 - srag %r14,%r14,STACK_SHIFT - jnz 2f - CHECK_STACK \savearea - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 3f -1: UPDATE_VTIME %r14,%r15,\timer - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP -2: lg %r15,__LC_ASYNC_STACK # load async stack -3: la %r11,STACK_FRAME_OVERHEAD(%r15) - .endm - - .macro UPDATE_VTIME w1,w2,enter_timer - lg \w1,__LC_EXIT_TIMER - lg \w2,__LC_LAST_UPDATE_TIMER - slg \w1,\enter_timer - slg \w2,__LC_EXIT_TIMER - alg \w1,__LC_USER_TIMER - alg \w2,__LC_SYSTEM_TIMER - stg \w1,__LC_USER_TIMER - stg \w2,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer - .endm - - .macro REENABLE_IRQS - stg %r8,__LC_RETURN_PSW - ni __LC_RETURN_PSW,0xbf - ssm __LC_RETURN_PSW - .endm - - .macro STCK savearea -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - .insn s,0xb27c0000,\savearea # store clock fast -#else - .insn s,0xb2050000,\savearea # store clock -#endif - .endm - /* * The TSTMSK macro generates a test-under-mask instruction by * calculating the memory offset for the specified mask value. @@ -186,46 +100,76 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "", ".long 0xb2e8c000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 .endm .macro BPON - ALTERNATIVE "", ".long 0xb2e8d000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 .endm .macro BPENTER tif_ptr,tif_mask - ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \ - "", 82 + ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ + "j .+12; nop; nop", 82 .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask - ALTERNATIVE "jz .+8; .long 0xb2e8c000", \ - "jnz .+8; .long 0xb2e8d000", 82 + ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82 + .endm + +#if IS_ENABLED(CONFIG_KVM) + /* + * The OUTSIDE macro jumps to the provided label in case the value + * in the provided register is outside of the provided range. The + * macro is useful for checking whether a PSW stored in a register + * pair points inside or outside of a block of instructions. 
+ * @reg: register to check + * @start: start of the range + * @end: end of the range + * @outside_label: jump here if @reg is outside of [@start..@end) + */ + .macro OUTSIDE reg,start,end,outside_label + lgr %r14,\reg + larl %r13,\start + slgr %r14,%r13 + clgfrl %r14,.Lrange_size\@ + jhe \outside_label + .section .rodata, "a" + .balign 4 +.Lrange_size\@: + .long \end - \start + .previous + .endm + + .macro SIEEXIT + lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + .endm +#endif + + .macro STACKLEAK_ERASE +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + brasl %r14,stackleak_erase_on_task_stack +#endif .endm - GEN_BR_THUNK %r9 GEN_BR_THUNK %r14 - GEN_BR_THUNK %r14,%r11 .section .kprobes.text, "ax" .Ldummy: /* - * This nop exists only in order to avoid that __switch_to starts at - * the beginning of the kprobes text section. In that case we would - * have several symbols at the same address. E.g. objdump would take - * an arbitrary symbol name when disassembling this code. - * With the added nop in between the __switch_to symbol is unique - * again. + * The following nop exists only in order to avoid that the next + * symbol starts at the beginning of the kprobes text section. + * In that case there would be several symbols at the same address. + * E.g. objdump would take an arbitrary symbol when disassembling + * the code. + * With the added nop in between this cannot happen. */ nop 0 -ENTRY(__bpon) - .globl __bpon - BPON - BR_EX %r14 -ENDPROC(__bpon) - /* * Scheduler resume function, called by switch_to * gpr2 = (task_struct *) prev @@ -233,11 +177,11 @@ ENDPROC(__bpon) * Returns: * gpr2 = prev */ -ENTRY(__switch_to) +SYM_FUNC_START(__switch_to) stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task lghi %r4,__TASK_stack lghi %r1,__TASK_thread - llill %r5,STACK_INIT + llill %r5,STACK_INIT_OFFSET stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev lg %r15,0(%r4,%r3) # start of kernel stack of next agr %r15,%r5 # end of kernel stack of next @@ -247,30 +191,26 @@ ENTRY(__switch_to) aghi %r3,__TASK_pid mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 BR_EX %r14 -ENDPROC(__switch_to) - -.L__critical_start: +SYM_FUNC_END(__switch_to) #if IS_ENABLED(CONFIG_KVM) /* - * sie64a calling convention: - * %r2 pointer to sie control block - * %r3 guest register save area + * __sie64a calling convention: + * %r2 pointer to sie control block phys + * %r3 pointer to sie control block virt + * %r4 guest register save area */ -ENTRY(sie64a) +SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers lg %r12,__LC_CURRENT - stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer - stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. + stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses + stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags - TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ? 
- jno .Lsie_load_guest_gprs - brasl %r14,load_fpu_regs # load guest fp/vx regs -.Lsie_load_guest_gprs: - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 + lmg %r0,%r13,0(%r4) # load guest gprs 0-13 lg %r14,__LC_GMAP # get gmap pointer ltgr %r14,%r14 jz .Lsie_gmap @@ -282,35 +222,37 @@ ENTRY(sie64a) jnz .Lsie_skip TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lsie_skip # exit if fp/vx regs changed - BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr + BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_entry: sie 0(%r14) -.Lsie_exit: +# Let the next instruction be NOP to avoid triggering a machine check +# and handling it in a guest as result of the instruction execution. + nopr 7 +.Lsie_leave: BPOFF - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_skip: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce .Lsie_done: # some program checks are suppressing. C code (e.g. do_protection_exception) # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. -# Other instructions between sie64a and .Lsie_done should not cause program +# Other instructions between __sie64a and .Lsie_done should not cause program # interrupts. So lets use 3 nops as a landing pad for all possible rewinds. -# See also .Lcleanup_sie .Lrewind_pad6: nopr 7 .Lrewind_pad4: nopr 7 .Lrewind_pad2: nopr 7 - .globl sie_exit -sie_exit: +SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 xgr %r0,%r0 # clear guest registers to xgr %r1,%r1 # prevent speculative use - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 @@ -326,838 +268,244 @@ sie_exit: EX_TABLE(.Lrewind_pad4,.Lsie_fault) EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) -ENDPROC(sie64a) -EXPORT_SYMBOL(sie64a) +SYM_FUNC_END(__sie64a) +EXPORT_SYMBOL(__sie64a) EXPORT_SYMBOL(sie_exit) #endif /* * SVC interrupt handler routine. System calls are synchronous events and - * are executed with interrupts enabled. + * are entered with interrupts disabled. 
*/ -ENTRY(system_call) - stpt __LC_SYNC_ENTER_TIMER -.Lsysc_stmg: +SYM_CODE_START(system_call) + stpt __LC_SYS_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_SYNC BPOFF - lg %r12,__LC_CURRENT - lghi %r13,__TASK_thread - lghi %r14,_PIF_SYSCALL + lghi %r14,0 .Lsysc_per: + STBEAR __LC_LAST_BREAK + lctlg %c1,%c1,__LC_KERNEL_ASCE lg %r15,__LC_KERNEL_STACK - la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs -.Lsysc_vtime: - UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW - mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC - stg %r14,__PT_FLAGS(%r11) -.Lsysc_do_svc: + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # clear user controlled register to prevent speculative use xgr %r0,%r0 - # load address of system call table - lg %r10,__THREAD_sysc_table(%r13,%r12) - llgh %r8,__PT_INT_CODE+2(%r11) - slag %r8,%r8,3 # shift and test for svc 0 - jnz .Lsysc_nr_ok - # svc 0: system call number in %r1 - llgfr %r1,%r1 # clear high word in r1 - cghi %r1,NR_syscalls - jnl .Lsysc_nr_ok - sth %r1,__PT_INT_CODE+2(%r11) - slag %r8,%r1,3 -.Lsysc_nr_ok: - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - stg %r2,__PT_ORIG_GPR2(%r11) - stg %r7,STACK_FRAME_OVERHEAD(%r15) - lg %r9,0(%r8,%r10) # get system call add. - TSTMSK __TI_flags(%r12),_TIF_TRACE - jnz .Lsysc_tracesys - BASR_EX %r14,%r9 # call sys_xxxx - stg %r2,__PT_R2(%r11) # store return value - -.Lsysc_return: -#ifdef CONFIG_DEBUG_RSEQ - lgr %r2,%r11 - brasl %r14,rseq_syscall -#endif - LOCKDEP_SYS_EXIT -.Lsysc_tif: - TSTMSK __PT_FLAGS(%r11),_PIF_WORK - jnz .Lsysc_work - TSTMSK __TI_flags(%r12),_TIF_WORK - jnz .Lsysc_work # check for work - TSTMSK __LC_CPU_FLAGS,_CIF_WORK - jnz .Lsysc_work - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP -.Lsysc_restore: - lg %r14,__LC_VDSO_PER_CPU - lmg %r0,%r10,__PT_R0(%r11) - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) -.Lsysc_exit_timer: + xgr %r1,%r1 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r8,%r8 + xgr %r9,%r9 + xgr %r10,%r10 + xgr %r11,%r11 + la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs + mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC + MBEAR %r2 + lgr %r3,%r14 + brasl %r14,__do_syscall + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER - lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW -.Lsysc_done: - -# -# One of the work bits is on. Find out which one. 
-# -.Lsysc_work: - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jo .Lsysc_mcck_pending - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jo .Lsysc_reschedule - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART - jo .Lsysc_syscall_restart -#ifdef CONFIG_UPROBES - TSTMSK __TI_flags(%r12),_TIF_UPROBE - jo .Lsysc_uprobe_notify -#endif - TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE - jo .Lsysc_guarded_storage - TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP - jo .Lsysc_singlestep -#ifdef CONFIG_LIVEPATCH - TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING - jo .Lsysc_patch_pending # handle live patching just before - # signals and possible syscall restart -#endif - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART - jo .Lsysc_syscall_restart - TSTMSK __TI_flags(%r12),_TIF_SIGPENDING - jo .Lsysc_sigpending - TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME - jo .Lsysc_notify_resume - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsysc_vxrs - TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) - jnz .Lsysc_asce - j .Lsysc_return # beware of critical section cleanup - -# -# _TIF_NEED_RESCHED is set, call schedule -# -.Lsysc_reschedule: - larl %r14,.Lsysc_return - jg schedule - -# -# _CIF_MCCK_PENDING is set, call handler -# -.Lsysc_mcck_pending: - larl %r14,.Lsysc_return - jg s390_handle_mcck # TIF bit will be cleared by handler - -# -# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce -# -.Lsysc_asce: - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY - lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY - jz .Lsysc_return -#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES - tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? - jnz .Lsysc_set_fs_fixup - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - j .Lsysc_return -.Lsysc_set_fs_fixup: -#endif - larl %r14,.Lsysc_return - jg set_fs_fixup - -# -# CIF_FPU is set, restore floating-point controls and floating-point registers. 
-# -.Lsysc_vxrs: - larl %r14,.Lsysc_return - jg load_fpu_regs - -# -# _TIF_SIGPENDING is set, call do_signal -# -.Lsysc_sigpending: - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_signal - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL - jno .Lsysc_return -.Lsysc_do_syscall: - lghi %r13,__TASK_thread - lmg %r2,%r7,__PT_R2(%r11) # load svc arguments - lghi %r1,0 # svc 0 returns -ENOSYS - j .Lsysc_do_svc - -# -# _TIF_NOTIFY_RESUME is set, call do_notify_resume -# -.Lsysc_notify_resume: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_notify_resume - -# -# _TIF_UPROBE is set, call uprobe_notify_resume -# -#ifdef CONFIG_UPROBES -.Lsysc_uprobe_notify: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg uprobe_notify_resume -#endif - -# -# _TIF_GUARDED_STORAGE is set, call guarded_storage_load -# -.Lsysc_guarded_storage: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg gs_load_bc_cb -# -# _TIF_PATCH_PENDING is set, call klp_update_patch_state -# -#ifdef CONFIG_LIVEPATCH -.Lsysc_patch_pending: - lg %r2,__LC_CURRENT # pass pointer to task struct - larl %r14,.Lsysc_return - jg klp_update_patch_state -#endif - -# -# _PIF_PER_TRAP is set, call do_per_trap -# -.Lsysc_singlestep: - ni __PT_FLAGS+7(%r11),255-_PIF_PER_TRAP - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_per_trap - -# -# _PIF_SYSCALL_RESTART is set, repeat the current system call -# -.Lsysc_syscall_restart: - ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART - lmg %r1,%r7,__PT_R1(%r11) # load svc arguments - lg %r2,__PT_ORIG_GPR2(%r11) - j .Lsysc_do_svc - -# -# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before -# and after the system call -# -.Lsysc_tracesys: - lgr %r2,%r11 # pass pointer to pt_regs - la %r3,0 - llgh %r0,__PT_INT_CODE+2(%r11) - stg %r0,__PT_R2(%r11) - brasl %r14,do_syscall_trace_enter - lghi %r0,NR_syscalls - clgr %r0,%r2 - jnh .Lsysc_tracenogo - sllg %r8,%r2,3 - lg %r9,0(%r8,%r10) -.Lsysc_tracego: - lmg %r3,%r7,__PT_R3(%r11) - stg %r7,STACK_FRAME_OVERHEAD(%r15) - lg %r2,__PT_ORIG_GPR2(%r11) - BASR_EX %r14,%r9 # call sys_xxx - stg %r2,__PT_R2(%r11) # store return value -.Lsysc_tracenogo: - TSTMSK __TI_flags(%r12),_TIF_TRACE - jz .Lsysc_return - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_syscall_trace_exit -ENDPROC(system_call) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(system_call) # # a new process exits the kernel with ret_from_fork # -ENTRY(ret_from_fork) - la %r11,STACK_FRAME_OVERHEAD(%r15) - lg %r12,__LC_CURRENT - brasl %r14,schedule_tail - TRACE_IRQS_ON - ssm __LC_SVC_NEW_PSW # reenable interrupts - tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ? 
- jne .Lsysc_tracenogo - # it's a kernel thread - lmg %r9,%r10,__PT_R9(%r11) # load gprs - la %r2,0(%r10) - BASR_EX %r14,%r9 - j .Lsysc_tracenogo -ENDPROC(ret_from_fork) - -ENTRY(kernel_thread_starter) - la %r2,0(%r10) - BASR_EX %r14,%r9 - j .Lsysc_tracenogo -ENDPROC(kernel_thread_starter) +SYM_CODE_START(ret_from_fork) + lgr %r3,%r11 + brasl %r14,__ret_from_fork + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + stpt __LC_EXIT_TIMER + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(ret_from_fork) /* * Program check handler routine */ -ENTRY(pgm_check_handler) - stpt __LC_SYNC_ENTER_TIMER +SYM_CODE_START(pgm_check_handler) + stpt __LC_SYS_ENTER_TIMER BPOFF stmg %r8,%r15,__LC_SAVE_AREA_SYNC - lg %r10,__LC_LAST_BREAK - lg %r12,__LC_CURRENT - lghi %r11,0 - larl %r13,cleanup_critical + lghi %r10,0 lmg %r8,%r9,__LC_PGM_OLD_PSW - tmhh %r8,0x0001 # test problem state bit - jnz 2f # -> fault in user space + tmhh %r8,0x0001 # coming from user space? + jno .Lpgm_skip_asce + lctlg %c1,%c1,__LC_KERNEL_ASCE + j 3f # -> fault in user space +.Lpgm_skip_asce: #if IS_ENABLED(CONFIG_KVM) - # cleanup critical section for program checks in sie64a - lgr %r14,%r9 - slg %r14,BASED(.Lsie_critical_start) - clg %r14,BASED(.Lsie_critical_length) - jhe 0f - lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - larl %r9,sie_exit # skip forward to sie_exit - lghi %r11,_PIF_GUEST_FAULT + # cleanup critical section for program checks in __sie64a + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT + lghi %r10,_PIF_GUEST_FAULT #endif -0: tmhh %r8,0x4000 # PER bit set in old PSW ? - jnz 1f # -> enabled, can't be a double fault +1: tmhh %r8,0x4000 # PER bit set in old PSW ? + jnz 2f # -> enabled, can't be a double fault tm __LC_PGM_ILC+3,0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -1: CHECK_STACK __LC_SAVE_AREA_SYNC +2: CHECK_STACK __LC_SAVE_AREA_SYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # CHECK_VMAP_STACK branches to stack_overflow or 4f CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f -2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - lg %r15,__LC_KERNEL_STACK - lgr %r14,%r12 - aghi %r14,__TASK_thread # pointer to thread_struct - lghi %r13,__LC_PGM_TDB - tm __LC_PGM_ILC+2,0x02 # check for transaction abort - jz 3f - mvc __THREAD_trap_tdb(256,%r14),0(%r13) -3: stg %r10,__THREAD_last_break(%r14) -4: lgr %r13,%r11 - la %r11,STACK_FRAME_OVERHEAD(%r15) +3: lg %r15,__LC_KERNEL_STACK +4: la %r11,STACK_FRAME_OVERHEAD(%r15) + stg %r10,__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK + stmg %r8,%r9,__PT_PSW(%r11) + # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - stmg %r8,%r9,__PT_PSW(%r11) - mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC - mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE - stg %r13,__PT_FLAGS(%r11) - stg %r10,__PT_ARGS(%r11) - tm __LC_PGM_ILC+3,0x80 # check for per exception - jz 5f - tmhh %r8,0x0001 # kernel per event ? 
- jz .Lpgm_kprobe - oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP - mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS - mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE - mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -5: REENABLE_IRQS - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - larl %r1,pgm_check_table - llgh %r10,__PT_INT_CODE+2(%r11) - nill %r10,0x007f - sll %r10,3 - je .Lpgm_return - lg %r9,0(%r10,%r1) # load address of handler routine - lgr %r2,%r11 # pass pointer to pt_regs - BASR_EX %r14,%r9 # branch to interrupt-handler -.Lpgm_return: - LOCKDEP_SYS_EXIT - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lsysc_restore - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL - jo .Lsysc_do_syscall - j .Lsysc_tif - -# -# PER event in supervisor state, must be kprobes -# -.Lpgm_kprobe: - REENABLE_IRQS - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_per_trap - j .Lpgm_return + lgr %r2,%r11 + brasl %r14,__do_pgm_check + tmhh %r8,0x0001 # returning to user space? + jno .Lpgm_exit_kernel + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + BPON + stpt __LC_EXIT_TIMER +.Lpgm_exit_kernel: + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # # single stepped system call # .Lpgm_svcper: mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW - lghi %r13,__TASK_thread larl %r14,.Lsysc_per stg %r14,__LC_RETURN_PSW+8 - lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP - lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs -ENDPROC(pgm_check_handler) + lghi %r14,1 + LBEAR __LC_PGM_LAST_BREAK + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per +SYM_CODE_END(pgm_check_handler) /* - * IO interrupt handler routine + * Interrupt handler macro used for external and IO interrupts. */ -ENTRY(io_int_handler) - STCK __LC_INT_CLOCK - stpt __LC_ASYNC_ENTER_TIMER +.macro INT_HANDLER name,lc_old_psw,handler +SYM_CODE_START(\name) + stckf __LC_INT_CLOCK + stpt __LC_SYS_ENTER_TIMER + STBEAR __LC_LAST_BREAK BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lg %r12,__LC_CURRENT - larl %r13,cleanup_critical - lmg %r8,%r9,__LC_IO_OLD_PSW - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER + lmg %r8,%r9,\lc_old_psw + tmhh %r8,0x0001 # interrupting from user ? + jnz 1f +#if IS_ENABLED(CONFIG_KVM) + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT +#endif +0: CHECK_STACK __LC_SAVE_AREA_ASYNC + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + j 2f +1: lctlg %c1,%c1,__LC_KERNEL_ASCE + lg %r15,__LC_KERNEL_STACK +2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 + xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + MBEAR %r11 stmg %r8,%r9,__PT_PSW(%r11) - mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ - jo .Lio_restore - TRACE_IRQS_OFF - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) -.Lio_loop: lgr %r2,%r11 # pass pointer to pt_regs - lghi %r3,IO_INTERRUPT - tm __PT_INT_CODE+8(%r11),0x80 # adapter interrupt ? 
- jz .Lio_call - lghi %r3,THIN_INTERRUPT -.Lio_call: - brasl %r14,do_IRQ - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR - jz .Lio_return - tpi 0 - jz .Lio_return - mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID - j .Lio_loop -.Lio_return: - LOCKDEP_SYS_EXIT - TRACE_IRQS_ON -.Lio_tif: - TSTMSK __TI_flags(%r12),_TIF_WORK - jnz .Lio_work # there is work to do (signals etc.) - TSTMSK __LC_CPU_FLAGS,_CIF_WORK - jnz .Lio_work -.Lio_restore: - lg %r14,__LC_VDSO_PER_CPU - lmg %r0,%r10,__PT_R0(%r11) + brasl %r14,\handler mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lio_exit_kernel - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP -.Lio_exit_timer: + tmhh %r8,0x0001 # returning to user ? + jno 2f + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + BPON stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER -.Lio_exit_kernel: - lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW -.Lio_done: - -# -# There is work todo, find out in which context we have been interrupted: -# 1) if we return to user space we can do all _TIF_WORK work -# 2) if we return to kernel code and kvm is enabled check if we need to -# modify the psw to leave SIE -# 3) if we return to kernel code and preemptive scheduling is enabled check -# the preemption counter and if it is zero call preempt_schedule_irq -# Before any work can be done, a switch to the kernel stack is required. -# -.Lio_work: - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jo .Lio_work_user # yes -> do resched & signal -#ifdef CONFIG_PREEMPT - # check for preemptive scheduling - icm %r0,15,__LC_PREEMPT_COUNT - jnz .Lio_restore # preemption is disabled - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jno .Lio_restore - # switch to kernel stack - lg %r1,__PT_R15(%r11) - aghi %r1,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - # TRACE_IRQS_ON already done at .Lio_return, call - # TRACE_IRQS_OFF to keep things symmetrical - TRACE_IRQS_OFF - brasl %r14,preempt_schedule_irq - j .Lio_return -#else - j .Lio_restore -#endif - -# -# Need to do work before returning to userspace, switch to kernel stack -# -.Lio_work_user: - lg %r1,__LC_KERNEL_STACK - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - -# -# One of the work bits is on. Find out which one. 
-# -.Lio_work_tif: - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jo .Lio_mcck_pending - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jo .Lio_reschedule -#ifdef CONFIG_LIVEPATCH - TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING - jo .Lio_patch_pending -#endif - TSTMSK __TI_flags(%r12),_TIF_SIGPENDING - jo .Lio_sigpending - TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME - jo .Lio_notify_resume - TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE - jo .Lio_guarded_storage - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lio_vxrs - TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) - jnz .Lio_asce - j .Lio_return # beware of critical section cleanup - -# -# _CIF_MCCK_PENDING is set, call handler -# -.Lio_mcck_pending: - # TRACE_IRQS_ON already done at .Lio_return - brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler - TRACE_IRQS_OFF - j .Lio_return +2: LBEAR __PT_LAST_BREAK(%r11) + lmg %r0,%r15,__PT_R0(%r11) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(\name) +.endm -# -# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce -# -.Lio_asce: - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY - lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY - jz .Lio_return -#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES - tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? - jnz .Lio_set_fs_fixup - ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - j .Lio_return -.Lio_set_fs_fixup: -#endif - larl %r14,.Lio_return - jg set_fs_fixup - -# -# CIF_FPU is set, restore floating-point controls and floating-point registers. -# -.Lio_vxrs: - larl %r14,.Lio_return - jg load_fpu_regs - -# -# _TIF_GUARDED_STORAGE is set, call guarded_storage_load -# -.Lio_guarded_storage: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,gs_load_bc_cb - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return - -# -# _TIF_NEED_RESCHED is set, call schedule -# -.Lio_reschedule: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - brasl %r14,schedule # call scheduler - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return - -# -# _TIF_PATCH_PENDING is set, call klp_update_patch_state -# -#ifdef CONFIG_LIVEPATCH -.Lio_patch_pending: - lg %r2,__LC_CURRENT # pass pointer to task struct - larl %r14,.Lio_return - jg klp_update_patch_state -#endif - -# -# _TIF_SIGPENDING or is set, call do_signal -# -.Lio_sigpending: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_signal - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return - -# -# _TIF_NOTIFY_RESUME or is set, call do_notify_resume -# -.Lio_notify_resume: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_notify_resume - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF - j .Lio_return -ENDPROC(io_int_handler) +INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq +INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq /* - * External interrupt handler routine + * Load idle PSW. 
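 * psw_idle() loads an enabled-wait PSW whose address part points at
 * psw_idle_exit, so execution resumes there once an interrupt has been
 * handled; interrupts are still disabled on return. A caller sketch,
 * following arch_cpu_idle() further below:
 *
 *	psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT |
 *		   PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
 *	psw_idle(idle, psw_mask);	// returns with irqs disabled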
*/ -ENTRY(ext_int_handler) - STCK __LC_INT_CLOCK - stpt __LC_ASYNC_ENTER_TIMER - BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lg %r12,__LC_CURRENT - larl %r13,cleanup_critical - lmg %r8,%r9,__LC_EXT_OLD_PSW - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER - stmg %r0,%r7,__PT_R0(%r11) - # clear user controlled registers to prevent speculative use - xgr %r0,%r0 - xgr %r1,%r1 - xgr %r2,%r2 - xgr %r3,%r3 - xgr %r4,%r4 - xgr %r5,%r5 - xgr %r6,%r6 - xgr %r7,%r7 - xgr %r10,%r10 - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC - stmg %r8,%r9,__PT_PSW(%r11) - lghi %r1,__LC_EXT_PARAMS2 - mvc __PT_INT_CODE(4,%r11),__LC_EXT_CPU_ADDR - mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS - mvc __PT_INT_PARM_LONG(8,%r11),0(%r1) - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ - jo .Lio_restore - TRACE_IRQS_OFF - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lgr %r2,%r11 # pass pointer to pt_regs - lghi %r3,EXT_INTERRUPT - brasl %r14,do_IRQ - j .Lio_return -ENDPROC(ext_int_handler) - -/* - * Load idle PSW. The second "half" of this function is in .Lcleanup_idle. - */ -ENTRY(psw_idle) +SYM_FUNC_START(psw_idle) + stg %r14,(__SF_GPRS+8*8)(%r15) stg %r3,__SF_EMPTY(%r15) - larl %r1,.Lpsw_idle_lpsw+4 + larl %r1,psw_idle_exit stg %r1,__SF_EMPTY+8(%r15) larl %r1,smp_cpu_mtid llgf %r1,0(%r1) ltgr %r1,%r1 jz .Lpsw_idle_stcctm - .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15) + .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) .Lpsw_idle_stcctm: oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT BPON - STCK __CLOCK_IDLE_ENTER(%r2) + stckf __CLOCK_IDLE_ENTER(%r2) stpt __TIMER_IDLE_ENTER(%r2) -.Lpsw_idle_lpsw: lpswe __SF_EMPTY(%r15) +SYM_INNER_LABEL(psw_idle_exit, SYM_L_GLOBAL) BR_EX %r14 -.Lpsw_idle_end: -ENDPROC(psw_idle) - -/* - * Store floating-point controls and floating-point or vector register - * depending whether the vector facility is available. A critical section - * cleanup assures that the registers are stored even if interrupted for - * some other work. The CIF_FPU flag is set to trigger a lazy restore - * of the register contents at return from io or a system call. - */ -ENTRY(save_fpu_regs) - lg %r2,__LC_CURRENT - aghi %r2,__TASK_thread - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsave_fpu_regs_exit - stfpc __THREAD_FPU_fpc(%r2) - lg %r3,__THREAD_FPU_regs(%r2) - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jz .Lsave_fpu_regs_fp # no -> store FP regs - VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) - VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) - j .Lsave_fpu_regs_done # -> set CIF_FPU flag -.Lsave_fpu_regs_fp: - std 0,0(%r3) - std 1,8(%r3) - std 2,16(%r3) - std 3,24(%r3) - std 4,32(%r3) - std 5,40(%r3) - std 6,48(%r3) - std 7,56(%r3) - std 8,64(%r3) - std 9,72(%r3) - std 10,80(%r3) - std 11,88(%r3) - std 12,96(%r3) - std 13,104(%r3) - std 14,112(%r3) - std 15,120(%r3) -.Lsave_fpu_regs_done: - oi __LC_CPU_FLAGS+7,_CIF_FPU -.Lsave_fpu_regs_exit: - BR_EX %r14 -.Lsave_fpu_regs_end: -ENDPROC(save_fpu_regs) -EXPORT_SYMBOL(save_fpu_regs) - -/* - * Load floating-point controls and floating-point or vector registers. - * A critical section cleanup assures that the register contents are - * loaded even if interrupted for some other work. 
- * - * There are special calling conventions to fit into sysc and io return work: - * %r15: <kernel stack> - * The function requires: - * %r4 - */ -load_fpu_regs: - lg %r4,__LC_CURRENT - aghi %r4,__TASK_thread - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jno .Lload_fpu_regs_exit - lfpc __THREAD_FPU_fpc(%r4) - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area - jz .Lload_fpu_regs_fp # -> no VX, load FP regs - VLM %v0,%v15,0,%r4 - VLM %v16,%v31,256,%r4 - j .Lload_fpu_regs_done -.Lload_fpu_regs_fp: - ld 0,0(%r4) - ld 1,8(%r4) - ld 2,16(%r4) - ld 3,24(%r4) - ld 4,32(%r4) - ld 5,40(%r4) - ld 6,48(%r4) - ld 7,56(%r4) - ld 8,64(%r4) - ld 9,72(%r4) - ld 10,80(%r4) - ld 11,88(%r4) - ld 12,96(%r4) - ld 13,104(%r4) - ld 14,112(%r4) - ld 15,120(%r4) -.Lload_fpu_regs_done: - ni __LC_CPU_FLAGS+7,255-_CIF_FPU -.Lload_fpu_regs_exit: - BR_EX %r14 -.Lload_fpu_regs_end: -ENDPROC(load_fpu_regs) - -.L__critical_end: +SYM_FUNC_END(psw_idle) /* * Machine check handler routines */ -ENTRY(mcck_int_handler) - STCK __LC_MCCK_CLOCK +SYM_CODE_START(mcck_int_handler) BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer - sckc __LC_CLOCK_COMPARATOR # validate comparator - lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs - lg %r12,__LC_CURRENT - larl %r13,cleanup_critical + LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA # validate gprs lmg %r8,%r9,__LC_MCK_OLD_PSW TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID jno .Lmcck_panic # control registers invalid -> panic - la %r14,4095 - lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA # validate ctl regs ptlb - lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area - nill %r11,0xfc00 # MCESA_ORIGIN_MASK - TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE - jno 0f - TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID - jno 0f - .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC -0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID - jo 0f - sr %r14,%r14 -0: sfpc %r14 - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jo 0f - lghi %r14,__LC_FPREGS_SAVE_AREA - ld %f0,0(%r14) - ld %f1,8(%r14) - ld %f2,16(%r14) - ld %f3,24(%r14) - ld %f4,32(%r14) - ld %f5,40(%r14) - ld %f6,48(%r14) - ld %f7,56(%r14) - ld %f8,64(%r14) - ld %f9,72(%r14) - ld %f10,80(%r14) - ld %f11,88(%r14) - ld %f12,96(%r14) - ld %f13,104(%r14) - ld %f14,112(%r14) - ld %f15,120(%r14) - j 1f -0: VLM %v0,%v15,0,%r11 - VLM %v16,%v31,256,%r11 -1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA + lghi %r14,__LC_CPU_TIMER_SAVE_AREA mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID jo 3f - la %r14,__LC_SYNC_ENTER_TIMER - clc 0(8,%r14),__LC_ASYNC_ENTER_TIMER - jl 0f - la %r14,__LC_ASYNC_ENTER_TIMER -0: clc 0(8,%r14),__LC_EXIT_TIMER + la %r14,__LC_SYS_ENTER_TIMER + clc 0(8,%r14),__LC_EXIT_TIMER jl 1f la %r14,__LC_EXIT_TIMER 1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER @@ -1168,18 +516,27 @@ ENTRY(mcck_int_handler) 3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? 
- jnz 4f + jnz .Lmcck_user TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic -4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off - SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER -.Lmcck_skip: +#if IS_ENABLED(CONFIG_KVM) + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_user + OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f + oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST +4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT +#endif +.Lmcck_user: + lg %r15,__LC_MCCK_STACK + la %r11,STACK_FRAME_OVERHEAD(%r15) + stctg %c1,%c1,__PT_CR1(%r11) + lctlg %c1,%c1,__LC_KERNEL_ASCE + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lghi %r14,__LC_GPREGS_SAVE_AREA+64 stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 @@ -1192,42 +549,57 @@ ENTRY(mcck_int_handler) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,s390_do_machine_check - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lmcck_return - lg %r1,__LC_KERNEL_STACK # switch to kernel stack - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jno .Lmcck_return - TRACE_IRQS_OFF - brasl %r14,s390_handle_mcck - TRACE_IRQS_ON -.Lmcck_return: - lg %r14,__LC_VDSO_PER_CPU + lctlg %c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? jno 0f - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + BPON stpt __LC_EXIT_TIMER - mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER -0: lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_MCCK_PSW +0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 + LBEAR 0(%r12) + lmg %r11,%r15,__PT_R11(%r11) + LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE .Lmcck_panic: - lg %r15,__LC_NODAT_STACK - la %r11,STACK_FRAME_OVERHEAD(%r15) - j .Lmcck_skip -ENDPROC(mcck_int_handler) - -# -# PSW restart interrupt handler -# -ENTRY(restart_int_handler) - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + /* + * Iterate over all possible CPU addresses in the range 0..0xffff + * and stop each CPU using signal processor. Use compare and swap + * to allow just one CPU-stopper and prevent concurrent CPUs from + * stopping each other while leaving the others running. 
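 * A rough C equivalent of the assembly below; stap() and sigp() are
 * hypothetical wrappers for the instructions of the same name:
 *
 *	if (cmpxchg(&stop_lock, 0, 1) != 0)
 *		for (;;) ;			// lost the race, spin
 *	me = stap();				// own CPU address
 *	for (addr = 0; addr <= 0xffff; addr++)
 *		if (addr != me)
 *			while (sigp(addr, SIGP_STOP) == SIGP_CC_BUSY) ;
 *	while (sigp(me, SIGP_STOP) == SIGP_CC_BUSY) ;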
+ */ + lhi %r5,0 + lhi %r6,1 + larl %r7,stop_lock + cs %r5,%r6,0(%r7) # single CPU-stopper only + jnz 4f + larl %r7,this_cpu + stap 0(%r7) # this CPU address + lh %r4,0(%r7) + nilh %r4,0 + lhi %r0,1 + sll %r0,16 # CPU counter + lhi %r3,0 # next CPU address +0: cr %r3,%r4 + je 2f +1: sigp %r1,%r3,SIGP_STOP # stop next CPU + brc SIGP_CC_BUSY,1b +2: ahi %r3,1 + brct %r0,0b +3: sigp %r1,%r4,SIGP_STOP # stop this CPU + brc SIGP_CC_BUSY,3b +4: j 4b +SYM_CODE_END(mcck_int_handler) + +SYM_CODE_START(restart_int_handler) + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 stg %r15,__LC_SAVE_AREA_RESTART + TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 + jz 0f + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA +0: larl %r15,daton_psw + lpswe 0(%r15) # turn dat on, keep irqs off +.Ldaton: lg %r15,__LC_RESTART_STACK xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15) @@ -1236,7 +608,7 @@ ENTRY(restart_int_handler) xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) lg %r1,__LC_RESTART_FN # load fn, parm & source cpu lg %r2,__LC_RESTART_DATA - lg %r3,__LC_RESTART_SOURCE + lgf %r3,__LC_RESTART_SOURCE ltgr %r3,%r3 # test source cpu address jm 1f # negative -> skip source stop 0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu @@ -1247,7 +619,7 @@ ENTRY(restart_int_handler) 2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu brc 2,2b 3: j 3b -ENDPROC(restart_int_handler) +SYM_CODE_END(restart_int_handler) .section .kprobes.text, "ax" @@ -1257,7 +629,7 @@ ENDPROC(restart_int_handler) * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. */ -ENTRY(stack_overflow) +SYM_CODE_START(stack_overflow) lg %r15,__LC_NODAT_STACK # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -1267,278 +639,31 @@ ENTRY(stack_overflow) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_overflow -ENDPROC(stack_overflow) +SYM_CODE_END(stack_overflow) #endif -ENTRY(cleanup_critical) -#if IS_ENABLED(CONFIG_KVM) - clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap - jl 0f - clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done - jl .Lcleanup_sie -#endif - clg %r9,BASED(.Lcleanup_table) # system_call - jl 0f - clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc - jl .Lcleanup_system_call - clg %r9,BASED(.Lcleanup_table+16) # .Lsysc_tif - jl 0f - clg %r9,BASED(.Lcleanup_table+24) # .Lsysc_restore - jl .Lcleanup_sysc_tif - clg %r9,BASED(.Lcleanup_table+32) # .Lsysc_done - jl .Lcleanup_sysc_restore - clg %r9,BASED(.Lcleanup_table+40) # .Lio_tif - jl 0f - clg %r9,BASED(.Lcleanup_table+48) # .Lio_restore - jl .Lcleanup_io_tif - clg %r9,BASED(.Lcleanup_table+56) # .Lio_done - jl .Lcleanup_io_restore - clg %r9,BASED(.Lcleanup_table+64) # psw_idle - jl 0f - clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end - jl .Lcleanup_idle - clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs - jl 0f - clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end - jl .Lcleanup_save_fpu_regs - clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs - jl 0f - clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end - jl .Lcleanup_load_fpu_regs -0: BR_EX %r14,%r11 -ENDPROC(cleanup_critical) - - .align 8 -.Lcleanup_table: - .quad system_call - .quad .Lsysc_do_svc - .quad .Lsysc_tif - .quad .Lsysc_restore - .quad .Lsysc_done - .quad .Lio_tif - .quad .Lio_restore - .quad .Lio_done - .quad psw_idle - .quad .Lpsw_idle_end - .quad save_fpu_regs - .quad .Lsave_fpu_regs_end - .quad 
load_fpu_regs - .quad .Lload_fpu_regs_end - -#if IS_ENABLED(CONFIG_KVM) -.Lcleanup_table_sie: - .quad .Lsie_gmap - .quad .Lsie_done - -.Lcleanup_sie: - cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt? - je 1f - slg %r9,BASED(.Lsie_crit_mcck_start) - clg %r9,BASED(.Lsie_crit_mcck_length) - jh 1f - oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST -1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - larl %r9,sie_exit # skip forward to sie_exit - BR_EX %r14,%r11 -#endif + .section .data, "aw" + .balign 4 +SYM_DATA_LOCAL(stop_lock, .long 0) +SYM_DATA_LOCAL(this_cpu, .short 0) + .balign 8 +SYM_DATA_START_LOCAL(daton_psw) + .quad PSW_KERNEL_BITS + .quad .Ldaton +SYM_DATA_END(daton_psw) -.Lcleanup_system_call: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_system_call_insn) - jh 0f - mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __LC_SYNC_ENTER_TIMER(8),__LC_MCCK_ENTER_TIMER -0: # check if stmg has been executed - clg %r9,BASED(.Lcleanup_system_call_insn+8) - jh 0f - mvc __LC_SAVE_AREA_SYNC(64),0(%r11) -0: # check if base register setup + TIF bit load has been done - clg %r9,BASED(.Lcleanup_system_call_insn+16) - jhe 0f - # set up saved register r12 task struct pointer - stg %r12,32(%r11) - # set up saved register r13 __TASK_thread offset - mvc 40(8,%r11),BASED(.Lcleanup_system_call_const) -0: # check if the user time update has been done - clg %r9,BASED(.Lcleanup_system_call_insn+24) - jh 0f - lg %r15,__LC_EXIT_TIMER - slg %r15,__LC_SYNC_ENTER_TIMER - alg %r15,__LC_USER_TIMER - stg %r15,__LC_USER_TIMER -0: # check if the system time update has been done - clg %r9,BASED(.Lcleanup_system_call_insn+32) - jh 0f - lg %r15,__LC_LAST_UPDATE_TIMER - slg %r15,__LC_EXIT_TIMER - alg %r15,__LC_SYSTEM_TIMER - stg %r15,__LC_SYSTEM_TIMER -0: # update accounting time stamp - mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - # set up saved register r11 - lg %r15,__LC_KERNEL_STACK - la %r9,STACK_FRAME_OVERHEAD(%r15) - stg %r9,24(%r11) # r11 pt_regs pointer - # fill pt_regs - mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC - stmg %r0,%r7,__PT_R0(%r9) - mvc __PT_PSW(16,%r9),__LC_SVC_OLD_PSW - mvc __PT_INT_CODE(4,%r9),__LC_SVC_ILC - xc __PT_FLAGS(8,%r9),__PT_FLAGS(%r9) - mvi __PT_FLAGS+7(%r9),_PIF_SYSCALL - # setup saved register r15 - stg %r15,56(%r11) # r15 stack pointer - # set new psw address and exit - larl %r9,.Lsysc_do_svc - BR_EX %r14,%r11 -.Lcleanup_system_call_insn: - .quad system_call - .quad .Lsysc_stmg - .quad .Lsysc_per - .quad .Lsysc_vtime+36 - .quad .Lsysc_vtime+42 -.Lcleanup_system_call_const: - .quad __TASK_thread - -.Lcleanup_sysc_tif: - larl %r9,.Lsysc_tif - BR_EX %r14,%r11 - -.Lcleanup_sysc_restore: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_sysc_restore_insn) - jh 0f - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER -0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8) - je 1f - lg %r9,24(%r11) # get saved pointer to pt_regs - mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) - mvc 0(64,%r11),__PT_R8(%r9) - lmg %r0,%r7,__PT_R0(%r9) -1: lmg %r8,%r9,__LC_RETURN_PSW - BR_EX %r14,%r11 -.Lcleanup_sysc_restore_insn: - .quad .Lsysc_exit_timer - .quad .Lsysc_done - 4 - -.Lcleanup_io_tif: - larl %r9,.Lio_tif - BR_EX %r14,%r11 - 
-.Lcleanup_io_restore: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_io_restore_insn) - jh 0f - mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER -0: clg %r9,BASED(.Lcleanup_io_restore_insn+8) - je 1f - lg %r9,24(%r11) # get saved r11 pointer to pt_regs - mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) - mvc 0(64,%r11),__PT_R8(%r9) - lmg %r0,%r7,__PT_R0(%r9) -1: lmg %r8,%r9,__LC_RETURN_PSW - BR_EX %r14,%r11 -.Lcleanup_io_restore_insn: - .quad .Lio_exit_timer - .quad .Lio_done - 4 - -.Lcleanup_idle: - ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT - # copy interrupt clock & cpu timer - mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_INT_CLOCK - mvc __TIMER_IDLE_EXIT(8,%r2),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_MCCK_CLOCK - mvc __TIMER_IDLE_EXIT(8,%r2),__LC_MCCK_ENTER_TIMER -0: # check if stck & stpt have been executed - clg %r9,BASED(.Lcleanup_idle_insn) - jhe 1f - mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2) - mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2) -1: # calculate idle cycles - clg %r9,BASED(.Lcleanup_idle_insn) - jl 3f - larl %r1,smp_cpu_mtid - llgf %r1,0(%r1) - ltgr %r1,%r1 - jz 3f - .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15) - larl %r3,mt_cycles - ag %r3,__LC_PERCPU_OFFSET - la %r4,__SF_EMPTY+16(%r15) -2: lg %r0,0(%r3) - slg %r0,0(%r4) - alg %r0,64(%r4) - stg %r0,0(%r3) - la %r3,8(%r3) - la %r4,8(%r4) - brct %r1,2b -3: # account system time going idle - lg %r9,__LC_STEAL_TIMER - alg %r9,__CLOCK_IDLE_ENTER(%r2) - slg %r9,__LC_LAST_UPDATE_CLOCK - stg %r9,__LC_STEAL_TIMER - mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2) - lg %r9,__LC_SYSTEM_TIMER - alg %r9,__LC_LAST_UPDATE_TIMER - slg %r9,__TIMER_IDLE_ENTER(%r2) - stg %r9,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2) - # prepare return psw - nihh %r8,0xfcfd # clear irq & wait state bits - lg %r9,48(%r11) # return from psw_idle - BR_EX %r14,%r11 -.Lcleanup_idle_insn: - .quad .Lpsw_idle_lpsw - -.Lcleanup_save_fpu_regs: - larl %r9,save_fpu_regs - BR_EX %r14,%r11 - -.Lcleanup_load_fpu_regs: - larl %r9,load_fpu_regs - BR_EX %r14,%r11 - -/* - * Integer constants - */ - .align 8 -.Lcritical_start: - .quad .L__critical_start -.Lcritical_length: - .quad .L__critical_end - .L__critical_start -#if IS_ENABLED(CONFIG_KVM) -.Lsie_critical_start: - .quad .Lsie_gmap -.Lsie_critical_length: - .quad .Lsie_done - .Lsie_gmap -.Lsie_crit_mcck_start: - .quad .Lsie_entry -.Lsie_crit_mcck_length: - .quad .Lsie_skip - .Lsie_entry -#endif .section .rodata, "a" #define SYSCALL(esame,emu) .quad __s390x_ ## esame - .globl sys_call_table -sys_call_table: +SYM_DATA_START(sys_call_table) #include "asm/syscall_table.h" +SYM_DATA_END(sys_call_table) #undef SYSCALL #ifdef CONFIG_COMPAT #define SYSCALL(esame,emu) .quad __s390_ ## emu - .globl sys_call_table_emu -sys_call_table_emu: +SYM_DATA_START(sys_call_table_emu) #include "asm/syscall_table.h" +SYM_DATA_END(sys_call_table_emu) #undef SYSCALL #endif diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index b2956d49b6ad..9f41853f36b9 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -5,11 +5,11 @@ #include <linux/percpu.h> #include <linux/types.h> #include <linux/signal.h> +#include <asm/extable.h> #include <asm/ptrace.h> #include <asm/idle.h> extern void *restart_stack; -extern unsigned long suspend_zero_pages; void system_call(void); void pgm_check_handler(void); @@ -17,53 +17,29 @@ void ext_int_handler(void); void io_int_handler(void); void mcck_int_handler(void); void 
restart_int_handler(void); -void restart_call_handler(void); +void early_pgm_check_handler(void); -asmlinkage long do_syscall_trace_enter(struct pt_regs *regs); -asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs); +void __do_pgm_check(struct pt_regs *regs); +void __do_syscall(struct pt_regs *regs, int per_trap); +void __do_early_pgm_check(struct pt_regs *regs); void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); - -void addressing_exception(struct pt_regs *regs); -void data_exception(struct pt_regs *regs); -void default_trap_handler(struct pt_regs *regs); -void divide_exception(struct pt_regs *regs); -void execute_exception(struct pt_regs *regs); -void hfp_divide_exception(struct pt_regs *regs); -void hfp_overflow_exception(struct pt_regs *regs); -void hfp_significance_exception(struct pt_regs *regs); -void hfp_sqrt_exception(struct pt_regs *regs); -void hfp_underflow_exception(struct pt_regs *regs); -void illegal_op(struct pt_regs *regs); -void operand_exception(struct pt_regs *regs); -void overflow_exception(struct pt_regs *regs); -void privileged_op(struct pt_regs *regs); -void space_switch_exception(struct pt_regs *regs); -void special_op_exception(struct pt_regs *regs); -void specification_exception(struct pt_regs *regs); -void transaction_exception(struct pt_regs *regs); -void translation_exception(struct pt_regs *regs); -void vector_exception(struct pt_regs *regs); - -void do_per_trap(struct pt_regs *regs); +void do_secure_storage_access(struct pt_regs *regs); +void do_non_secure_storage_access(struct pt_regs *regs); +void do_secure_storage_violation(struct pt_regs *regs); void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); -void syscall_trace(struct pt_regs *regs, int entryexit); void kernel_stack_overflow(struct pt_regs * regs); -void do_signal(struct pt_regs *regs); void handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs); -void do_notify_resume(struct pt_regs *regs); -void __init init_IRQ(void); -void do_IRQ(struct pt_regs *regs, int irq); -void do_restart(void); -void __init startup_init_nobss(void); +void do_io_irq(struct pt_regs *regs); +void do_ext_irq(struct pt_regs *regs); +void do_restart(void *arg); void __init startup_init(void); void die(struct pt_regs *regs, const char *str); int setup_profiling_timer(unsigned int multiplier); -void __init time_init(void); -void s390_early_resume(void); unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; @@ -82,10 +58,18 @@ long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user DECLARE_PER_CPU(u64, mt_cycles[8]); -void gs_load_bc_cb(struct pt_regs *regs); -void set_fs_fixup(void); - unsigned long stack_alloc(void); void stack_free(unsigned long stack); +extern char kprobes_insn_page[]; + +extern char _samode31[], _eamode31[]; +extern char _stext_amode31[], _etext_amode31[]; +extern struct exception_table_entry _start_amode31_ex_table[]; +extern struct exception_table_entry _stop_amode31_ex_table[]; + +#define __amode31_data __section(".amode31.data") +#define __amode31_ref __section(".amode31.refs") +extern long _start_amode31_refs[], _end_amode31_refs[]; + #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c new file mode 100644 index 000000000000..f02127219a27 --- /dev/null +++ b/arch/s390/kernel/facility.c @@ -0,0 +1,21 
@@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2023 + */ + +#include <asm/facility.h> + +unsigned int stfle_size(void) +{ + static unsigned int size; + unsigned int r; + u64 dummy; + + r = READ_ONCE(size); + if (!r) { + r = __stfle_asm(&dummy, 1) + 1; + WRITE_ONCE(size, r); + } + return r; +} +EXPORT_SYMBOL(stfle_size); diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 0da378e2eb25..a4f3449cc814 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -10,8 +10,7 @@ #include <linux/sched.h> #include <asm/fpu/types.h> #include <asm/fpu/api.h> - -asm(".include \"asm/vx-insn.h\"\n"); +#include <asm/vx-insn.h> void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) { @@ -25,7 +24,7 @@ void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) /* Save floating point control */ asm volatile("stfpc %0" : "=Q" (state->fpc)); - if (!MACHINE_HAS_VX) { + if (!cpu_has_vx()) { if (flags & KERNEL_VXR_V0V7) { /* Save floating-point registers */ asm volatile("std 0,%0" : "=Q" (state->fprs[0])); @@ -107,7 +106,7 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) /* Restore floating-point controls */ asm volatile("lfpc %0" : : "Q" (state->fpc)); - if (!MACHINE_HAS_VX) { + if (!cpu_has_vx()) { if (flags & KERNEL_VXR_V0V7) { /* Restore floating-point registers */ asm volatile("ld 0,%0" : : "Q" (state->fprs[0])); @@ -175,3 +174,90 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) : "1", "cc"); } EXPORT_SYMBOL(__kernel_fpu_end); + +void __load_fpu_regs(void) +{ + unsigned long *regs = current->thread.fpu.regs; + struct fpu *state = ¤t->thread.fpu; + + sfpc_safe(state->fpc); + if (likely(cpu_has_vx())) { + asm volatile("lgr 1,%0\n" + "VLM 0,15,0,1\n" + "VLM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("ld 0,%0" : : "Q" (regs[0])); + asm volatile("ld 1,%0" : : "Q" (regs[1])); + asm volatile("ld 2,%0" : : "Q" (regs[2])); + asm volatile("ld 3,%0" : : "Q" (regs[3])); + asm volatile("ld 4,%0" : : "Q" (regs[4])); + asm volatile("ld 5,%0" : : "Q" (regs[5])); + asm volatile("ld 6,%0" : : "Q" (regs[6])); + asm volatile("ld 7,%0" : : "Q" (regs[7])); + asm volatile("ld 8,%0" : : "Q" (regs[8])); + asm volatile("ld 9,%0" : : "Q" (regs[9])); + asm volatile("ld 10,%0" : : "Q" (regs[10])); + asm volatile("ld 11,%0" : : "Q" (regs[11])); + asm volatile("ld 12,%0" : : "Q" (regs[12])); + asm volatile("ld 13,%0" : : "Q" (regs[13])); + asm volatile("ld 14,%0" : : "Q" (regs[14])); + asm volatile("ld 15,%0" : : "Q" (regs[15])); + } + clear_cpu_flag(CIF_FPU); +} + +void load_fpu_regs(void) +{ + raw_local_irq_disable(); + __load_fpu_regs(); + raw_local_irq_enable(); +} +EXPORT_SYMBOL(load_fpu_regs); + +void save_fpu_regs(void) +{ + unsigned long flags, *regs; + struct fpu *state; + + local_irq_save(flags); + + if (test_cpu_flag(CIF_FPU)) + goto out; + + state = ¤t->thread.fpu; + regs = current->thread.fpu.regs; + + asm volatile("stfpc %0" : "=Q" (state->fpc)); + if (likely(cpu_has_vx())) { + asm volatile("lgr 1,%0\n" + "VSTM 0,15,0,1\n" + "VSTM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("std 0,%0" : "=Q" (regs[0])); + asm volatile("std 1,%0" : "=Q" (regs[1])); + asm volatile("std 2,%0" : "=Q" (regs[2])); + asm volatile("std 3,%0" : "=Q" (regs[3])); + asm volatile("std 4,%0" : "=Q" (regs[4])); + asm volatile("std 5,%0" : "=Q" (regs[5])); + asm volatile("std 6,%0" : "=Q" (regs[6])); + asm volatile("std 7,%0" : "=Q" (regs[7])); + asm volatile("std 8,%0" : "=Q" (regs[8])); + asm 
volatile("std 9,%0" : "=Q" (regs[9])); + asm volatile("std 10,%0" : "=Q" (regs[10])); + asm volatile("std 11,%0" : "=Q" (regs[11])); + asm volatile("std 12,%0" : "=Q" (regs[12])); + asm volatile("std 13,%0" : "=Q" (regs[13])); + asm volatile("std 14,%0" : "=Q" (regs[14])); + asm volatile("std 15,%0" : "=Q" (regs[15])); + } + set_cpu_flag(CIF_FPU); +out: + local_irq_restore(flags); +} +EXPORT_SYMBOL(save_fpu_regs); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 1bb85f60c0dd..c46381ea04ec 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -4,8 +4,7 @@ * * Copyright IBM Corp. 2009,2014 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * Martin Schwidefsky <schwidefsky@de.ibm.com> + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ #include <linux/moduleloader.h> @@ -17,179 +16,217 @@ #include <linux/kprobes.h> #include <trace/syscall.h> #include <asm/asm-offsets.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> +#include <asm/ftrace.lds.h> +#include <asm/nospec-branch.h> #include <asm/set_memory.h> #include "entry.h" +#include "ftrace.h" /* - * The mcount code looks like this: - * stg %r14,8(%r15) # offset 0 - * larl %r1,<&counter> # offset 6 - * brasl %r14,_mcount # offset 12 - * lg %r14,8(%r15) # offset 18 - * Total length is 24 bytes. Only the first instruction will be patched - * by ftrace_make_call / ftrace_make_nop. - * The enabled ftrace code block looks like this: + * To generate function prologue either gcc's hotpatch feature (since gcc 4.8) + * or a combination of -pg -mrecord-mcount -mnop-mcount -mfentry flags + * (since gcc 9 / clang 10) is used. + * In both cases the original and also the disabled function prologue contains + * only a single six byte instruction and looks like this: + * > brcl 0,0 # offset 0 + * To enable ftrace the code gets patched like above and afterwards looks + * like this: * > brasl %r0,ftrace_caller # offset 0 - * larl %r1,<&counter> # offset 6 - * brasl %r14,_mcount # offset 12 - * lg %r14,8(%r15) # offset 18 + * + * The instruction will be patched by ftrace_make_call / ftrace_make_nop. * The ftrace function gets called with a non-standard C function call ABI * where r0 contains the return address. It is also expected that the called * function only clobbers r0 and r1, but restores r2-r15. * For module code we can't directly jump to ftrace caller, but need a * trampoline (ftrace_plt), which clobbers also r1. - * The return point of the ftrace function has offset 24, so execution - * continues behind the mcount block. - * The disabled ftrace code block looks like this: - * > jg .+24 # offset 0 - * larl %r1,<&counter> # offset 6 - * brasl %r14,_mcount # offset 12 - * lg %r14,8(%r15) # offset 18 - * The jg instruction branches to offset 24 to skip as many instructions - * as possible. 
- * In case we use gcc's hotpatch feature the original and also the disabled - * function prologue contains only a single six byte instruction and looks - * like this: - * > brcl 0,0 # offset 0 - * To enable ftrace the code gets patched like above and afterwards looks - * like this: - * > brasl %r0,ftrace_caller # offset 0 */ -unsigned long ftrace_plt; +void *ftrace_func __read_mostly = ftrace_stub; +struct ftrace_insn { + u16 opc; + s32 disp; +} __packed; -static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn) +#ifdef CONFIG_MODULES +static char *ftrace_plt; +#endif /* CONFIG_MODULES */ + +static const char *ftrace_shared_hotpatch_trampoline(const char **end) { -#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) - /* brcl 0,0 */ - insn->opc = 0xc004; - insn->disp = 0; -#else - /* stg r14,8(r15) */ - insn->opc = 0xe3e0; - insn->disp = 0xf0080024; -#endif + const char *tstart, *tend; + + tstart = ftrace_shared_hotpatch_trampoline_br; + tend = ftrace_shared_hotpatch_trampoline_br_end; +#ifdef CONFIG_EXPOLINE + if (!nospec_disable) { + tstart = ftrace_shared_hotpatch_trampoline_exrl; + tend = ftrace_shared_hotpatch_trampoline_exrl_end; + } +#endif /* CONFIG_EXPOLINE */ + if (end) + *end = tend; + return tstart; } -static inline int is_kprobe_on_ftrace(struct ftrace_insn *insn) +bool ftrace_need_init_nop(void) { -#ifdef CONFIG_KPROBES - if (insn->opc == BREAKPOINT_INSTRUCTION) - return 1; -#endif - return 0; + return true; } -static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn) +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { -#ifdef CONFIG_KPROBES - insn->opc = BREAKPOINT_INSTRUCTION; - insn->disp = KPROBE_ON_FTRACE_NOP; + static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline = + __ftrace_hotpatch_trampolines_start; + static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; + static struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_hotpatch_trampoline **next_trampoline; + struct ftrace_hotpatch_trampoline *trampolines_end; + struct ftrace_hotpatch_trampoline tmp; + struct ftrace_insn *insn; + const char *shared; + s32 disp; + + BUILD_BUG_ON(sizeof(struct ftrace_hotpatch_trampoline) != + SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE); + + next_trampoline = &next_vmlinux_trampoline; + trampolines_end = __ftrace_hotpatch_trampolines_end; + shared = ftrace_shared_hotpatch_trampoline(NULL); +#ifdef CONFIG_MODULES + if (mod) { + next_trampoline = &mod->arch.next_trampoline; + trampolines_end = mod->arch.trampolines_end; + shared = ftrace_plt; + } #endif + + if (WARN_ON_ONCE(*next_trampoline >= trampolines_end)) + return -ENOMEM; + trampoline = (*next_trampoline)++; + + /* Check for the compiler-generated fentry nop (brcl 0, .). */ + if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig)))) + return -EINVAL; + + /* Generate the trampoline. */ + tmp.brasl_opc = 0xc015; /* brasl %r1, shared */ + tmp.brasl_disp = (shared - (const char *)&trampoline->brasl_opc) / 2; + tmp.interceptor = FTRACE_ADDR; + tmp.rest_of_intercepted_function = rec->ip + sizeof(struct ftrace_insn); + s390_kernel_write(trampoline, &tmp, sizeof(tmp)); + + /* Generate a jump to the trampoline. 
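	 * brcl encodes its target as a signed halfword displacement, hence
	 * the division by 2 below. Only the 32-bit displacement field of the
	 * original "brcl 0,0" is rewritten; opcode and mask stay untouched,
	 * so the instruction remains a nop until ftrace_make_call() flips
	 * the mask.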
*/ + disp = ((char *)trampoline - (char *)rec->ip) / 2; + insn = (struct ftrace_insn *)rec->ip; + s390_kernel_write(&insn->disp, &disp, sizeof(disp)); + + return 0; } -static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn) +static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec) { -#ifdef CONFIG_KPROBES - insn->opc = BREAKPOINT_INSTRUCTION; - insn->disp = KPROBE_ON_FTRACE_CALL; -#endif + struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_insn insn; + s64 disp; + u16 opc; + + if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn))) + return ERR_PTR(-EFAULT); + disp = (s64)insn.disp * 2; + trampoline = (void *)(rec->ip + disp); + if (get_kernel_nofault(opc, &trampoline->brasl_opc)) + return ERR_PTR(-EFAULT); + if (opc != 0xc015) + return ERR_PTR(-EINVAL); + return trampoline; } int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + u64 old; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + if (get_kernel_nofault(old, &trampoline->interceptor)) + return -EFAULT; + if (old != old_addr) + return -EINVAL; + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); return 0; } -int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, - unsigned long addr) +static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) { - struct ftrace_insn orig, new, old; + u16 old; + u8 op; - if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old))) + if (get_kernel_nofault(old, addr)) return -EFAULT; - if (addr == MCOUNT_ADDR) { - /* Initial code replacement */ - ftrace_generate_orig_insn(&orig); - ftrace_generate_nop_insn(&new); - } else if (is_kprobe_on_ftrace(&old)) { - /* - * If we find a breakpoint instruction, a kprobe has been - * placed at the beginning of the function. We write the - * constant KPROBE_ON_FTRACE_NOP into the remaining four - * bytes of the original instruction so that the kprobes - * handler can execute a nop, if it reaches this breakpoint. - */ - ftrace_generate_kprobe_call_insn(&orig); - ftrace_generate_kprobe_nop_insn(&new); - } else { - /* Replace ftrace call with a nop. */ - ftrace_generate_call_insn(&orig, rec->ip); - ftrace_generate_nop_insn(&new); - } - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) + if (old != expected) return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + /* set mask field to all ones or zeroes */ + op = enable ? 0xf4 : 0x04; + s390_kernel_write((char *)addr + 1, &op, sizeof(op)); return 0; } +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + /* Expect brcl 0xf,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); +} + int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - struct ftrace_insn orig, new, old; + struct ftrace_hotpatch_trampoline *trampoline; - if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - if (is_kprobe_on_ftrace(&old)) { - /* - * If we find a breakpoint instruction, a kprobe has been - * placed at the beginning of the function. We write the - * constant KPROBE_ON_FTRACE_CALL into the remaining four - * bytes of the original instruction so that the kprobes - * handler can execute a brasl if it reaches this breakpoint. 
- */ - ftrace_generate_kprobe_nop_insn(&orig); - ftrace_generate_kprobe_call_insn(&new); - } else { - /* Replace nop with an ftrace call. */ - ftrace_generate_nop_insn(&orig); - ftrace_generate_call_insn(&new, rec->ip); - } - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) - return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); - return 0; + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); + /* Expect brcl 0x0,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); } int ftrace_update_ftrace_func(ftrace_func_t func) { + ftrace_func = func; return 0; } -int __init ftrace_dyn_arch_init(void) +void arch_ftrace_update_code(int command) { - return 0; + ftrace_modify_all_code(command); +} + +void ftrace_arch_code_modify_post_process(void) +{ + /* + * Flush any pre-fetched instructions on all + * CPUs to make the new code visible. + */ + text_poke_sync_lock(); } #ifdef CONFIG_MODULES static int __init ftrace_plt_init(void) { - unsigned int *ip; + const char *start, *end; - ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE); + ftrace_plt = module_alloc(PAGE_SIZE); if (!ftrace_plt) panic("cannot allocate ftrace plt\n"); - ip = (unsigned int *) ftrace_plt; - ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ - ip[1] = 0x100a0004; - ip[2] = 0x07f10000; - ip[3] = FTRACE_ADDR >> 32; - ip[4] = FTRACE_ADDR & 0xffffffff; - set_memory_ro(ftrace_plt, 1); + + start = ftrace_shared_hotpatch_trampoline(&end); + memcpy(ftrace_plt, start, end - start); + set_memory_rox((unsigned long)ftrace_plt, 1); return 0; } device_initcall(ftrace_plt_init); @@ -226,18 +263,78 @@ NOKPROBE_SYMBOL(prepare_ftrace_return); */ int ftrace_enable_ftrace_graph_caller(void) { - u8 op = 0x04; /* set mask field to zero */ + int rc; - s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); + /* Expect brc 0xf,... */ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); + if (rc) + return rc; + text_poke_sync_lock(); return 0; } int ftrace_disable_ftrace_graph_caller(void) { - u8 op = 0xf4; /* set mask field to all ones */ + int rc; - s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); + /* Expect brc 0x0,... 
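	 * i.e. the branch around the graph caller is currently a nop (mask
	 * 0). Writing mask 0xf below makes it unconditional again, so the
	 * graph code is skipped; either way only the single mask byte is
	 * patched.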
*/ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); + if (rc) + return rc; + text_poke_sync_lock(); return 0; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_KPROBES_ON_FTRACE +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct kprobe_ctlblk *kcb; + struct pt_regs *regs; + struct kprobe *p; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + regs = ftrace_get_regs(fregs); + p = get_kprobe((kprobe_opcode_t *)ip); + if (!regs || unlikely(!p) || kprobe_disabled(p)) + goto out; + + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + goto out; + } + + __this_cpu_write(current_kprobe, p); + + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + instruction_pointer_set(regs, ip); + + if (!p->pre_handler || !p->pre_handler(p, regs)) { + + instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE); + + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + } + __this_cpu_write(current_kprobe, NULL); +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + return 0; +} +#endif diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h new file mode 100644 index 000000000000..7f75a9616406 --- /dev/null +++ b/arch/s390/kernel/ftrace.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FTRACE_H +#define _FTRACE_H + +#include <asm/types.h> + +struct ftrace_hotpatch_trampoline { + u16 brasl_opc; + s32 brasl_disp; + s16: 16; + u64 rest_of_intercepted_function; + u64 interceptor; +} __packed; + +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[]; +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[]; +extern const char ftrace_shared_hotpatch_trampoline_br[]; +extern const char ftrace_shared_hotpatch_trampoline_br_end[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; +extern const char ftrace_plt_template[]; +extern const char ftrace_plt_template_end[]; + +#endif /* _FTRACE_H */ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c index d14dd1c2e524..0b68168d9566 100644 --- a/arch/s390/kernel/guarded_storage.c +++ b/arch/s390/kernel/guarded_storage.c @@ -28,7 +28,7 @@ static int gs_enable(void) return -ENOMEM; gs_cb->gsd = 25; preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; preempt_enable(); @@ -42,7 +42,7 @@ static int gs_disable(void) preempt_disable(); kfree(current->thread.gs_cb); current->thread.gs_cb = NULL; - __ctl_clear_bit(2, 4); + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); preempt_enable(); } return 0; @@ -84,7 +84,7 @@ void gs_load_bc_cb(struct pt_regs *regs) if (gs_cb) { kfree(current->thread.gs_cb); current->thread.gs_bc_cb = NULL; - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; } diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 8b88dbbda7df..45413b04efc5 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -5,7 +5,6 @@ * Author(s): Hartmut Penner <hp@de.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * Rob van der Heij <rvdhei@iae.nl> - * Heiko 
Carstens <heiko.carstens@de.ibm.com> * */ @@ -17,32 +16,25 @@ #include <asm/ptrace.h> __HEAD -ENTRY(startup_continue) - tm __LC_STFLE_FAC_LIST+5,0x80 # LPP available ? - jz 0f - xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid - mvi __LC_LPP,0x80 # and set LPP_MAGIC - .insn s,0xb2800000,__LC_LPP # load program parameter -0: larl %r1,tod_clock_base +SYM_CODE_START(startup_continue) + larl %r1,tod_clock_base mvc 0(16,%r1),__LC_BOOT_CLOCK - larl %r13,.LPG1 # get base # # Setup stack # larl %r14,init_task stg %r14,__LC_CURRENT - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD-__PT_SIZE -#ifdef CONFIG_KASAN - brasl %r14,kasan_early_init -#endif + larl %r15,init_thread_union+STACK_INIT_OFFSET + stg %r15,__LC_KERNEL_STACK + brasl %r14,sclp_early_adjust_va # allow sclp_early_printk brasl %r14,startup_init # s390 specific early init brasl %r14,start_kernel # common init code # # We returned from start_kernel ?!? PANIK # basr %r13,0 - lpswe .Ldw-.(%r13) # load disabled wait psw + lpswe dw_psw-.(%r13) # load disabled wait psw +SYM_CODE_END(startup_continue) - .align 16 -.LPG1: -.Ldw: .quad 0x0002000180000000,0x0000000000000000 + .balign 16 +SYM_DATA_LOCAL(dw_psw, .quad 0x0002000180000000,0x0000000000000000) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 8f8456816d83..e7239aaf428b 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -9,134 +9,86 @@ #include <linux/kernel.h> #include <linux/kernel_stat.h> -#include <linux/kprobes.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/cpu.h> -#include <linux/sched/cputime.h> +#include <trace/events/power.h> +#include <asm/cpu_mf.h> +#include <asm/cputime.h> #include <asm/nmi.h> #include <asm/smp.h> #include "entry.h" static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); -void enabled_wait(void) +void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - unsigned long long idle_time; - unsigned long psw_mask; + unsigned long idle_time; + u64 cycles_new[8]; + int i; + + if (smp_cpu_mtid) { + stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); + for (i = 0; i < smp_cpu_mtid; i++) + this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + } - trace_hardirqs_on(); + idle_time = S390_lowcore.int_clock - idle->clock_idle_enter; - /* Wait for external, I/O or machine check interrupt. */ - psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; - clear_cpu_flag(CIF_NOHZ_DELAY); + S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; + S390_lowcore.last_update_clock = S390_lowcore.int_clock; - /* Call the assembler magic in entry.S */ - psw_idle(idle, psw_mask); - - trace_hardirqs_off(); + S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; + S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; /* Account time spent with enabled wait psw loaded as idle time. 
*/ - write_seqcount_begin(&idle->seqcount); - idle_time = idle->clock_idle_exit - idle->clock_idle_enter; - idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; - idle->idle_time += idle_time; - idle->idle_count++; + WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); account_idle_time(cputime_to_nsecs(idle_time)); - write_seqcount_end(&idle->seqcount); } -NOKPROBE_SYMBOL(enabled_wait); + +void noinstr arch_cpu_idle(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + unsigned long psw_mask; + + /* Wait for external, I/O or machine check interrupt. */ + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + clear_cpu_flag(CIF_NOHZ_DELAY); + + /* psw_idle() returns with interrupts disabled. */ + psw_idle(idle, psw_mask); +} static ssize_t show_idle_count(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned long long idle_count; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_count = READ_ONCE(idle->idle_count); - if (READ_ONCE(idle->clock_idle_enter)) - idle_count++; - } while (read_seqcount_retry(&idle->seqcount, seq)); - return sprintf(buf, "%llu\n", idle_count); + + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count)); } DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); static ssize_t show_idle_time(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { - unsigned long long now, idle_time, idle_enter, idle_exit, in_idle; struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_time = READ_ONCE(idle->idle_time); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - idle_time += in_idle; - return sprintf(buf, "%llu\n", idle_time >> 12); -} -DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); -u64 arch_cpu_idle_time(int cpu) -{ - struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); - unsigned long long now, idle_enter, idle_exit, in_idle; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - return cputime_to_nsecs(in_idle); + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12); } +DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); void arch_cpu_idle_enter(void) { - local_mcck_disable(); -} - -void arch_cpu_idle(void) -{ - if (!test_cpu_flag(CIF_MCCK_PENDING)) - /* Halt the cpu and keep track of cpu time accounting. 
*/ - enabled_wait(); - local_irq_enable(); } void arch_cpu_idle_exit(void) { - local_mcck_enable(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); } -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { cpu_die(); } diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 6837affc19e8..ba75f6bee774 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -4,7 +4,6 @@ * * Copyright IBM Corp. 2005, 2012 * Author(s): Michael Holzheu <holzheu@de.ibm.com> - * Heiko Carstens <heiko.carstens@de.ibm.com> * Volker Sameske <sameske@de.ibm.com> */ @@ -13,12 +12,15 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/delay.h> +#include <linux/kstrtox.h> +#include <linux/panic_notifier.h> #include <linux/reboot.h> #include <linux/ctype.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/crash_dump.h> #include <linux/debug_locks.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/ipl.h> #include <asm/smp.h> @@ -28,6 +30,7 @@ #include <asm/sclp.h> #include <asm/checksum.h> #include <asm/debug.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/sections.h> #include <asm/boot_data.h> @@ -37,12 +40,18 @@ #define IPL_UNKNOWN_STR "unknown" #define IPL_CCW_STR "ccw" +#define IPL_ECKD_STR "eckd" +#define IPL_ECKD_DUMP_STR "eckd_dump" #define IPL_FCP_STR "fcp" #define IPL_FCP_DUMP_STR "fcp_dump" +#define IPL_NVME_STR "nvme" +#define IPL_NVME_DUMP_STR "nvme_dump" #define IPL_NSS_STR "nss" #define DUMP_CCW_STR "ccw" +#define DUMP_ECKD_STR "eckd" #define DUMP_FCP_STR "fcp" +#define DUMP_NVME_STR "nvme" #define DUMP_NONE_STR "none" /* @@ -87,12 +96,20 @@ static char *ipl_type_str(enum ipl_type type) switch (type) { case IPL_TYPE_CCW: return IPL_CCW_STR; + case IPL_TYPE_ECKD: + return IPL_ECKD_STR; + case IPL_TYPE_ECKD_DUMP: + return IPL_ECKD_DUMP_STR; case IPL_TYPE_FCP: return IPL_FCP_STR; case IPL_TYPE_FCP_DUMP: return IPL_FCP_DUMP_STR; case IPL_TYPE_NSS: return IPL_NSS_STR; + case IPL_TYPE_NVME: + return IPL_NVME_STR; + case IPL_TYPE_NVME_DUMP: + return IPL_NVME_DUMP_STR; case IPL_TYPE_UNKNOWN: default: return IPL_UNKNOWN_STR; @@ -103,6 +120,8 @@ enum dump_type { DUMP_TYPE_NONE = 1, DUMP_TYPE_CCW = 2, DUMP_TYPE_FCP = 4, + DUMP_TYPE_NVME = 8, + DUMP_TYPE_ECKD = 16, }; static char *dump_type_str(enum dump_type type) @@ -112,8 +131,12 @@ static char *dump_type_str(enum dump_type type) return DUMP_NONE_STR; case DUMP_TYPE_CCW: return DUMP_CCW_STR; + case DUMP_TYPE_ECKD: + return DUMP_ECKD_STR; case DUMP_TYPE_FCP: return DUMP_FCP_STR; + case DUMP_TYPE_NVME: + return DUMP_NVME_STR; default: return NULL; } @@ -133,37 +156,48 @@ static int reipl_capabilities = IPL_TYPE_UNKNOWN; static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN; static struct ipl_parameter_block *reipl_block_fcp; +static struct ipl_parameter_block *reipl_block_nvme; static struct ipl_parameter_block *reipl_block_ccw; +static struct ipl_parameter_block *reipl_block_eckd; static struct ipl_parameter_block *reipl_block_nss; static struct ipl_parameter_block *reipl_block_actual; static int dump_capabilities = DUMP_TYPE_NONE; static enum dump_type dump_type = DUMP_TYPE_NONE; static struct ipl_parameter_block *dump_block_fcp; +static struct ipl_parameter_block *dump_block_nvme; static struct ipl_parameter_block *dump_block_ccw; +static struct ipl_parameter_block *dump_block_eckd; static struct sclp_ipl_info sclp_ipl_info; -static inline int __diag308(unsigned long subcode, void *addr) +static bool reipl_nvme_clear; +static bool 
reipl_fcp_clear; +static bool reipl_ccw_clear; +static bool reipl_eckd_clear; + +static unsigned long os_info_flags; + +static inline int __diag308(unsigned long subcode, unsigned long addr) { - register unsigned long _addr asm("0") = (unsigned long) addr; - register unsigned long _rc asm("1") = 0; + union register_pair r1; + r1.even = addr; + r1.odd = 0; asm volatile( - " diag %0,%2,0x308\n" + " diag %[r1],%[subcode],0x308\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) - : "+d" (_addr), "+d" (_rc) - : "d" (subcode) : "cc", "memory"); - return _rc; + : [r1] "+&d" (r1.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + return r1.odd; } int diag308(unsigned long subcode, void *addr) { - if (IS_ENABLED(CONFIG_KASAN)) - __arch_local_irq_stosm(0x04); /* enable DAT */ diag_stat_inc(DIAG_STAT_X308); - return __diag308(subcode, addr); + return __diag308(subcode, addr ? virt_to_phys(addr) : 0); } EXPORT_SYMBOL_GPL(diag308); @@ -174,7 +208,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ char *page) \ { \ - return snprintf(page, PAGE_SIZE, _format, ##args); \ + return scnprintf(page, PAGE_SIZE, _format, ##args); \ } #define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \ @@ -200,14 +234,14 @@ IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \ _ipl_blk.ssid, _ipl_blk.devno); \ IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, (S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) \ #define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, S_IRUGO, sys_##_prefix##_##_name##_show, NULL) + __ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL) #define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ @@ -222,7 +256,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) @@ -232,12 +266,12 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ struct kobj_attribute *attr, \ const char *buf, size_t len) \ { \ - strncpy(_value, buf, sizeof(_value) - 1); \ + strscpy(_value, buf, sizeof(_value)); \ strim(_value); \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) @@ -258,6 +292,16 @@ static __init enum ipl_type get_ipl_type(void) return IPL_TYPE_FCP_DUMP; else return IPL_TYPE_FCP; + case IPL_PBT_NVME: + if (ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return IPL_TYPE_NVME_DUMP; + else + return IPL_TYPE_NVME; + case IPL_PBT_ECKD: + if (ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return IPL_TYPE_ECKD_DUMP; + else + return IPL_TYPE_ECKD; } return IPL_TYPE_UNKNOWN; } @@ -302,7 +346,7 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj, } static struct kobj_attribute sys_ipl_vm_parm_attr = - __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL); + __ATTR(parm, 0444, ipl_vm_parm_show, NULL); static ssize_t sys_ipl_device_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) @@ -311,16 
+355,23 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, case IPL_TYPE_CCW: return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, ipl_block.ccw.devno); + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + return sprintf(page, "0.%x.%04x\n", ipl_block.eckd.ssid, + ipl_block.eckd.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + return sprintf(page, "%08ux\n", ipl_block.nvme.fid); default: return 0; } } static struct kobj_attribute sys_ipl_device_attr = - __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL); + __ATTR(device, 0444, sys_ipl_device_show, NULL); static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, @@ -330,7 +381,7 @@ static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, ipl_block.hdr.len); } static struct bin_attribute ipl_parameter_attr = - __BIN_ATTR(binary_parameter, S_IRUGO, ipl_parameter_read, NULL, + __BIN_ATTR(binary_parameter, 0444, ipl_parameter_read, NULL, PAGE_SIZE); static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, @@ -342,8 +393,35 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, return memory_read_from_buffer(buf, count, &off, scp_data, size); } + +static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.nvme.scp_data_len; + void *scp_data = &ipl_block.nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t ipl_eckd_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.eckd.scp_data_len; + void *scp_data = &ipl_block.eckd.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + static struct bin_attribute ipl_scp_data_attr = - __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE); + __BIN_ATTR(scp_data, 0444, ipl_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute ipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, 0444, ipl_nvme_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute ipl_eckd_scp_data_attr = + __BIN_ATTR(scp_data, 0444, ipl_eckd_scp_data_read, NULL, PAGE_SIZE); static struct bin_attribute *ipl_fcp_bin_attrs[] = { &ipl_parameter_attr, @@ -351,6 +429,18 @@ static struct bin_attribute *ipl_fcp_bin_attrs[] = { NULL, }; +static struct bin_attribute *ipl_nvme_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_nvme_scp_data_attr, + NULL, +}; + +static struct bin_attribute *ipl_eckd_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_eckd_scp_data_attr, + NULL, +}; + /* FCP ipl device attributes */ DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", @@ -362,6 +452,94 @@ DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", (unsigned long long)ipl_block.fcp.br_lba); +/* NVMe ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_nvme, fid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.fid); +DEFINE_IPL_ATTR_RO(ipl_nvme, nsid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.nsid); +DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n", + (unsigned long long)ipl_block.nvme.bootprog); +DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n", + (unsigned long long)ipl_block.nvme.br_lba); + +/* ECKD ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_eckd, bootprog, 
"%lld\n", + (unsigned long long)ipl_block.eckd.bootprog); + +#define IPL_ATTR_BR_CHR_SHOW_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *buf) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + \ + if (!ipb->br_chr.cyl && \ + !ipb->br_chr.head && \ + !ipb->br_chr.record) \ + return sprintf(buf, "auto\n"); \ + \ + return sprintf(buf, "0x%x,0x%x,0x%x\n", \ + ipb->br_chr.cyl, \ + ipb->br_chr.head, \ + ipb->br_chr.record); \ +} + +#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + unsigned long args[3] = { 0 }; \ + char *p, *p1, *tmp = NULL; \ + int i, rc; \ + \ + if (!strncmp(buf, "auto", 4)) \ + goto out; \ + \ + tmp = kstrdup(buf, GFP_KERNEL); \ + p = tmp; \ + for (i = 0; i < 3; i++) { \ + p1 = strsep(&p, ", "); \ + if (!p1) { \ + rc = -EINVAL; \ + goto err; \ + } \ + rc = kstrtoul(p1, 0, args + i); \ + if (rc) \ + goto err; \ + } \ + \ + rc = -EINVAL; \ + if (i != 3) \ + goto err; \ + \ + if ((args[0] || args[1]) && !args[2]) \ + goto err; \ + \ + if (args[0] > UINT_MAX || args[1] > 255 || args[2] > 255) \ + goto err; \ + \ +out: \ + ipb->br_chr.cyl = args[0]; \ + ipb->br_chr.head = args[1]; \ + ipb->br_chr.record = args[2]; \ + rc = len; \ +err: \ + kfree(tmp); \ + return rc; \ +} + +IPL_ATTR_BR_CHR_SHOW_FN(ipl, ipl_block.eckd); +static struct kobj_attribute sys_ipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_ipl_br_chr_show, NULL); + +IPL_ATTR_BR_CHR_SHOW_FN(reipl, reipl_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(reipl, reipl_block_eckd->eckd); + +static struct kobj_attribute sys_reipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_reipl_br_chr_show, eckd_reipl_br_chr_store); + static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -379,15 +557,12 @@ static struct kobj_attribute sys_ipl_ccw_loadparm_attr = __ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL); static struct attribute *ipl_fcp_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_fcp_wwpn_attr.attr, &sys_ipl_fcp_lun_attr.attr, &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -396,24 +571,45 @@ static struct attribute_group ipl_fcp_attr_group = { .bin_attrs = ipl_fcp_bin_attrs, }; +static struct attribute *ipl_nvme_attrs[] = { + &sys_ipl_nvme_fid_attr.attr, + &sys_ipl_nvme_nsid_attr.attr, + &sys_ipl_nvme_bootprog_attr.attr, + &sys_ipl_nvme_br_lba_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group ipl_nvme_attr_group = { + .attrs = ipl_nvme_attrs, + .bin_attrs = ipl_nvme_bin_attrs, +}; + +static struct attribute *ipl_eckd_attrs[] = { + &sys_ipl_eckd_bootprog_attr.attr, + &sys_ipl_eckd_br_chr_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_device_attr.attr, + NULL, +}; + +static struct attribute_group ipl_eckd_attr_group = { + .attrs = ipl_eckd_attrs, + .bin_attrs = ipl_eckd_bin_attrs, +}; + /* CCW ipl device attributes */ static struct attribute *ipl_ccw_attrs_vm[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; static struct attribute *ipl_ccw_attrs_lpar[] = { - &sys_ipl_type_attr.attr, 
&sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -425,22 +621,21 @@ static struct attribute_group ipl_ccw_attr_group_lpar = { .attrs = ipl_ccw_attrs_lpar }; -/* UNKNOWN ipl device attributes */ - -static struct attribute *ipl_unknown_attrs[] = { +static struct attribute *ipl_common_attrs[] = { &sys_ipl_type_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_unknown_attr_group = { - .attrs = ipl_unknown_attrs, +static struct attribute_group ipl_common_attr_group = { + .attrs = ipl_common_attrs, }; static struct kset *ipl_kset; static void __ipl_run(void *unused) { - __bpon(); diag308(DIAG308_LOAD_CLEAR, NULL); } @@ -458,6 +653,9 @@ static int __init ipl_init(void) rc = -ENOMEM; goto out; } + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group); + if (rc) + goto out; switch (ipl_info.type) { case IPL_TYPE_CCW: if (MACHINE_IS_VM) @@ -467,13 +665,19 @@ static int __init ipl_init(void) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_ccw_attr_group_lpar); break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group); + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); break; + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); + break; default: - rc = sysfs_create_group(&ipl_kset->kobj, - &ipl_unknown_attr_group); break; } out: @@ -564,11 +768,11 @@ static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_nss_vmparm_show, - reipl_nss_vmparm_store); + __ATTR(parm, 0644, reipl_nss_vmparm_show, + reipl_nss_vmparm_store); static struct kobj_attribute sys_reipl_ccw_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_ccw_vmparm_show, - reipl_ccw_vmparm_store); + __ATTR(parm, 0644, reipl_ccw_vmparm_show, + reipl_ccw_vmparm_store); /* FCP reipl device attributes */ @@ -608,7 +812,7 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, return count; } static struct bin_attribute sys_reipl_fcp_scp_data_attr = - __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_fcp_scpdata_read, + __BIN_ATTR(scp_data, 0644, reipl_fcp_scpdata_read, reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE); static struct bin_attribute *reipl_fcp_bin_attrs[] = { @@ -673,24 +877,43 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return len; } -/* FCP wrapper */ -static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) +#define DEFINE_GENERIC_LOADPARM(name) \ +static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *page) \ +{ \ + return reipl_generic_loadparm_show(reipl_block_##name, page); \ +} \ +static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \ +} \ +static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \ + __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \ + reipl_##name##_loadparm_store) + +DEFINE_GENERIC_LOADPARM(fcp); +DEFINE_GENERIC_LOADPARM(nvme); +DEFINE_GENERIC_LOADPARM(ccw); +DEFINE_GENERIC_LOADPARM(nss); +DEFINE_GENERIC_LOADPARM(eckd); + +static ssize_t 
reipl_fcp_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) { - return reipl_generic_loadparm_show(reipl_block_fcp, page); + return sprintf(page, "%u\n", reipl_fcp_clear); } -static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t reipl_fcp_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) { - return reipl_generic_loadparm_store(reipl_block_fcp, buf, len); + if (kstrtobool(buf, &reipl_fcp_clear) < 0) + return -EINVAL; + return len; } -static struct kobj_attribute sys_reipl_fcp_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show, - reipl_fcp_loadparm_store); - static struct attribute *reipl_fcp_attrs[] = { &sys_reipl_fcp_device_attr.attr, &sys_reipl_fcp_wwpn_attr.attr, @@ -706,51 +929,129 @@ static struct attribute_group reipl_fcp_attr_group = { .bin_attrs = reipl_fcp_bin_attrs, }; -/* CCW reipl device attributes */ -DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); +static struct kobj_attribute sys_reipl_fcp_clear_attr = + __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store); -/* NSS wrapper */ -static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) +/* NVME reipl device attributes */ + +static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) { - return reipl_generic_loadparm_show(reipl_block_nss, page); + size_t size = reipl_block_nvme->nvme.scp_data_len; + void *scp_data = reipl_block_nvme->nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); } -static ssize_t reipl_nss_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) { - return reipl_generic_loadparm_store(reipl_block_nss, buf, len); + size_t scpdata_len = count; + size_t padding; + + if (off) + return -EINVAL; + + memcpy(reipl_block_nvme->nvme.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_nvme->nvme.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.scp_data_len = scpdata_len; + + return count; } -/* CCW wrapper */ -static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) +static struct bin_attribute sys_reipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, 0644, reipl_nvme_scpdata_read, + reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_nvme_bin_attrs[] = { + &sys_reipl_nvme_scp_data_attr, + NULL, +}; + +DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.br_lba); + +static struct attribute *reipl_nvme_attrs[] = { + &sys_reipl_nvme_fid_attr.attr, + &sys_reipl_nvme_nsid_attr.attr, + &sys_reipl_nvme_bootprog_attr.attr, + &sys_reipl_nvme_br_lba_attr.attr, 
+ &sys_reipl_nvme_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_nvme_attr_group = { + .attrs = reipl_nvme_attrs, + .bin_attrs = reipl_nvme_bin_attrs +}; + +static ssize_t reipl_nvme_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_nvme_clear); +} + +static ssize_t reipl_nvme_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_nvme_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_nvme_clear_attr = + __ATTR(clear, 0644, reipl_nvme_clear_show, reipl_nvme_clear_store); + +/* CCW reipl device attributes */ +DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); + +static ssize_t reipl_ccw_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) { - return reipl_generic_loadparm_show(reipl_block_ccw, page); + return sprintf(page, "%u\n", reipl_ccw_clear); } -static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t reipl_ccw_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) { - return reipl_generic_loadparm_store(reipl_block_ccw, buf, len); + if (kstrtobool(buf, &reipl_ccw_clear) < 0) + return -EINVAL; + return len; } -static struct kobj_attribute sys_reipl_ccw_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show, - reipl_ccw_loadparm_store); +static struct kobj_attribute sys_reipl_ccw_clear_attr = + __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store); static struct attribute *reipl_ccw_attrs_vm[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, &sys_reipl_ccw_vmparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; static struct attribute *reipl_ccw_attrs_lpar[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; @@ -764,6 +1065,86 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { .attrs = reipl_ccw_attrs_lpar, }; +/* ECKD reipl device attributes */ + +static ssize_t reipl_eckd_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_eckd->eckd.scp_data_len; + void *scp_data = reipl_block_eckd->eckd.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_eckd_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t scpdata_len = count; + size_t padding; + + if (off) + return -EINVAL; + + memcpy(reipl_block_eckd->eckd.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_eckd->eckd.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN + scpdata_len; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN + scpdata_len; + reipl_block_eckd->eckd.scp_data_len = scpdata_len; + + return count; +} + +static struct bin_attribute sys_reipl_eckd_scp_data_attr = + __BIN_ATTR(scp_data, 0644, reipl_eckd_scpdata_read, + reipl_eckd_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_eckd_bin_attrs[] = { + &sys_reipl_eckd_scp_data_attr, + NULL, +}; + +DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, 
"%lld\n", "%lld\n", + reipl_block_eckd->eckd.bootprog); + +static struct attribute *reipl_eckd_attrs[] = { + &sys_reipl_eckd_device_attr.attr, + &sys_reipl_eckd_bootprog_attr.attr, + &sys_reipl_eckd_br_chr_attr.attr, + &sys_reipl_eckd_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_eckd_attr_group = { + .attrs = reipl_eckd_attrs, + .bin_attrs = reipl_eckd_bin_attrs +}; + +static ssize_t reipl_eckd_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_eckd_clear); +} + +static ssize_t reipl_eckd_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_eckd_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_eckd_clear_attr = + __ATTR(clear, 0644, reipl_eckd_clear_show, reipl_eckd_clear_store); /* NSS reipl device attributes */ static void reipl_get_ascii_nss_name(char *dst, @@ -811,12 +1192,8 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_name_attr = - __ATTR(name, S_IRUGO | S_IWUSR, reipl_nss_name_show, - reipl_nss_name_store); - -static struct kobj_attribute sys_reipl_nss_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nss_loadparm_show, - reipl_nss_loadparm_store); + __ATTR(name, 0644, reipl_nss_name_show, + reipl_nss_name_store); static struct attribute *reipl_nss_attrs[] = { &sys_reipl_nss_name_attr.attr, @@ -847,9 +1224,15 @@ static int reipl_set_type(enum ipl_type type) case IPL_TYPE_CCW: reipl_block_actual = reipl_block_ccw; break; + case IPL_TYPE_ECKD: + reipl_block_actual = reipl_block_eckd; + break; case IPL_TYPE_FCP: reipl_block_actual = reipl_block_fcp; break; + case IPL_TYPE_NVME: + reipl_block_actual = reipl_block_nvme; + break; case IPL_TYPE_NSS: reipl_block_actual = reipl_block_nss; break; @@ -874,8 +1257,12 @@ static ssize_t reipl_type_store(struct kobject *kobj, if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0) rc = reipl_set_type(IPL_TYPE_CCW); + else if (strncmp(buf, IPL_ECKD_STR, strlen(IPL_ECKD_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_ECKD); else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0) rc = reipl_set_type(IPL_TYPE_FCP); + else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_NVME); else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0) rc = reipl_set_type(IPL_TYPE_NSS); return (rc != 0) ? 
rc : len; @@ -886,17 +1273,39 @@ static struct kobj_attribute reipl_type_attr = static struct kset *reipl_kset; static struct kset *reipl_fcp_kset; +static struct kset *reipl_nvme_kset; +static struct kset *reipl_eckd_kset; static void __reipl_run(void *unused) { switch (reipl_type) { case IPL_TYPE_CCW: diag308(DIAG308_SET, reipl_block_ccw); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_ccw_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); + break; + case IPL_TYPE_ECKD: + diag308(DIAG308_SET, reipl_block_eckd); + if (reipl_eckd_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); break; case IPL_TYPE_FCP: diag308(DIAG308_SET, reipl_block_fcp); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_fcp_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); + break; + case IPL_TYPE_NVME: + diag308(DIAG308_SET, reipl_block_nvme); + if (reipl_nvme_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); break; case IPL_TYPE_NSS: diag308(DIAG308_SET, reipl_block_nss); @@ -906,6 +1315,8 @@ static void __reipl_run(void *unused) diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_FCP_DUMP: + case IPL_TYPE_NVME_DUMP: + case IPL_TYPE_ECKD_DUMP: break; } disabled_wait(); @@ -1008,10 +1419,16 @@ static int __init reipl_fcp_init(void) } rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); - if (rc) { - kset_unregister(reipl_fcp_kset); - free_page((unsigned long) reipl_block_fcp); - return rc; + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_fcp_kset->kobj, + &sys_reipl_fcp_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_fcp_clear = true; } if (ipl_info.type == IPL_TYPE_FCP) { @@ -1032,6 +1449,121 @@ static int __init reipl_fcp_init(void) } reipl_capabilities |= IPL_TYPE_FCP; return 0; + +out2: + sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); +out1: + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); + return rc; +} + +static int __init reipl_nvme_init(void) +{ + int rc; + + reipl_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_nvme) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_nvme_kset = kset_create_and_add(IPL_NVME_STR, NULL, + &reipl_kset->kobj); + if (!reipl_nvme_kset) { + free_page((unsigned long) reipl_block_nvme); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_nvme_kset->kobj, + &sys_reipl_nvme_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_nvme_clear = true; + } + + if (ipl_info.type == IPL_TYPE_NVME) { + memcpy(reipl_block_nvme, &ipl_block, sizeof(ipl_block)); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * is invalid in the IPL parameter block, so take it + * always from sclp_ipl_info. 
+ */ + memcpy(reipl_block_nvme->nvme.loadparm, sclp_ipl_info.loadparm, + LOADPARM_LEN); + } else { + reipl_block_nvme->hdr.len = IPL_BP_NVME_LEN; + reipl_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_nvme->nvme.len = IPL_BP0_NVME_LEN; + reipl_block_nvme->nvme.pbt = IPL_PBT_NVME; + reipl_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_NVME; + return 0; + +out2: + sysfs_remove_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); +out1: + kset_unregister(reipl_nvme_kset); + free_page((unsigned long) reipl_block_nvme); + return rc; +} + +static int __init reipl_eckd_init(void) +{ + int rc; + + if (!sclp.has_sipl_eckd) + return 0; + + reipl_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!reipl_block_eckd) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_eckd_kset = kset_create_and_add(IPL_ECKD_STR, NULL, + &reipl_kset->kobj); + if (!reipl_eckd_kset) { + free_page((unsigned long)reipl_block_eckd); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_eckd_kset->kobj, + &sys_reipl_eckd_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_eckd_clear = true; + } + + if (ipl_info.type == IPL_TYPE_ECKD) { + memcpy(reipl_block_eckd, &ipl_block, sizeof(ipl_block)); + } else { + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + reipl_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + reipl_block_eckd->eckd.pbt = IPL_PBT_ECKD; + reipl_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_ECKD; + return 0; + +out2: + sysfs_remove_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); +out1: + kset_unregister(reipl_eckd_kset); + free_page((unsigned long)reipl_block_eckd); + return rc; } static int __init reipl_type_init(void) @@ -1049,9 +1581,15 @@ static int __init reipl_type_init(void) if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) { memcpy(reipl_block_fcp, reipl_block, size); reipl_type = IPL_TYPE_FCP; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_NVME) { + memcpy(reipl_block_nvme, reipl_block, size); + reipl_type = IPL_TYPE_NVME; } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_ECKD) { + memcpy(reipl_block_eckd, reipl_block, size); + reipl_type = IPL_TYPE_ECKD; } out: return reipl_set_type(reipl_type); @@ -1072,9 +1610,15 @@ static int __init reipl_init(void) rc = reipl_ccw_init(); if (rc) return rc; + rc = reipl_eckd_init(); + if (rc) + return rc; rc = reipl_fcp_init(); if (rc) return rc; + rc = reipl_nvme_init(); + if (rc) + return rc; rc = reipl_nss_init(); if (rc) return rc; @@ -1118,6 +1662,52 @@ static struct attribute_group dump_fcp_attr_group = { .attrs = dump_fcp_attrs, }; +/* NVME dump device attributes */ +DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", + dump_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", + dump_block_nvme->nvme.br_lba); + +static struct attribute *dump_nvme_attrs[] = { + &sys_dump_nvme_fid_attr.attr, + &sys_dump_nvme_nsid_attr.attr, + &sys_dump_nvme_bootprog_attr.attr, + &sys_dump_nvme_br_lba_attr.attr, + 
NULL, +}; + +static struct attribute_group dump_nvme_attr_group = { + .name = IPL_NVME_STR, + .attrs = dump_nvme_attrs, +}; + +/* ECKD dump device attributes */ +DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->eckd.bootprog); + +IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); + +static struct kobj_attribute sys_dump_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store); + +static struct attribute *dump_eckd_attrs[] = { + &sys_dump_eckd_device_attr.attr, + &sys_dump_eckd_bootprog_attr.attr, + &sys_dump_eckd_br_chr_attr.attr, + NULL, +}; + +static struct attribute_group dump_eckd_attr_group = { + .name = IPL_ECKD_STR, + .attrs = dump_eckd_attrs, +}; + /* CCW dump device attributes */ DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw); @@ -1157,8 +1747,12 @@ static ssize_t dump_type_store(struct kobject *kobj, rc = dump_set_type(DUMP_TYPE_NONE); else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0) rc = dump_set_type(DUMP_TYPE_CCW); + else if (strncmp(buf, DUMP_ECKD_STR, strlen(DUMP_ECKD_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_ECKD); else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0) rc = dump_set_type(DUMP_TYPE_FCP); + else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_NVME); return (rc != 0) ? rc : len; } @@ -1173,7 +1767,7 @@ static void diag308_dump(void *dump_block) while (1) { if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) break; - udelay_simple(USEC_PER_SEC); + udelay(USEC_PER_SEC); } } @@ -1183,9 +1777,15 @@ static void __dump_run(void *unused) case DUMP_TYPE_CCW: diag308_dump(dump_block_ccw); break; + case DUMP_TYPE_ECKD: + diag308_dump(dump_block_eckd); + break; case DUMP_TYPE_FCP: diag308_dump(dump_block_fcp); break; + case DUMP_TYPE_NVME: + diag308_dump(dump_block_nvme); + break; default: break; } @@ -1242,6 +1842,52 @@ static int __init dump_fcp_init(void) return 0; } +static int __init dump_nvme_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump) + return 0; /* LDIPL DUMP is not installed */ + dump_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_nvme) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_nvme_attr_group); + if (rc) { + free_page((unsigned long)dump_block_nvme); + return rc; + } + dump_block_nvme->hdr.len = IPL_BP_NVME_LEN; + dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN; + dump_block_nvme->fcp.pbt = IPL_PBT_NVME; + dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_NVME; + return 0; +} + +static int __init dump_eckd_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump || !sclp.has_sipl_eckd) + return 0; /* LDIPL DUMP is not installed */ + dump_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!dump_block_eckd) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_eckd_attr_group); + if (rc) { + free_page((unsigned long)dump_block_eckd); + return rc; + } + dump_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + dump_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + dump_block_eckd->eckd.pbt = IPL_PBT_ECKD; + dump_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_ECKD; + return 0; +} + static int __init dump_init(void) { int rc; @@ -1257,9 +1903,15 @@ static int __init 
dump_init(void) rc = dump_ccw_init(); if (rc) return rc; + rc = dump_eckd_init(); + if (rc) + return rc; rc = dump_fcp_init(); if (rc) return rc; + rc = dump_nvme_init(); + if (rc) + return rc; dump_set_type(DUMP_TYPE_NONE); return 0; } @@ -1272,13 +1924,29 @@ static struct shutdown_action __refdata dump_action = { static void dump_reipl_run(struct shutdown_trigger *trigger) { - unsigned long ipib = (unsigned long) reipl_block_actual; + struct lowcore *abs_lc; unsigned int csum; + /* + * Set REIPL_CLEAR flag in os_info flags entry indicating + * 'clear' sysfs attribute has been set on the panicked system + * for specified reipl type. + * Always set for IPL_TYPE_NSS and IPL_TYPE_UNKNOWN. + */ + if ((reipl_type == IPL_TYPE_CCW && reipl_ccw_clear) || + (reipl_type == IPL_TYPE_ECKD && reipl_eckd_clear) || + (reipl_type == IPL_TYPE_FCP && reipl_fcp_clear) || + (reipl_type == IPL_TYPE_NVME && reipl_nvme_clear) || + reipl_type == IPL_TYPE_NSS || + reipl_type == IPL_TYPE_UNKNOWN) + os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR; + os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); csum = (__force unsigned int) csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); - mem_assign_absolute(S390_lowcore.ipib, ipib); - mem_assign_absolute(S390_lowcore.ipib_checksum, csum); + abs_lc = get_abs_lowcore(); + abs_lc->ipib = __pa(reipl_block_actual); + abs_lc->ipib_checksum = csum; + put_abs_lowcore(abs_lc); dump_run(trigger); } @@ -1472,7 +2140,6 @@ static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart); static void __do_restart(void *ignore) { - __arch_local_irq_stosm(0x04); /* enable DAT */ smp_send_stop(); #ifdef CONFIG_CRASH_DUMP crash_kexec(NULL); @@ -1481,12 +2148,12 @@ static void __do_restart(void *ignore) stop_run(&on_restart_trigger); } -void do_restart(void) +void do_restart(void *arg) { tracing_off(); debug_locks_off(); lgr_info_log(); - smp_call_online_cpu(__do_restart, NULL); + smp_call_online_cpu(__do_restart, arg); } /* on halt */ @@ -1684,6 +2351,11 @@ void __init setup_ipl(void) ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + ipl_info.data.eckd.dev_id.ssid = ipl_block.eckd.ssid; + ipl_info.data.eckd.dev_id.devno = ipl_block.eckd.devno; + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: ipl_info.data.fcp.dev_id.ssid = 0; @@ -1691,6 +2363,11 @@ void __init setup_ipl(void) ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn; ipl_info.data.fcp.lun = ipl_block.fcp.lun; break; + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + ipl_info.data.nvme.fid = ipl_block.nvme.fid; + ipl_info.data.nvme.nsid = ipl_block.nvme.nsid; + break; case IPL_TYPE_NSS: case IPL_TYPE_UNKNOWN: /* We have no info to copy */ @@ -1705,8 +2382,8 @@ void s390_reset_system(void) set_prefix(0); /* Disable lowcore protection */ - __ctl_clear_bit(0, 28); - diag_dma_ops.diag308_reset(); + local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); + diag_amode31_ops.diag308_reset(); } #ifdef CONFIG_KEXEC_FILE @@ -1783,7 +2460,7 @@ void *ipl_report_finish(struct ipl_report *report) buf = vzalloc(report->size); if (!buf) - return ERR_PTR(-ENOMEM); + goto out; ptr = buf; memcpy(ptr, report->ipib, report->ipib->hdr.len); @@ -1822,6 +2499,7 @@ void *ipl_report_finish(struct ipl_report *report) } BUG_ON(ptr > buf + report->size); +out: return buf; } diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c index af43535a976d..b5245fadcfb0 100644 --- 
a/arch/s390/kernel/ipl_vmparm.c +++ b/arch/s390/kernel/ipl_vmparm.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/minmax.h> +#include <linux/string.h> #include <asm/ebcdic.h> #include <asm/ipl.h> diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 8371855042dc..6f71b0ce1068 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -21,12 +21,14 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/irq.h> +#include <linux/entry-common.h> #include <asm/irq_regs.h> #include <asm/cputime.h> #include <asm/lowcore.h> #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/stacktrace.h> +#include <asm/softirq_stack.h> #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -95,27 +97,106 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; -void __init init_IRQ(void) +static void do_IRQ(struct pt_regs *regs, int irq) { - BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); - init_cio_interrupts(); - init_airq_interrupts(); - init_ext_interrupts(); -} - -void do_IRQ(struct pt_regs *regs, int irq) -{ - struct pt_regs *old_regs; - - old_regs = set_irq_regs(regs); - irq_enter(); if (tod_after_eq(S390_lowcore.int_clock, S390_lowcore.clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); - irq_exit(); +} + +static int on_async_stack(void) +{ + unsigned long frame = current_frame_address(); + + return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; +} + +static void do_irq_async(struct pt_regs *regs, int irq) +{ + if (on_async_stack()) { + do_IRQ(regs, irq); + } else { + call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + struct pt_regs *, regs, int, irq); + } +} + +static int irq_pending(struct pt_regs *regs) +{ + int cc; + + asm volatile("tpi 0\n" + "ipm %0" : "=d" (cc) : : "cc"); + return cc >> 28; +} + +void noinstr do_io_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + bool from_idle; + + irq_enter_rcu(); + + if (user_mode(regs)) { + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } + + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + account_idle_time_irq(); + + do { + regs->tpi_info = S390_lowcore.tpi_info; + if (S390_lowcore.tpi_info.adapter_IO) + do_irq_async(regs, THIN_INTERRUPT); + else + do_irq_async(regs, IO_INTERRUPT); + } while (MACHINE_IS_LPAR && irq_pending(regs)); + + irq_exit_rcu(); + set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); +} + +void noinstr do_ext_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + bool from_idle; + + irq_enter_rcu(); + + if (user_mode(regs)) { + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } + + regs->int_code = S390_lowcore.ext_int_code_addr; + regs->int_parm = S390_lowcore.ext_params; + regs->int_parm_long = S390_lowcore.ext_params2; + + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + account_idle_time_irq(); + + do_irq_async(regs, EXT_INTERRUPT); + + irq_exit_rcu(); + set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | 
PSW_MASK_WAIT); } static void show_msi_interrupt(struct seq_file *p, int irq) @@ -124,7 +205,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) unsigned long flags; int cpu; - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(irq); if (!desc) goto out; @@ -132,7 +213,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) raw_spin_lock_irqsave(&desc->lock, flags); seq_printf(p, "%3d: ", irq); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu)); if (desc->irq_data.chip) seq_printf(p, " %8s", desc->irq_data.chip->name); @@ -143,7 +224,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) seq_putc(p, '\n'); raw_spin_unlock_irqrestore(&desc->lock, flags); out: - irq_unlock_sparse(); + rcu_read_unlock(); } /* @@ -154,7 +235,7 @@ int show_interrupts(struct seq_file *p, void *v) int index = *(loff_t *) v; int cpu, irq; - get_online_cpus(); + cpus_read_lock(); if (index == 0) { seq_puts(p, " "); for_each_online_cpu(cpu) @@ -184,7 +265,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } out: - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -194,24 +275,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) } /* - * Switch to the asynchronous interrupt stack for softirq execution. - */ -void do_softirq_own_stack(void) -{ - unsigned long old, new; - - old = current_stack_pointer(); - /* Check against async. stack address range. */ - new = S390_lowcore.async_stack; - if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { - CALL_ON_STACK(__do_softirq, new, 0); - } else { - /* We are already on the async stack. */ - __do_softirq(); - } -} - -/* * ext_int_hash[index] is the list head for all external interrupts that hash * to this index. 
*/ @@ -279,7 +342,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) struct ext_int_info *p; int index; - ext_code = *(struct ext_code *) &regs->int_code; + ext_code.int_code = regs->int_code; if (ext_code.code != EXT_IRQ_CLK_COMP) set_cpu_flag(CIF_NOHZ_DELAY); @@ -294,12 +357,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) return IRQ_HANDLED; } -static struct irqaction external_interrupt = { - .name = "EXT", - .handler = do_ext_interrupt, -}; - -void __init init_ext_interrupts(void) +static void __init init_ext_interrupts(void) { int idx; @@ -308,7 +366,16 @@ void __init init_ext_interrupts(void) irq_set_chip_and_handler(EXT_INTERRUPT, &dummy_irq_chip, handle_percpu_irq); - setup_irq(EXT_INTERRUPT, &external_interrupt); + if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL)) + panic("Failed to register EXT interrupt\n"); +} + +void __init init_IRQ(void) +{ + BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); + init_cio_interrupts(); + init_airq_interrupts(); + init_ext_interrupts(); } static DEFINE_SPINLOCK(irq_subclass_lock); @@ -318,7 +385,7 @@ void irq_subclass_register(enum irq_subclass subclass) { spin_lock(&irq_subclass_lock); if (!irq_subclass_refcount[subclass]) - ctl_set_bit(0, subclass); + system_ctl_set_bit(0, subclass); irq_subclass_refcount[subclass]++; spin_unlock(&irq_subclass_lock); } @@ -329,7 +396,7 @@ void irq_subclass_unregister(enum irq_subclass subclass) { spin_lock(&irq_subclass_lock); irq_subclass_refcount[subclass]--; if (!irq_subclass_refcount[subclass]) - ctl_clear_bit(0, subclass); + system_ctl_clear_bit(0, subclass); spin_unlock(&irq_subclass_lock); } EXPORT_SYMBOL(irq_subclass_unregister); diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index ab584e8e3527..e808bb8bc0da 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -6,8 +6,9 @@ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ #include <linux/uaccess.h> -#include <linux/stop_machine.h> #include <linux/jump_label.h> +#include <linux/module.h> +#include <asm/text-patching.h> #include <asm/ipl.h> struct insn { @@ -36,21 +37,15 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected, unsigned char *ipe = (unsigned char *)expected; unsigned char *ipn = (unsigned char *)new; - pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc); + pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc); pr_emerg("Found: %6ph\n", ipc); pr_emerg("Expected: %6ph\n", ipe); pr_emerg("New: %6ph\n", ipn); panic("Corrupted kernel text"); } -static struct insn orignop = { - .opcode = 0xc004, - .offset = JUMP_LABEL_NOP_OFFSET >> 1, -}; - -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) { void *code = (void *)jump_entry_code(entry); struct insn old, new; @@ -62,29 +57,26 @@ static void __jump_label_transform(struct jump_entry *entry, jump_label_make_branch(entry, &old); jump_label_make_nop(entry, &new); } - if (init) { - if (memcmp(code, &orignop, sizeof(orignop))) - jump_label_bug(entry, &orignop, &new); - } else { - if (memcmp(code, &old, sizeof(old))) - jump_label_bug(entry, &old, &new); - } + if (memcmp(code, &old, sizeof(old))) + jump_label_bug(entry, &old, &new); s390_kernel_write(code, &new, sizeof(new)); } -static void __jump_label_sync(void *dummy) +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) { +
jump_label_transform(entry, type); + text_poke_sync(); } -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { - __jump_label_transform(entry, type, 0); - smp_call_function(__jump_label_sync, NULL, 1); + jump_label_transform(entry, type); + return true; } -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) +void arch_jump_label_transform_apply(void) { - __jump_label_transform(entry, type, 1); + text_poke_sync(); } diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 6f1388391620..f0cf20d4b3c5 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -7,6 +7,9 @@ * s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com> */ +#define pr_fmt(fmt) "kprobes: " fmt + +#include <linux/moduleloader.h> #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/preempt.h> @@ -21,28 +24,36 @@ #include <asm/set_memory.h> #include <asm/sections.h> #include <asm/dis.h> +#include "kprobes.h" +#include "entry.h" DEFINE_PER_CPU(struct kprobe *, current_kprobe); DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -DEFINE_INSN_CACHE_OPS(s390_insn); - static int insn_page_in_use; -static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (!page) + return NULL; + set_memory_rox((unsigned long)page, 1); + return page; +} static void *alloc_s390_insn_page(void) { if (xchg(&insn_page_in_use, 1) == 1) return NULL; - set_memory_x((unsigned long) &insn_page, 1); - return &insn_page; + return &kprobes_insn_page; } static void free_s390_insn_page(void *page) { - set_memory_nx((unsigned long) page, 1); xchg(&insn_page_in_use, 0); } @@ -56,44 +67,32 @@ struct kprobe_insn_cache kprobe_s390_insn_slots = { static void copy_instruction(struct kprobe *p) { - unsigned long ip = (unsigned long) p->addr; + kprobe_opcode_t insn[MAX_INSN_SIZE]; s64 disp, new_disp; u64 addr, new_addr; + unsigned int len; - if (ftrace_location(ip) == ip) { + len = insn_length(*p->addr >> 8); + memcpy(&insn, p->addr, len); + p->opcode = insn[0]; + if (probe_is_insn_relative_long(&insn[0])) { /* - * If kprobes patches the instruction that is morphed by - * ftrace make sure that kprobes always sees the branch - * "jg .+24" that skips the mcount block or the "brcl 0,0" - * in case of hotpatch. + * For pc-relative instructions in RIL-b or RIL-c format patch + * the RI2 displacement field. We have already made sure that + * the insn slot for the patched instruction is within the same + * 2GB area as the original instruction (either kernel image or + * module area). Therefore the new displacement will always fit. */ - ftrace_generate_nop_insn((struct ftrace_insn *)p->ainsn.insn); - p->ainsn.is_ftrace_insn = 1; - } else - memcpy(p->ainsn.insn, p->addr, insn_length(*p->addr >> 8)); - p->opcode = p->ainsn.insn[0]; - if (!probe_is_insn_relative_long(p->ainsn.insn)) - return; - /* - * For pc-relative instructions in RIL-b or RIL-c format patch the - * RI2 displacement field. We have already made sure that the insn - * slot for the patched instruction is within the same 2GB area - * as the original instruction (either kernel image or module area). - * Therefore the new displacement will always fit. 
- */ - disp = *(s32 *)&p->ainsn.insn[1]; - addr = (u64)(unsigned long)p->addr; - new_addr = (u64)(unsigned long)p->ainsn.insn; - new_disp = ((addr + (disp * 2)) - new_addr) / 2; - *(s32 *)&p->ainsn.insn[1] = new_disp; + disp = *(s32 *)&insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&insn[1] = new_disp; + } + s390_kernel_write(p->ainsn.insn, &insn, len); } NOKPROBE_SYMBOL(copy_instruction); -static inline int is_kernel_addr(void *addr) -{ - return addr < (void *)_end; -} - static int s390_get_insn_slot(struct kprobe *p) { /* @@ -102,7 +101,7 @@ static int s390_get_insn_slot(struct kprobe *p) * field can be patched and executed within the insn slot. */ p->ainsn.insn = NULL; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); @@ -114,7 +113,7 @@ static void s390_free_insn_slot(struct kprobe *p) { if (!p->ainsn.insn) return; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); @@ -122,9 +121,55 @@ static void s390_free_insn_slot(struct kprobe *p) } NOKPROBE_SYMBOL(s390_free_insn_slot); +/* Check if paddr is at an instruction boundary */ +static bool can_probe(unsigned long paddr) +{ + unsigned long addr, offset = 0; + kprobe_opcode_t insn; + struct kprobe *kp; + + if (paddr & 0x01) + return false; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return false; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn))) + return false; + + if (insn >> 8 == 0) { + if (insn != BREAKPOINT_INSTRUCTION) { + /* + * Note that QEMU inserts opcode 0x0000 to implement + * software breakpoints for guests. Since the size of + * the original instruction is unknown, stop following + * instructions and prevent setting a kprobe. + */ + return false; + } + /* + * Check if the instruction has been modified by another + * kprobe, in which case the original instruction is + * decoded. + */ + kp = get_kprobe((void *)addr); + if (!kp) { + /* not a kprobe */ + return false; + } + insn = kp->opcode; + } + addr += insn_length(insn >> 8); + } + return addr == paddr; +} + int arch_prepare_kprobe(struct kprobe *p) { - if ((unsigned long) p->addr & 0x01) + if (!can_probe((unsigned long)p->addr)) return -EINVAL; /* Make sure the probe isn't going on a difficult instruction */ if (probe_is_prohibited_opcode(p->addr)) @@ -136,11 +181,6 @@ int arch_prepare_kprobe(struct kprobe *p) } NOKPROBE_SYMBOL(arch_prepare_kprobe); -int arch_check_ftrace_location(struct kprobe *p) -{ - return 0; -} - struct swap_insn_args { struct kprobe *p; unsigned int arm_kprobe : 1; @@ -149,28 +189,11 @@ struct swap_insn_args { static int swap_instruction(void *data) { struct swap_insn_args *args = data; - struct ftrace_insn new_insn, *insn; struct kprobe *p = args->p; - size_t len; - - new_insn.opc = args->arm_kprobe ? 
BREAKPOINT_INSTRUCTION : p->opcode; - len = sizeof(new_insn.opc); - if (!p->ainsn.is_ftrace_insn) - goto skip_ftrace; - len = sizeof(new_insn); - insn = (struct ftrace_insn *) p->addr; - if (args->arm_kprobe) { - if (is_ftrace_nop(insn)) - new_insn.disp = KPROBE_ON_FTRACE_NOP; - else - new_insn.disp = KPROBE_ON_FTRACE_CALL; - } else { - ftrace_generate_call_insn(&new_insn, (unsigned long)p->addr); - if (insn->disp == KPROBE_ON_FTRACE_NOP) - ftrace_generate_nop_insn(&new_insn); - } -skip_ftrace: - s390_kernel_write(p->addr, &new_insn, len); + u16 opc; + + opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode; + s390_kernel_write(p->addr, &opc, sizeof(opc)); return 0; } NOKPROBE_SYMBOL(swap_instruction); @@ -201,20 +224,27 @@ static void enable_singlestep(struct kprobe_ctlblk *kcb, struct pt_regs *regs, unsigned long ip) { - struct per_regs per_kprobe; + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } per_kprobe; /* Set up the PER control registers %cr9-%cr11 */ - per_kprobe.control = PER_EVENT_IFETCH; - per_kprobe.start = ip; - per_kprobe.end = ip; + per_kprobe.control.val = PER_EVENT_IFETCH; + per_kprobe.start.val = ip; + per_kprobe.end.val = ip; /* Save control regs and psw mask */ - __ctl_store(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_store(9, 11, kcb->kprobe_saved_ctl); kcb->kprobe_saved_imask = regs->psw.mask & (PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT); /* Set PER control regs, turns on single step for the given address */ - __ctl_load(per_kprobe, 9, 11); + __local_ctl_load(9, 11, per_kprobe.regs); regs->psw.mask |= PSW_MASK_PER; regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT); regs->psw.addr = ip; @@ -226,7 +256,7 @@ static void disable_singlestep(struct kprobe_ctlblk *kcb, unsigned long ip) { /* Restore control regs and psw mask, set new psw address */ - __ctl_load(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_load(9, 11, kcb->kprobe_saved_ctl); regs->psw.mask &= ~PSW_MASK_PER; regs->psw.mask |= kcb->kprobe_saved_imask; regs->psw.addr = ip; @@ -255,18 +285,10 @@ static void pop_kprobe(struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->prev_kprobe.kp = NULL; } NOKPROBE_SYMBOL(pop_kprobe); -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14]; - - /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long) &kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) { switch (kcb->kprobe_status) { @@ -282,7 +304,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) * is a BUG. The code path resides in the .kprobes.text * section and is executed with interrupts disabled. 
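The per_kprobe union in enable_singlestep() above exists so the PER control, start, and end values can be filled in by name but handed to __local_ctl_load() as one three-register block for %cr9-%cr11. A self-contained sketch of that overlay, with a stand-in ctlreg type and an invented event bit:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct ctlreg { uint64_t val; };    /* stand-in for the kernel's wrapped u64 */

int main(void)
{
	/* Same shape as per_kprobe above: the named fields alias regs[0..2]. */
	union {
		struct ctlreg regs[3];
		struct {
			struct ctlreg control;
			struct ctlreg start;
			struct ctlreg end;
		};
	} per_kprobe;

	per_kprobe.control.val = 1UL << 30;  /* made-up instruction-fetch event bit */
	per_kprobe.start.val = 0x1000;
	per_kprobe.end.val = 0x1000;

	assert(&per_kprobe.regs[1] == &per_kprobe.start);
	assert(per_kprobe.regs[0].val == per_kprobe.control.val);
	puts("control/start/end overlay regs[0..2]");
	return 0;
}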
*/ - pr_err("Invalid kprobe detected.\n"); + pr_err("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); } @@ -348,109 +370,6 @@ static int kprobe_handler(struct pt_regs *regs) NOKPROBE_SYMBOL(kprobe_handler); /* - * Function return probe trampoline: - * - init_kprobes() establishes a probepoint here - * - When the probed function returns, this probe - * causes the handlers to fire - */ -static void __used kretprobe_trampoline_holder(void) -{ - asm volatile(".global kretprobe_trampoline\n" - "kretprobe_trampoline: bcr 0,0\n"); -} - -/* - * Called when the probe at kretprobe trampoline is hit - */ -static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kretprobe_instance *ri; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address; - unsigned long trampoline_address; - kprobe_opcode_t *correct_ret_addr; - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more than one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - ri = NULL; - orig_ret_address = 0; - correct_ret_addr = NULL; - trampoline_address = (unsigned long) &kretprobe_trampoline; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long) ri->ret_addr; - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long) ri->ret_addr; - - if (ri->rp && ri->rp->handler) { - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - regs->psw.addr = orig_ret_address; - - kretprobe_hash_unlock(current, &flags); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - -/* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" * instruction. 
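The can_probe() walk added earlier relies on s390 encoding each instruction's length in the two leftmost bits of its first opcode byte. A userspace sketch of that decoding, mirroring the kernel's insn_length() helper from asm/dis.h (the sample opcodes are merely illustrative):

#include <stdio.h>

/* Bits 0-1 of the first byte: 00 -> 2 bytes, 01/10 -> 4 bytes, 11 -> 6 bytes. */
static int insn_length(unsigned char code)
{
	return ((((int)code + 64) >> 7) + 1) << 1;
}

int main(void)
{
	printf("%d\n", insn_length(0x07)); /* 00xxxxxx: 2-byte, e.g. bcr  */
	printf("%d\n", insn_length(0xa7)); /* 10xxxxxx: 4-byte, e.g. aghi */
	printf("%d\n", insn_length(0xc0)); /* 11xxxxxx: 6-byte, e.g. brcl */
	return 0;
}

This is what lets can_probe() start at the symbol base and step forward instruction by instruction until it either lands exactly on paddr or proves paddr is mid-instruction.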
To avoid the SMP problems that can occur when we @@ -464,24 +383,6 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs) unsigned long ip = regs->psw.addr; int fixup = probe_get_fixup_type(p->ainsn.insn); - /* Check if the kprobes location is an enabled ftrace caller */ - if (p->ainsn.is_ftrace_insn) { - struct ftrace_insn *insn = (struct ftrace_insn *) p->addr; - struct ftrace_insn call_insn; - - ftrace_generate_call_insn(&call_insn, (unsigned long) p->addr); - /* - * A kprobe on an enabled ftrace call site actually single - * stepped an unconditional branch (ftrace nop equivalent). - * Now we need to fixup things and pretend that a brasl r0,... - * was executed instead. - */ - if (insn->disp == KPROBE_ON_FTRACE_CALL) { - ip += call_insn.disp * 2 - MCOUNT_INSN_SIZE; - regs->gprs[0] = (unsigned long)p->addr + sizeof(*insn); - } - } - if (fixup & FIXUP_PSW_NORMAL) ip += (unsigned long) p->addr - (unsigned long) p->ainsn.insn; @@ -509,12 +410,11 @@ static int post_kprobe_handler(struct pt_regs *regs) if (!p) return 0; + resume_execution(p, regs); if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - - resume_execution(p, regs); pop_kprobe(kcb); preempt_enable_no_resched(); @@ -534,7 +434,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); struct kprobe *p = kprobe_running(); - const struct exception_table_entry *entry; switch(kcb->kprobe_status) { case KPROBE_HIT_SS: @@ -553,32 +452,11 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(p); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (p->fault_handler && p->fault_handler(p, regs, trapnr)) - return 1; - - /* * In case the user-specified fault handler returned * zero, try to fix up. */ - entry = s390_search_extables(regs->psw.addr); - if (entry) { - regs->psw.addr = extable_fixup(entry); + if (fixup_exception(regs)) return 1; - } - /* * fixup_exception() could not handle it, * Let do_page_fault() fix it. 
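In the resume_execution() hunk above, the FIXUP_PSW_NORMAL case rebases the PSW address from the out-of-line insn slot back to the probed location by adding the distance between the original instruction and its copy. The arithmetic, as a sketch with made-up addresses:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t addr = 0x10000a00;  /* p->addr: probed instruction (made up) */
	uint64_t slot = 0x13fff000;  /* p->ainsn.insn: out-of-line copy (made up) */
	uint64_t ip = slot + 6;      /* PSW address after stepping a 6-byte insn */

	/* FIXUP_PSW_NORMAL: translate the slot-relative address back */
	ip += addr - slot;
	printf("resume at 0x%llx\n", (unsigned long long)ip); /* 0x10000a06 */
	return 0;
}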
@@ -642,18 +520,13 @@ int kprobe_exceptions_notify(struct notifier_block *self, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -static struct kprobe trampoline = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - int __init arch_init_kprobes(void) { - return register_kprobe(&trampoline); + return 0; } int arch_trampoline_kprobe(struct kprobe *p) { - return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline; + return 0; } NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/kprobes.h b/arch/s390/kernel/kprobes.h new file mode 100644 index 000000000000..dc3ed5098ee7 --- /dev/null +++ b/arch/s390/kernel/kprobes.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _ARCH_S390_KPROBES_H +#define _ARCH_S390_KPROBES_H + +#include <linux/kprobes.h> + +DEFINE_INSN_CACHE_OPS(s390_insn); + +#endif diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S new file mode 100644 index 000000000000..0fe4d725e98b --- /dev/null +++ b/arch/s390/kernel/kprobes_insn_page.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> + +/* + * insn_page is a special 4k aligned dummy function for kprobes. + * It will contain all kprobed instructions that are executed out of line. + * The page must be within the kernel image to guarantee that the + * out-of-line instructions are within 2GB distance of their original + * location. Using a dummy function ensures that the insn_page is within + * the text section of the kernel and mapped read-only/executable from + * the beginning, thus avoiding the need to split large mappings if the + * page would be in the data section instead. + */ + .section .kprobes.text, "ax" + .balign 4096 +SYM_CODE_START(kprobes_insn_page) + .rept 2048 + .word 0x07fe + .endr +SYM_CODE_END(kprobes_insn_page) + .previous diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 452502f9a0d9..6652e54cf3db 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -88,8 +88,7 @@ static void lgr_stsi_2_2_2(struct lgr_info *lgr_info) if (stsi(si, 2, 2, 2)) return; cpascii(lgr_info->name, si->name, sizeof(si->name)); - memcpy(&lgr_info->lpar_number, &si->lpar_number, - sizeof(lgr_info->lpar_number)); + lgr_info->lpar_number = si->lpar_number; } /* @@ -167,7 +166,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ); + mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); } /* diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index cb8b1cc285c9..aa22ffc16bcd 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -3,7 +3,6 @@ * Copyright IBM Corp. 
2005, 2011 * * Author(s): Rolf Adelsberger, - Heiko Carstens <heiko.carstens@de.ibm.com> * Michael Holzheu <holzheu@linux.vnet.ibm.com> */ @@ -14,24 +13,25 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/debug_locks.h> -#include <linux/suspend.h> +#include <asm/pfault.h> #include <asm/cio.h> #include <asm/setup.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/smp.h> #include <asm/ipl.h> #include <asm/diag.h> #include <asm/elf.h> #include <asm/asm-offsets.h> #include <asm/cacheflush.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/set_memory.h> #include <asm/stacktrace.h> #include <asm/switch_to.h> #include <asm/nmi.h> +#include <asm/sclp.h> -typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); +typedef void (*relocate_kernel_t)(unsigned long, unsigned long, unsigned long); +typedef int (*purgatory_t)(int); extern const unsigned char relocate_kernel[]; extern const unsigned long long relocate_kernel_len; @@ -39,44 +39,17 @@ extern const unsigned long long relocate_kernel_len; #ifdef CONFIG_CRASH_DUMP /* - * PM notifier callback for kdump - */ -static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - if (kexec_crash_image) - arch_kexec_unprotect_crashkres(); - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - if (kexec_crash_image) - arch_kexec_protect_crashkres(); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init machine_kdump_pm_init(void) -{ - pm_notifier(machine_kdump_pm_cb, 0); - return 0; -} -arch_initcall(machine_kdump_pm_init); - -/* * Reset the system, copy boot CPU registers to absolute zero, * and jump to the kdump image */ -static void __do_machine_kdump(void *image) +static void __do_machine_kdump(void *data) { - int (*start_kdump)(int); + struct kimage *image = data; + purgatory_t purgatory; unsigned long prefix; + purgatory = (purgatory_t)image->start; + /* store_status() saved the prefix register to lowcore */ prefix = (unsigned long) S390_lowcore.prefixreg_save_area; @@ -88,14 +61,12 @@ static void __do_machine_kdump(void *image) * This needs to be done *after* s390_reset_system has set the * prefix register of this CPU to zero */ - memcpy((void *) __LC_FPREGS_SAVE_AREA, - (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512); + memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA), + phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512); - __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA); - start_kdump = (void *)((struct kimage *) image)->start; - start_kdump(1); + call_nodat(1, int, purgatory, int, 1); - /* Die if start_kdump returns */ + /* Die if kdump returns */ disabled_wait(); } @@ -119,16 +90,16 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ - mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); - if (MACHINE_HAS_VX) + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (cpu_has_vx()) save_vx_regs((__vector128 *) mcesa->vector_save_area); if (MACHINE_HAS_GS) { - __ctl_store(cr2_old.val, 2, 2); + local_ctl_store(2, &cr2_old.reg); cr2_new = cr2_old; cr2_new.gse = 1; - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); - __ctl_load(cr2_old.val, 2, 2); + local_ctl_load(2, &cr2_old.reg); } /* * To create a good backchain for this CPU in the dump store_status @@ 
-142,18 +113,6 @@ static noinline void __machine_kdump(void *image) store_status(__do_machine_kdump, image); } -static unsigned long do_start_kdump(unsigned long addr) -{ - struct kimage *image = (struct kimage *) addr; - int (*start_kdump)(int) = (void *)image->start; - int rc; - - __arch_local_irq_stnsm(0xfb); /* disable DAT */ - rc = start_kdump(0); - __arch_local_irq_stosm(0x04); /* enable DAT */ - return rc; -} - #endif /* CONFIG_CRASH_DUMP */ /* @@ -162,11 +121,10 @@ static unsigned long do_start_kdump(unsigned long addr) static bool kdump_csum_valid(struct kimage *image) { #ifdef CONFIG_CRASH_DUMP + purgatory_t purgatory = (purgatory_t)image->start; int rc; - preempt_disable(); - rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image); - preempt_enable(); + rc = call_nodat(1, int, purgatory, int, 0); return rc == 0; #else return false; @@ -240,7 +198,7 @@ int machine_kexec_prepare(struct kimage *image) return -EINVAL; /* Get the destination where the assembler code should be copied to.*/ - reboot_code_buffer = (void *) page_to_phys(image->control_code_page); + reboot_code_buffer = page_to_virt(image->control_code_page); /* Then copy it */ memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len); @@ -253,13 +211,17 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + struct lowcore *abs_lc; + VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); - vmcoreinfo_append_str("SDMA=%lx\n", __sdma); - vmcoreinfo_append_str("EDMA=%lx\n", __edma); + vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); + vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); + abs_lc = get_abs_lowcore(); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc); } void machine_shutdown(void) @@ -276,15 +238,20 @@ void machine_crash_shutdown(struct pt_regs *regs) */ static void __do_machine_kexec(void *data) { - relocate_kernel_t data_mover; + unsigned long data_mover, entry, diag308_subcode; struct kimage *image = data; + data_mover = page_to_phys(image->control_code_page); + entry = virt_to_phys(&image->head); + diag308_subcode = DIAG308_CLEAR_RESET; + if (sclp.has_iplcc) + diag308_subcode |= DIAG308_FLAG_EI; s390_reset_system(); - data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page); - __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ - /* Call the moving routine */ - (*data_mover)(&image->head, image->start); + call_nodat(3, void, (relocate_kernel_t)data_mover, + unsigned long, entry, + unsigned long, image->start, + unsigned long, diag308_subcode); /* Die if kexec returns */ disabled_wait(); @@ -295,7 +262,6 @@ static void __do_machine_kexec(void *data) */ static void __machine_kexec(void *data) { - __arch_local_irq_stosm(0x04); /* enable DAT */ pfault_fini(); tracing_off(); debug_locks_off(); diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 8415ae7d2a23..8d207b82d9fe 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -7,11 +7,14 @@ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com> */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/elf.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/module_signature.h> #include <linux/verification.h> +#include <linux/vmalloc.h> 
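The signature check in the s390_verify_sig() hunk that follows tries the secondary keyring first and only retries against the platform keyring when the failure was a missing key (-ENOKEY), so a genuinely bad signature is never retried. The shape of that fallback, sketched with invented stub verifiers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stubs standing in for verify_pkcs7_signature() against two keyrings. */
static int verify_with_secondary(const void *sig) { (void)sig; return -ENOKEY; }
static int verify_with_platform(const void *sig)  { (void)sig; return 0; }

static int verify_sig(const void *sig, bool have_platform_keyring)
{
	int ret = verify_with_secondary(sig);

	/* Fall back only for "no matching key", never for a bad signature. */
	if (ret == -ENOKEY && have_platform_keyring)
		ret = verify_with_platform(sig);
	return ret;
}

int main(void)
{
	printf("verify: %d\n", verify_sig("sig", true)); /* 0: platform keyring matched */
	return 0;
}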
#include <asm/boot_data.h> #include <asm/ipl.h> #include <asm/setup.h> @@ -28,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; struct module_signature *ms; unsigned long sig_len; + int ret; /* Skip signature verification when not secure IPLed. */ if (!ipl_secure_flag) @@ -62,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) return -EBADMSG; } - return verify_pkcs7_signature(kernel, kernel_len, - kernel + kernel_len, sig_len, - VERIFY_USE_PLATFORM_KEYRING, - VERIFYING_MODULE_SIGNATURE, - NULL, NULL); + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + return ret; } #endif /* CONFIG_KEXEC_SIG */ @@ -151,7 +162,7 @@ static int kexec_file_add_initrd(struct kimage *image, buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - data->parm->initrd_start = buf.mem; + data->parm->initrd_start = data->memsz; data->parm->initrd_size = buf.memsz; data->memsz += buf.memsz; @@ -170,15 +181,14 @@ static int kexec_file_add_ipl_report(struct kimage *image, struct kexec_buf buf; unsigned long addr; void *ptr, *end; + int ret; buf.image = image; data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; - if (image->type == KEXEC_TYPE_CRASH) - buf.mem += crashk_res.start; - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); end = ptr + ipl_cert_list_size; ncerts = 0; while (ptr < end) { @@ -190,7 +200,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, addr = data->memsz + data->report->size; addr += ncerts * sizeof(struct ipl_rb_certificate_entry); - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); while (ptr < end) { len = *(unsigned int *)ptr; ptr += sizeof(len); @@ -199,9 +209,13 @@ static int kexec_file_add_ipl_report(struct kimage *image, ptr += len; } + ret = -ENOMEM; buf.buffer = ipl_report_finish(data->report); + if (!buf.buffer) + goto out; buf.bufsz = data->report->size; buf.memsz = buf.bufsz; + image->arch.ipl_buf = buf.buffer; data->memsz += buf.memsz; @@ -209,14 +223,21 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; - return kexec_add_buffer(&buf); + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ret = kexec_add_buffer(&buf); +out: + return ret; } void *kexec_file_add_components(struct kimage *image, int (*add_kernel)(struct kimage *image, struct s390_load_data *data)) { + unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE; struct s390_load_data data = {0}; + unsigned long minsize; int ret; data.report = ipl_report_init(&ipl_block); @@ -227,10 +248,23 @@ void *kexec_file_add_components(struct kimage *image, if (ret) goto out; - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { - ret = -EINVAL; + ret = -EINVAL; + minsize = PARMAREA + offsetof(struct parmarea, command_line); + if (image->kernel_buf_len < minsize) goto out; - } + + if (data.parm->max_command_line_size) + max_command_line_size = data.parm->max_command_line_size; + + if (minsize + max_command_line_size < minsize) + goto out; + + if (image->kernel_buf_len < minsize + 
max_command_line_size) + goto out; + + if (image->cmdline_buf_len >= max_command_line_size) + goto out; + memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); @@ -267,8 +301,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; Elf_Rela *relas; int i, r_type; + int ret; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; relas = (void *)pi->ehdr + relsec->sh_offset; @@ -281,15 +323,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, sym = (void *)pi->ehdr + symtab->sh_offset; sym += ELF64_R_SYM(relas[i].r_info); - if (sym->st_shndx == SHN_UNDEF) + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); return -ENOEXEC; + } - if (sym->st_shndx == SHN_COMMON) + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); return -ENOEXEC; + } if (sym->st_shndx >= pi->ehdr->e_shnum && - sym->st_shndx != SHN_ABS) + sym->st_shndx != SHN_ABS) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); return -ENOEXEC; + } loc = pi->purgatory_buf; loc += section->sh_offset; @@ -303,21 +357,23 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, addr = section->sh_addr + relas[i].r_offset; r_type = ELF64_R_TYPE(relas[i].r_info); - arch_kexec_do_relocs(r_type, loc, val, addr); + + if (r_type == R_390_PLT32DBL) + r_type = R_390_PC32DBL; + + ret = arch_kexec_do_relocs(r_type, loc, val, addr); + if (ret) { + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } } return 0; } -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) +int arch_kimage_file_post_load_cleanup(struct kimage *image) { - /* A kernel must be at least large enough to contain head.S. During - * load memory in head.S will be accessed, e.g. to register the next - * command line. If the next kernel were smaller the current kernel - * will panic at load. - */ - if (buf_len < HEAD_END) - return -ENOEXEC; - - return kexec_image_probe_default(image, buf, buf_len); + vfree(image->arch.ipl_buf); + image->arch.ipl_buf = NULL; + + return kexec_image_post_load_cleanup_default(image); } diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c index d5035de9020e..b7182cec48dc 100644 --- a/arch/s390/kernel/machine_kexec_reloc.c +++ b/arch/s390/kernel/machine_kexec_reloc.c @@ -28,6 +28,7 @@ int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, break; case R_390_64: /* Direct 64 bit. */ case R_390_GLOB_DAT: + case R_390_JMP_SLOT: *(u64 *)loc = val; break; case R_390_PC16: /* PC relative 16 bit. */ diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 9e1660a6b9db..ae4d4fd9afcd 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -2,8 +2,6 @@ /* * Copyright IBM Corp. 
2008, 2009 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/linkage.h> @@ -11,84 +9,187 @@ #include <asm/ftrace.h> #include <asm/nospec-insn.h> #include <asm/ptrace.h> -#include <asm/export.h> + +#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) + +#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE) +#define STACK_FREGS (STACK_FRAME_OVERHEAD) +#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS) +#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS) +#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW) +#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2) +#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS) + +/* packed stack: allocate just enough for r14, r15 and backchain */ +#define TRACED_FUNC_FRAME_SIZE 24 + +#ifdef CONFIG_FUNCTION_TRACER GEN_BR_THUNK %r1 GEN_BR_THUNK %r14 .section .kprobes.text, "ax" -ENTRY(ftrace_stub) +SYM_FUNC_START(ftrace_stub) BR_EX %r14 -ENDPROC(ftrace_stub) +SYM_FUNC_END(ftrace_stub) -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +SYM_CODE_START(ftrace_stub_direct_tramp) + lgr %r1, %r0 + BR_EX %r1 +SYM_CODE_END(ftrace_stub_direct_tramp) -ENTRY(_mcount) - BR_EX %r14 -ENDPROC(_mcount) -EXPORT_SYMBOL(_mcount) + .macro ftrace_regs_entry, allregs=0 + stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller + + .if \allregs == 1 + # save psw mask + # don't put any instructions clobbering CC before this point + epsw %r1,%r14 + risbg %r14,%r1,0,31,32 + .endif -ENTRY(ftrace_caller) - .globl ftrace_regs_caller - .set ftrace_regs_caller,ftrace_caller lgr %r1,%r15 -#if !(defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)) - aghi %r0,MCOUNT_RETURN_FIXUP -#endif - aghi %r15,-STACK_FRAME_SIZE + # allocate stack frame for ftrace_caller to contain traced function + aghi %r15,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) - stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) - stg %r0,(STACK_PTREGS_PSW+8)(%r15) - stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) + stg %r0,(__SF_GPRS+8*8)(%r15) + stg %r15,(__SF_GPRS+9*8)(%r15) + # allocate ftrace_regs and stack frame for ftrace_trace_function + aghi %r15,-STACK_FRAME_SIZE_FREGS + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + + .if \allregs == 1 + stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15) + mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS + .else + xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15) + .endif + + lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address + aghi %r1,-TRACED_FUNC_FRAME_SIZE + stg %r1,__SF_BACKCHAIN(%r15) + stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) + stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + .endm + +SYM_CODE_START(ftrace_regs_caller) + ftrace_regs_entry 1 + j ftrace_common +SYM_CODE_END(ftrace_regs_caller) + +SYM_CODE_START(ftrace_caller) + ftrace_regs_entry 0 + j ftrace_common +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_common) #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op - lgrl %r1,ftrace_trace_function + lgrl %r1,ftrace_func #else lgr %r2,%r0 aghi 
%r2,-MCOUNT_INSN_SIZE larl %r4,function_trace_op lg %r4,0(%r4) - larl %r1,ftrace_trace_function + larl %r1,ftrace_func lg %r1,0(%r1) #endif lgr %r3,%r14 - la %r5,STACK_PTREGS(%r15) + la %r5,STACK_FREGS(%r15) BASR_EX %r14,%r1 #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. - .globl ftrace_graph_caller -ftrace_graph_caller: - j ftrace_graph_caller_end - lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r4,(STACK_PTREGS_PSW+8)(%r15) +SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) + j .Lftrace_graph_caller_end + lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) + lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return - stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) -ftrace_graph_caller_end: - .globl ftrace_graph_caller_end + stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) +.Lftrace_graph_caller_end: +#endif + lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + locgrz %r1,%r0 +#else + lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + ltgr %r1,%r1 + jnz 0f + lgr %r1,%r0 #endif - lg %r1,(STACK_PTREGS_PSW+8)(%r15) - lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) +0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 -ENDPROC(ftrace_caller) +SYM_CODE_END(ftrace_common) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(return_to_handler) +SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD + aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE) stg %r1,__SF_BACKCHAIN(%r15) + la %r3,STACK_FRAME_OVERHEAD(%r15) + stg %r1,__FGRAPH_RET_FP(%r3) + stg %r2,__FGRAPH_RET_GPR2(%r3) + lgr %r2,%r3 brasl %r14,ftrace_return_to_handler - aghi %r15,STACK_FRAME_OVERHEAD + aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 -ENDPROC(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif +#endif /* CONFIG_FUNCTION_TRACER */ + +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_br) + lmg %r0,%r1,2(%r1) + br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_br_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_br) + +#ifdef CONFIG_EXPOLINE +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_exrl) + lmg %r0,%r1,2(%r1) + exrl %r0,0f + j . 
+0: br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_exrl_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl) +#endif /* CONFIG_EXPOLINE */ + +#ifdef CONFIG_RETHOOK + +SYM_CODE_START(arch_rethook_trampoline) + stg %r14,(__SF_GPRS+8*8)(%r15) + lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15) + stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) + + # store original stack pointer in backchain and pt_regs + lay %r7,STACK_FRAME_SIZE_PTREGS(%r15) + stg %r7,__SF_BACKCHAIN(%r15) + stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15) + + # store full psw + epsw %r2,%r3 + risbg %r3,%r2,0,31,32 + stg %r3,STACK_PTREGS_PSW(%r15) + larl %r1,arch_rethook_trampoline + stg %r1,STACK_PTREGS_PSW+8(%r15) + + lay %r2,STACK_PTREGS(%r15) + brasl %r14,arch_rethook_trampoline_callback + + mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15) + lmg %r0,%r15,STACK_PTREGS_GPRS(%r15) + lpswe __SF_EMPTY(%r15) +SYM_CODE_END(arch_rethook_trampoline) + +#endif /* CONFIG_RETHOOK */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index ba8f19bb438b..42215f9404af 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -14,14 +14,19 @@ #include <linux/elf.h> #include <linux/vmalloc.h> #include <linux/fs.h> +#include <linux/ftrace.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/kasan.h> #include <linux/moduleloader.h> #include <linux/bug.h> +#include <linux/memory.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/facility.h> +#include <asm/ftrace.lds.h> +#include <asm/set_memory.h> +#include <asm/setup.h> #if 0 #define DEBUGP printk @@ -29,24 +34,52 @@ #define DEBUGP(fmt , ...) #endif -#define PLT_ENTRY_SIZE 20 +#define PLT_ENTRY_SIZE 22 + +static unsigned long get_module_load_offset(void) +{ + static DEFINE_MUTEX(module_kaslr_mutex); + static unsigned long module_load_offset; + + if (!kaslr_enabled()) + return 0; + /* + * Calculate the module_load_offset the first time this code + * is called. Once calculated it stays the same until reboot. + */ + mutex_lock(&module_kaslr_mutex); + if (!module_load_offset) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + return module_load_offset; +} void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + p = __vmalloc_node_range(size, MODULE_ALIGN, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } return p; } +#ifdef CONFIG_FUNCTION_TRACER +void module_arch_cleanup(struct module *mod) +{ + module_memfree(mod->arch.trampolines_start); +} +#endif + void module_arch_freeing_init(struct module *mod) { if (is_livepatch_module(mod) && @@ -114,6 +147,7 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, Elf_Rela *rela; char *strings; int nrela, i, j; + struct module_memory *mod_mem; /* Find symbol table and string table. */ symtab = NULL; @@ -161,23 +195,26 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, /* Increase core size by size of got & plt and set start offsets for got and plt. 
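The sizing code that follows this comment appends the GOT and PLT to the module's text: align the current size, place the GOT there, place the PLT behind it, and reserve one extra 22-byte entry for the shared expoline thunk when expolines are active. A toy computation of the resulting offsets (all sizes invented):

#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define PLT_ENTRY_SIZE 22

int main(void)
{
	unsigned long size = 0x1802a;                /* MOD_TEXT size so far (made up) */
	unsigned long got_size = 15 * 8;             /* 15 GOT slots (made up) */
	unsigned long plt_size = 7 * PLT_ENTRY_SIZE; /* 7 PLT entries (made up) */
	int expoline = 1;                            /* CONFIG_EXPOLINE && !nospec_disable */
	unsigned long got_offset, plt_offset;

	size = ALIGN_UP(size, 4);
	got_offset = size;
	size += got_size;
	plt_offset = size;
	if (plt_size) {
		if (expoline)
			plt_size += PLT_ENTRY_SIZE; /* shared __jump_r1 thunk at the end */
		size += plt_size;
	}
	printf("got@0x%lx plt@0x%lx total 0x%lx\n", got_offset, plt_offset, size);
	return 0;
}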
*/ - me->core_layout.size = ALIGN(me->core_layout.size, 4); - me->arch.got_offset = me->core_layout.size; - me->core_layout.size += me->arch.got_size; - me->arch.plt_offset = me->core_layout.size; + mod_mem = &me->mem[MOD_TEXT]; + mod_mem->size = ALIGN(mod_mem->size, 4); + me->arch.got_offset = mod_mem->size; + mod_mem->size += me->arch.got_size; + me->arch.plt_offset = mod_mem->size; if (me->arch.plt_size) { if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) me->arch.plt_size += PLT_ENTRY_SIZE; - me->core_layout.size += me->arch.plt_size; + mod_mem->size += me->arch.plt_size; } return 0; } static int apply_rela_bits(Elf_Addr loc, Elf_Addr val, - int sign, int bits, int shift) + int sign, int bits, int shift, + void *(*write)(void *dest, const void *src, size_t len)) { unsigned long umax; long min, max; + void *dest = (void *)loc; if (val & ((1UL << shift) - 1)) return -ENOEXEC; @@ -194,26 +231,33 @@ static int apply_rela_bits(Elf_Addr loc, Elf_Addr val, return -ENOEXEC; } - if (bits == 8) - *(unsigned char *) loc = val; - else if (bits == 12) - *(unsigned short *) loc = (val & 0xfff) | + if (bits == 8) { + unsigned char tmp = val; + write(dest, &tmp, 1); + } else if (bits == 12) { + unsigned short tmp = (val & 0xfff) | (*(unsigned short *) loc & 0xf000); - else if (bits == 16) - *(unsigned short *) loc = val; - else if (bits == 20) - *(unsigned int *) loc = (val & 0xfff) << 16 | - (val & 0xff000) >> 4 | - (*(unsigned int *) loc & 0xf00000ff); - else if (bits == 32) - *(unsigned int *) loc = val; - else if (bits == 64) - *(unsigned long *) loc = val; + write(dest, &tmp, 2); + } else if (bits == 16) { + unsigned short tmp = val; + write(dest, &tmp, 2); + } else if (bits == 20) { + unsigned int tmp = (val & 0xfff) << 16 | + (val & 0xff000) >> 4 | (*(unsigned int *) loc & 0xf00000ff); + write(dest, &tmp, 4); + } else if (bits == 32) { + unsigned int tmp = val; + write(dest, &tmp, 4); + } else if (bits == 64) { + unsigned long tmp = val; + write(dest, &tmp, 8); + } return 0; } static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, - const char *strtab, struct module *me) + const char *strtab, struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) { struct mod_arch_syminfo *info; Elf_Addr loc, val; @@ -241,17 +285,17 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_64: /* Direct 64 bit. */ val += rela->r_addend; if (r_type == R_390_8) - rc = apply_rela_bits(loc, val, 0, 8, 0); + rc = apply_rela_bits(loc, val, 0, 8, 0, write); else if (r_type == R_390_12) - rc = apply_rela_bits(loc, val, 0, 12, 0); + rc = apply_rela_bits(loc, val, 0, 12, 0, write); else if (r_type == R_390_16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_20) - rc = apply_rela_bits(loc, val, 1, 20, 0); + rc = apply_rela_bits(loc, val, 1, 20, 0, write); else if (r_type == R_390_32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); break; case R_390_PC16: /* PC relative 16 bit. */ case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ @@ -260,15 +304,15 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PC64: /* PC relative 64 bit. 
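apply_rela_bits() above now routes every store through a write callback, but the bit packing itself is unchanged; the 20-bit case is the only non-obvious one, merging the 12-bit DL and 8-bit DH displacement parts into the preserved instruction word. Just that packing, as a sketch with sample values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t loc_word = 0xe3102000; /* pretend insn word at the reloc site (made up) */
	int32_t val = 0x4b0a5;          /* 20-bit signed displacement to apply (made up) */
	unsigned int packed;

	/* bits == 20: DL = low 12 bits, DH = next 8 bits; all other
	 * instruction bits are preserved via the 0xf00000ff mask. */
	packed = ((uint32_t)val & 0xfff) << 16 |
		 ((uint32_t)val & 0xff000) >> 4 |
		 (loc_word & 0xf00000ff);
	printf("0x%08x\n", packed);
	return 0;
}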
*/ val += rela->r_addend - loc; if (r_type == R_390_PC16) - rc = apply_rela_bits(loc, val, 1, 16, 0); + rc = apply_rela_bits(loc, val, 1, 16, 0, write); else if (r_type == R_390_PC16DBL) - rc = apply_rela_bits(loc, val, 1, 16, 1); + rc = apply_rela_bits(loc, val, 1, 16, 1, write); else if (r_type == R_390_PC32DBL) - rc = apply_rela_bits(loc, val, 1, 32, 1); + rc = apply_rela_bits(loc, val, 1, 32, 1, write); else if (r_type == R_390_PC32) - rc = apply_rela_bits(loc, val, 1, 32, 0); + rc = apply_rela_bits(loc, val, 1, 32, 0, write); else if (r_type == R_390_PC64) - rc = apply_rela_bits(loc, val, 1, 64, 0); + rc = apply_rela_bits(loc, val, 1, 64, 0, write); break; case R_390_GOT12: /* 12 bit GOT offset. */ case R_390_GOT16: /* 16 bit GOT offset. */ @@ -283,33 +327,34 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ if (info->got_initialized == 0) { - Elf_Addr *gotent; + Elf_Addr *gotent = me->mem[MOD_TEXT].base + + me->arch.got_offset + + info->got_offset; - gotent = me->core_layout.base + me->arch.got_offset + - info->got_offset; - *gotent = val; + write(gotent, &val, sizeof(*gotent)); info->got_initialized = 1; } val = info->got_offset + rela->r_addend; if (r_type == R_390_GOT12 || r_type == R_390_GOTPLT12) - rc = apply_rela_bits(loc, val, 0, 12, 0); + rc = apply_rela_bits(loc, val, 0, 12, 0, write); else if (r_type == R_390_GOT16 || r_type == R_390_GOTPLT16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_GOT20 || r_type == R_390_GOTPLT20) - rc = apply_rela_bits(loc, val, 1, 20, 0); + rc = apply_rela_bits(loc, val, 1, 20, 0, write); else if (r_type == R_390_GOT32 || r_type == R_390_GOTPLT32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_GOT64 || r_type == R_390_GOTPLT64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); else if (r_type == R_390_GOTENT || r_type == R_390_GOTPLTENT) { - val += (Elf_Addr) me->core_layout.base - loc; - rc = apply_rela_bits(loc, val, 1, 32, 1); + val += (Elf_Addr)me->mem[MOD_TEXT].base + + me->arch.got_offset - loc; + rc = apply_rela_bits(loc, val, 1, 32, 1, write); } break; case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */ @@ -320,25 +365,28 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. 
*/ if (info->plt_initialized == 0) { - unsigned int *ip; - ip = me->core_layout.base + me->arch.plt_offset + - info->plt_offset; - ip[0] = 0x0d10e310; /* basr 1,0 */ - ip[1] = 0x100a0004; /* lg 1,10(1) */ + unsigned char insn[PLT_ENTRY_SIZE]; + char *plt_base; + char *ip; + + plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset; + ip = plt_base + info->plt_offset; + *(int *)insn = 0x0d10e310; /* basr 1,0 */ + *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */ if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) { - unsigned int *ij; - ij = me->core_layout.base + - me->arch.plt_offset + - me->arch.plt_size - PLT_ENTRY_SIZE; - ip[2] = 0xa7f40000 + /* j __jump_r1 */ - (unsigned int)(u16) - (((unsigned long) ij - 8 - - (unsigned long) ip) / 2); + char *jump_r1; + + jump_r1 = plt_base + me->arch.plt_size - + PLT_ENTRY_SIZE; + /* brcl 0xf,__jump_r1 */ + *(short *)&insn[8] = 0xc0f4; + *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2; } else { - ip[2] = 0x07f10000; /* br %r1 */ + *(int *)&insn[8] = 0x07f10000; /* br %r1 */ } - ip[3] = (unsigned int) (val >> 32); - ip[4] = (unsigned int) val; + *(long *)&insn[14] = val; + + write(ip, insn, sizeof(insn)); info->plt_initialized = 1; } if (r_type == R_390_PLTOFF16 || @@ -351,44 +399,44 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, val - loc + 0xffffUL < 0x1ffffeUL) || (r_type == R_390_PLT32DBL && val - loc + 0xffffffffULL < 0x1fffffffeULL))) - val = (Elf_Addr) me->core_layout.base + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.plt_offset + info->plt_offset; val += rela->r_addend - loc; } if (r_type == R_390_PLT16DBL) - rc = apply_rela_bits(loc, val, 1, 16, 1); + rc = apply_rela_bits(loc, val, 1, 16, 1, write); else if (r_type == R_390_PLTOFF16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_PLT32DBL) - rc = apply_rela_bits(loc, val, 1, 32, 1); + rc = apply_rela_bits(loc, val, 1, 32, 1, write); else if (r_type == R_390_PLT32 || r_type == R_390_PLTOFF32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_PLT64 || r_type == R_390_PLTOFF64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); break; case R_390_GOTOFF16: /* 16 bit offset to GOT. */ case R_390_GOTOFF32: /* 32 bit offset to GOT. */ case R_390_GOTOFF64: /* 64 bit offset to GOT. */ val = val + rela->r_addend - - ((Elf_Addr) me->core_layout.base + me->arch.got_offset); + ((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset); if (r_type == R_390_GOTOFF16) - rc = apply_rela_bits(loc, val, 0, 16, 0); + rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_GOTOFF32) - rc = apply_rela_bits(loc, val, 0, 32, 0); + rc = apply_rela_bits(loc, val, 0, 32, 0, write); else if (r_type == R_390_GOTOFF64) - rc = apply_rela_bits(loc, val, 0, 64, 0); + rc = apply_rela_bits(loc, val, 0, 64, 0, write); break; case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. 
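The 22-byte PLT entry built above works because basr %r1,0 leaves the address of the following instruction in %r1, so lg %r1,12(%r1) fetches the 8-byte target stored at entry offset 14 (= 2 + 12). A byte-level sketch of the non-expoline entry (target address invented; bytes written in s390 big-endian order):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned char insn[22];
	uint64_t target = 0x3ff80124680ULL; /* final symbol address (made up) */
	int i;

	/* basr %r1,0 : %r1 = address of the next instruction (entry + 2) */
	insn[0] = 0x0d; insn[1] = 0x10;
	/* lg %r1,12(%r1) : load the target stored at entry offset 14 */
	insn[2] = 0xe3; insn[3] = 0x10; insn[4] = 0x10; insn[5] = 0x0c;
	insn[6] = 0x00; insn[7] = 0x04;
	/* br %r1 (the non-expoline case); padding up to offset 14 */
	insn[8] = 0x07; insn[9] = 0xf1;
	insn[10] = insn[11] = insn[12] = insn[13] = 0;
	/* 8-byte target address at offset 14 */
	for (i = 0; i < 8; i++)
		insn[14 + i] = (unsigned char)(target >> (56 - 8 * i));

	for (i = 0; i < 22; i++)
		printf("%02x%s", insn[i], i == 21 ? "\n" : " ");
	return 0;
}

In the expoline case, bytes 8-13 instead hold a brcl 0xf,__jump_r1 to the shared thunk at the end of the PLT.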
*/ - val = (Elf_Addr) me->core_layout.base + me->arch.got_offset + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset + rela->r_addend - loc; if (r_type == R_390_GOTPC) - rc = apply_rela_bits(loc, val, 1, 32, 0); + rc = apply_rela_bits(loc, val, 1, 32, 0, write); else if (r_type == R_390_GOTPCDBL) - rc = apply_rela_bits(loc, val, 1, 32, 1); + rc = apply_rela_bits(loc, val, 1, 32, 1, write); break; case R_390_COPY: case R_390_GLOB_DAT: /* Create GOT entry. */ @@ -412,9 +460,10 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, return 0; } -int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, +static int __apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, unsigned int symindex, unsigned int relsec, - struct module *me) + struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) { Elf_Addr base; Elf_Sym *symtab; @@ -430,13 +479,51 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, n = sechdrs[relsec].sh_size / sizeof(Elf_Rela); for (i = 0; i < n; i++, rela++) { - rc = apply_rela(rela, base, symtab, strtab, me); + rc = apply_rela(rela, base, symtab, strtab, me, write); if (rc) return rc; } return 0; } +int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, + unsigned int symindex, unsigned int relsec, + struct module *me) +{ + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) + write = s390_kernel_write; + + return __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, + write); +} + +#ifdef CONFIG_FUNCTION_TRACER +static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, + const Elf_Shdr *s) +{ + char *start, *end; + int numpages; + size_t size; + + size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); + numpages = DIV_ROUND_UP(size, PAGE_SIZE); + start = module_alloc(numpages * PAGE_SIZE); + if (!start) + return -ENOMEM; + set_memory_rox((unsigned long)start, numpages); + end = start + size; + + me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start; + me->arch.trampolines_end = (struct ftrace_hotpatch_trampoline *)end; + me->arch.next_trampoline = me->arch.trampolines_start; + + return 0; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) @@ -444,22 +531,19 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s; char *secstrings, *secname; void *aseg; +#ifdef CONFIG_FUNCTION_TRACER + int ret; +#endif if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable && me->arch.plt_size) { unsigned int *ij; - ij = me->core_layout.base + me->arch.plt_offset + + ij = me->mem[MOD_TEXT].base + me->arch.plt_offset + me->arch.plt_size - PLT_ENTRY_SIZE; - if (test_facility(35)) { - ij[0] = 0xc6000000; /* exrl %r0,.+10 */ - ij[1] = 0x0005a7f4; /* j . */ - ij[2] = 0x000007f1; /* br %r1 */ - } else { - ij[0] = 0x44000000 | (unsigned int) - offsetof(struct lowcore, br_r1_trampoline); - ij[1] = 0xa7f40000; /* j . */ - } + ij[0] = 0xc6000000; /* exrl %r0,.+10 */ + ij[1] = 0x0005a7f4; /* j . 
*/ + ij[2] = 0x000007f1; /* br %r1 */ } secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; @@ -478,8 +562,15 @@ int module_finalize(const Elf_Ehdr *hdr, if (IS_ENABLED(CONFIG_EXPOLINE) && (str_has_prefix(secname, ".s390_return"))) nospec_revert(aseg, aseg + s->sh_size); + +#ifdef CONFIG_FUNCTION_TRACER + if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) { + ret = module_alloc_ftrace_hotpatch_trampolines(me, s); + if (ret < 0) + return ret; + } +#endif /* CONFIG_FUNCTION_TRACER */ } - jump_label_apply_nops(me); return 0; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 0a487fae763e..9ad44c26d1a2 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -6,12 +6,12 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #include <linux/kernel_stat.h> #include <linux/init.h> #include <linux/errno.h> +#include <linux/entry-common.h> #include <linux/hardirq.h> #include <linux/log2.h> #include <linux/kprobes.h> @@ -19,18 +19,20 @@ #include <linux/time.h> #include <linux/module.h> #include <linux/sched/signal.h> - +#include <linux/kvm_host.h> #include <linux/export.h> #include <asm/lowcore.h> +#include <asm/ctlreg.h> #include <asm/smp.h> #include <asm/stp.h> #include <asm/cputime.h> #include <asm/nmi.h> #include <asm/crw.h> #include <asm/switch_to.h> -#include <asm/ctl_reg.h> #include <asm/asm-offsets.h> -#include <linux/kvm_host.h> +#include <asm/pai.h> +#include <asm/vx-insn.h> +#include <asm/fpu/api.h> struct mcck_struct { unsigned int kill_task : 1; @@ -41,116 +43,134 @@ struct mcck_struct { }; static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static struct kmem_cache *mcesa_cache; -static unsigned long mcesa_origin_lc; static inline int nmi_needs_mcesa(void) { - return MACHINE_HAS_VX || MACHINE_HAS_GS; -} - -static inline unsigned long nmi_get_mcesa_size(void) -{ - if (MACHINE_HAS_GS) - return MCESA_MAX_SIZE; - return MCESA_MIN_SIZE; + return cpu_has_vx() || MACHINE_HAS_GS; } /* * The initial machine check extended save area for the boot CPU. - * It will be replaced by nmi_init() with an allocated structure. - * The structure is required for machine check happening early in - * the boot process. + * It will be replaced on the boot CPU reinit with an allocated + * structure. The structure is required for machine check happening + * early in the boot process. */ -static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); +static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE); -void __init nmi_alloc_boot_cpu(struct lowcore *lc) +void __init nmi_alloc_mcesa_early(u64 *mcesad) { if (!nmi_needs_mcesa()) return; - lc->mcesad = (unsigned long) &boot_mcesa; + *mcesad = __pa(&boot_mcesa); if (MACHINE_HAS_GS) - lc->mcesad |= ilog2(MCESA_MAX_SIZE); + *mcesad |= ilog2(MCESA_MAX_SIZE); } -static int __init nmi_init(void) +int nmi_alloc_mcesa(u64 *mcesad) { - unsigned long origin, cr0, size; + unsigned long size; + void *origin; + *mcesad = 0; if (!nmi_needs_mcesa()) return 0; - size = nmi_get_mcesa_size(); - if (size > MCESA_MIN_SIZE) - mcesa_origin_lc = ilog2(size); - /* create slab cache for the machine-check-extended-save-areas */ - mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL); - if (!mcesa_cache) - panic("Couldn't create nmi save area cache"); - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + size = MACHINE_HAS_GS ? 
MCESA_MAX_SIZE : MCESA_MIN_SIZE; + origin = kmalloc(size, GFP_KERNEL); if (!origin) - panic("Couldn't allocate nmi save area"); + return -ENOMEM; /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - /* Replace boot_mcesa on the boot CPU */ - S390_lowcore.mcesad = origin | mcesa_origin_lc; - __ctl_load(cr0, 0, 0); + kmemleak_not_leak(origin); + *mcesad = __pa(origin); + if (MACHINE_HAS_GS) + *mcesad |= ilog2(MCESA_MAX_SIZE); return 0; } -early_initcall(nmi_init); -int nmi_alloc_per_cpu(struct lowcore *lc) +void nmi_free_mcesa(u64 *mcesad) { - unsigned long origin; - if (!nmi_needs_mcesa()) - return 0; - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); - if (!origin) - return -ENOMEM; - /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - lc->mcesad = origin | mcesa_origin_lc; - return 0; + return; + kfree(__va(*mcesad & MCESA_ORIGIN_MASK)); } -void nmi_free_per_cpu(struct lowcore *lc) +static __always_inline char *nmi_puts(char *dest, const char *src) { - if (!nmi_needs_mcesa()) - return; - kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK)); + while (*src) + *dest++ = *src++; + *dest = 0; + return dest; +} + +static __always_inline char *u64_to_hex(char *dest, u64 val) +{ + int i, num; + + for (i = 1; i <= 16; i++) { + num = (val >> (64 - 4 * i)) & 0xf; + if (num >= 10) + *dest++ = 'A' + num - 10; + else + *dest++ = '0' + num; + } + *dest = 0; + return dest; } static notrace void s390_handle_damage(void) { + union ctlreg0 cr0, cr0_new; + char message[100]; + psw_t psw_save; + char *ptr; + smp_emergency_stop(); + diag_amode31_ops.diag308_reset(); + ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x"); + u64_to_hex(ptr, S390_lowcore.mcck_interruption_code); + + /* + * Disable low address protection and make machine check new PSW a + * disabled wait PSW. Any additional machine check cannot be handled. + */ + local_ctl_store(0, &cr0.reg); + cr0_new = cr0; + cr0_new.lap = 0; + local_ctl_load(0, &cr0_new.reg); + psw_save = S390_lowcore.mcck_new_psw; + psw_bits(S390_lowcore.mcck_new_psw).io = 0; + psw_bits(S390_lowcore.mcck_new_psw).ext = 0; + psw_bits(S390_lowcore.mcck_new_psw).wait = 1; + sclp_emergency_printk(message); + + /* + * Restore machine check new PSW and control register 0 to original + * values. This makes system dump analysis easier. + */ + S390_lowcore.mcck_new_psw = psw_save; + local_ctl_load(0, &cr0.reg); disabled_wait(); while (1); } NOKPROBE_SYMBOL(s390_handle_damage); /* - * Main machine check handler function. Will be called with interrupts enabled - * or disabled and machine checks enabled or disabled. + * Main machine check handler function. Will be called with interrupts disabled + * and machine checks enabled. */ void s390_handle_mcck(void) { - unsigned long flags; struct mcck_struct mcck; + unsigned long mflags; /* * Disable machine checks and get the current state of accumulated * machine checks. Afterwards delete the old state and enable machine * checks again. 
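s390_handle_damage() above builds its console message with the hand-rolled nmi_puts()/u64_to_hex() helpers rather than sprintf(), since it runs with the machine in a barely trustworthy state. The helpers are plain C and can be exercised in userspace (the sample code value is invented):

#include <stdio.h>
#include <stdint.h>

static char *nmi_puts(char *dest, const char *src)
{
	while (*src)
		*dest++ = *src++;
	*dest = 0;
	return dest;
}

static char *u64_to_hex(char *dest, uint64_t val)
{
	int i, num;

	for (i = 1; i <= 16; i++) {
		num = (val >> (64 - 4 * i)) & 0xf;
		*dest++ = num >= 10 ? 'A' + num - 10 : '0' + num;
	}
	*dest = 0;
	return dest;
}

int main(void)
{
	char message[100];
	char *ptr;

	ptr = nmi_puts(message, "machine check code: 0x");
	u64_to_hex(ptr, 0x00400f1000000000ULL); /* sample interruption code */
	puts(message); /* machine check code: 0x00400F1000000000 */
	return 0;
}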
*/ - local_irq_save(flags); - local_mcck_disable(); + local_mcck_save(mflags); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); - clear_cpu_flag(CIF_MCCK_PENDING); - local_mcck_enable(); - local_irq_restore(flags); + local_mcck_restore(mflags); if (mcck.channel_report) crw_handle_channel_report(); @@ -167,134 +187,150 @@ void s390_handle_mcck(void) static int mchchk_wng_posted = 0; /* Use single cpu clear, as we cannot handle smp here. */ - __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ + local_ctl_clear_bit(14, CR14_WARNING_SUBMASK_BIT); if (xchg(&mchchk_wng_posted, 1) == 0) kill_cad_pid(SIGPWR, 1); } if (mcck.stp_queue) stp_queue_work(); if (mcck.kill_task) { - local_irq_enable(); printk(KERN_EMERG "mcck: Terminating task because of machine " "malfunction (code 0x%016lx).\n", mcck.mcck_code); printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", current->comm, current->pid); - do_exit(SIGSEGV); + if (is_global_init(current)) + panic("mcck: Attempting to kill init!\n"); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID); } } -EXPORT_SYMBOL_GPL(s390_handle_mcck); /* - * returns 0 if all required registers are available + * returns 0 if register contents could be validated * returns 1 otherwise */ -static int notrace s390_check_registers(union mci mci, int umode) +static int notrace s390_validate_registers(union mci mci) { + struct mcesa *mcesa; + void *fpt_save_area; union ctlreg2 cr2; int kill_task; + u64 zero; kill_task = 0; + zero = 0; - if (!mci.gr) { - /* - * General purpose registers couldn't be restored and have - * unknown contents. Stop system or terminate process. - */ - if (!umode) - s390_handle_damage(); + if (!mci.gr || !mci.fp) kill_task = 1; - } - /* Check control registers */ - if (!mci.cr) { - /* - * Control registers have unknown contents. - * Can't recover and therefore stopping machine. - */ - s390_handle_damage(); - } - if (!mci.fp) { - /* - * Floating point registers can't be restored. If the - * kernel currently uses floating point registers the - * system is stopped. If the process has its floating - * pointer registers loaded it is terminated. - */ - if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } + fpt_save_area = &S390_lowcore.floating_pt_save_area; if (!mci.fc) { - /* - * Floating point control register can't be restored. - * If the kernel currently uses the floating pointer - * registers and needs the FPC register the system is - * stopped. If the process has its floating pointer - * registers loaded it is terminated. - */ - if (S390_lowcore.fpu_flags & KERNEL_FPC) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; + kill_task = 1; + asm volatile( + " lfpc %0\n" + : + : "Q" (zero)); + } else { + asm volatile( + " lfpc %0\n" + : + : "Q" (S390_lowcore.fpt_creg_save_area)); } - if (MACHINE_HAS_VX) { - if (!mci.vr) { - /* - * Vector registers can't be restored. If the kernel - * currently uses vector registers the system is - * stopped. If the process has its vector registers - * loaded it is terminated. 
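s390_handle_mcck() above snapshots the accumulated per-CPU state and clears it while machine checks are held off, then works on the private copy with them re-enabled. The same snapshot-and-clear shape, simulated in userspace with a plain flag standing in for local_mcck_save()/local_mcck_restore():

#include <stdio.h>
#include <string.h>

struct mcck_struct {
	unsigned int kill_task : 1;
	unsigned int channel_report : 1;
	unsigned long mcck_code;
};

static struct mcck_struct cpu_mcck; /* stands in for the per-CPU variable */
static int mcck_enabled = 1;        /* stands in for the machine check mask */

int main(void)
{
	struct mcck_struct mcck;

	cpu_mcck.channel_report = 1;    /* pretend an earlier mcck queued work */

	mcck_enabled = 0;               /* local_mcck_save(): block new mccks */
	mcck = cpu_mcck;                /* private copy */
	memset(&cpu_mcck, 0, sizeof(cpu_mcck));
	mcck_enabled = 1;               /* local_mcck_restore() */

	/* now act on the copy without racing against new machine checks */
	if (mcck.channel_report)
		puts("handling channel report");
	return 0;
}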
- */ - if (S390_lowcore.fpu_flags & KERNEL_VXR) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } - } - /* Check if access registers are valid */ - if (!mci.ar) { + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (!cpu_has_vx()) { + /* Validate floating point registers */ + asm volatile( + " ld 0,0(%0)\n" + " ld 1,8(%0)\n" + " ld 2,16(%0)\n" + " ld 3,24(%0)\n" + " ld 4,32(%0)\n" + " ld 5,40(%0)\n" + " ld 6,48(%0)\n" + " ld 7,56(%0)\n" + " ld 8,64(%0)\n" + " ld 9,72(%0)\n" + " ld 10,80(%0)\n" + " ld 11,88(%0)\n" + " ld 12,96(%0)\n" + " ld 13,104(%0)\n" + " ld 14,112(%0)\n" + " ld 15,120(%0)\n" + : + : "a" (fpt_save_area) + : "memory"); + } else { + /* Validate vector registers */ + union ctlreg0 cr0; + /* - * Access registers have unknown contents. - * Terminating task. + * The vector validity must only be checked if not running a + * KVM guest. For KVM guests the machine check is forwarded by + * KVM and it is the responsibility of the guest to take + * appropriate actions. The host vector or FPU values have been + * saved by KVM and will be restored by KVM. */ - kill_task = 1; + if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; + cr0.reg = S390_lowcore.cregs_save_area[0]; + cr0.afp = cr0.vx = 1; + local_ctl_load(0, &cr0.reg); + asm volatile( + " la 1,%0\n" + " VLM 0,15,0,1\n" + " VLM 16,31,256,1\n" + : + : "Q" (*(struct vx_array *)mcesa->vector_save_area) + : "1"); + local_ctl_load(0, &S390_lowcore.cregs_save_area[0]); } - /* Check guarded storage registers */ - cr2.val = S390_lowcore.cregs_save_area[2]; + /* Validate access registers */ + asm volatile( + " lam 0,15,0(%0)\n" + : + : "a" (&S390_lowcore.access_regs_save_area) + : "memory"); + if (!mci.ar) + kill_task = 1; + /* Validate guarded storage registers */ + cr2.reg = S390_lowcore.cregs_save_area[2]; if (cr2.gse) { if (!mci.gs) { /* - * Guarded storage register can't be restored and - * the current processes uses guarded storage. - * It has to be terminated. + * 2 cases: + * - machine check in kernel or userspace + * - machine check while running SIE (KVM guest) + * For kernel or userspace the userspace values of + * guarded storage control cannot be recreated, the + * process must be terminated. + * For SIE the guest values of guarded storage cannot + * be recreated. This is either due to a bug or due to + * GS being disabled in the guest. The guest will be + * notified by KVM code and the guest's machine check + * handling must take care of this. The host values + * are saved by KVM and are not affected. */ - kill_task = 1; + if (!test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; + } else { + load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); } } - /* Check if old PSW is valid */ - if (!mci.wp) { - /* - * Can't tell if we come from user or kernel mode - * -> stopping machine. - */ - s390_handle_damage(); - } - /* Check for invalid kernel instruction address */ - if (!mci.ia && !umode) { - /* - * The instruction address got lost while running - * in the kernel -> stopping machine. - */ - s390_handle_damage(); - } + /* + * The getcpu vdso syscall reads CPU number from the programmable + * field of the TOD clock. Disregard the TOD programmable register + * validity bit and load the CPU number into the TOD programmable + * field unconditionally.
+ */ + set_tod_programmable_field(raw_smp_processor_id()); + /* Validate clock comparator register */ + set_clock_comparator(S390_lowcore.clock_comparator); if (!mci.ms || !mci.pm || !mci.ia) kill_task = 1; return kill_task; } -NOKPROBE_SYMBOL(s390_check_registers); +NOKPROBE_SYMBOL(s390_validate_registers); /* * Backup the guest's machine check info to its description block @@ -305,8 +341,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) struct sie_page *sie_page; /* r14 contains the sie block, which was set in sie64a */ - struct kvm_s390_sie_block *sie_block = - (struct kvm_s390_sie_block *) regs->gprs[14]; + struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]); if (sie_block == NULL) /* Something's seriously wrong, stop system. */ @@ -340,19 +375,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs) static unsigned long long last_ipd; struct mcck_struct *mcck; unsigned long long tmp; + irqentry_state_t irq_state; union mci mci; unsigned long mcck_dam_code; + int mcck_pending = 0; + + irq_state = irqentry_nmi_enter(regs); - nmi_enter(); + if (user_mode(regs)) + update_timer_mcck(); inc_irq_stat(NMI_NMI); mci.val = S390_lowcore.mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); - if (mci.sd) { - /* System damage -> stopping machine */ - s390_handle_damage(); - } - /* * Reinject the instruction processing damages' machine checks * including Delayed Access Exception into the guest @@ -393,14 +428,16 @@ void notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_check_registers(mci, user_mode(regs))) { + if (s390_validate_registers(mci)) { + if (!user_mode(regs)) + s390_handle_damage(); /* * Couldn't restore all register contents for the * user space process -> mark task for termination. */ mcck->kill_task = 1; mcck->mcck_code = mci.val; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } /* @@ -420,34 +457,32 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck->stp_queue |= stp_sync_check(); if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); - if (mcck->stp_queue) - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } - /* * Reinject storage related machine checks into the guest if they * happen when the guest is running. 
*/ if (!test_cpu_flag(CIF_MCCK_GUEST)) { + /* Storage error uncorrected */ if (mci.se) - /* Storage error uncorrected */ s390_handle_damage(); + /* Storage key-error uncorrected */ if (mci.ke) - /* Storage key-error uncorrected */ s390_handle_damage(); + /* Storage degradation */ if (mci.ds && mci.fa) - /* Storage degradation */ s390_handle_damage(); } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } if (mci.w) { /* Warning pending */ mcck->warning = 1; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } /* @@ -462,15 +497,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs) *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; } clear_cpu_flag(CIF_MCCK_GUEST); - nmi_exit(); + + if (mcck_pending) + schedule_mcck_handler(); + + irqentry_nmi_exit(regs, irq_state); } NOKPROBE_SYMBOL(s390_do_machine_check); static int __init machine_check_init(void) { - ctl_set_bit(14, 25); /* enable external damage MCH */ - ctl_set_bit(14, 27); /* enable system recovery MCH */ - ctl_set_bit(14, 24); /* enable warning MCH */ + system_ctl_set_bit(14, CR14_EXTERNAL_DAMAGE_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_RECOVERY_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_WARNING_SUBMASK_BIT); return 0; } early_initcall(machine_check_init); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 29e511f5bf06..d1b16d83e49a 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -14,14 +14,14 @@ static int __init nobp_setup_early(char *str) return rc; if (enabled && test_facility(82)) { /* - * The user explicitely requested nobp=1, enable it and + * The user explicitly requested nobp=1, enable it and * disable the expoline support. */ - __set_facility(82, S390_lowcore.alt_stfle_fac_list); + __set_facility(82, alt_stfle_fac_list); if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_disable = 1; } else { - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } return 0; } @@ -29,7 +29,7 @@ early_param("nobp", nobp_setup_early); static int __init nospec_setup_early(char *str) { - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); return 0; } early_param("nospec", nospec_setup_early); @@ -38,9 +38,9 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); - if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) + if (__test_facility(82, alt_stfle_fac_list)) pr_info("Spectre V2 mitigation: limited branch prediction\n"); return 0; } @@ -66,14 +66,14 @@ void __init nospec_auto_detect(void) */ if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. 
*/ nospec_disable = 0; - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } /* * If the kernel has not been compiled with expolines the @@ -86,7 +86,7 @@ static int __init spectre_v2_setup_early(char *str) { if (str && !strncmp(str, "on", 2)) { nospec_disable = 0; - __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + __clear_facility(82, alt_stfle_fac_list); } if (str && !strncmp(str, "off", 3)) nospec_disable = 1; @@ -99,11 +99,13 @@ early_param("spectre_v2", spectre_v2_setup_early); static void __init_or_module __nospec_revert(s32 *start, s32 *end) { enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type; + static const u8 branch[] = { 0x47, 0x00, 0x07, 0x00 }; u8 *instr, *thunk, *br; u8 insnbuf[6]; s32 *epo; /* Second part of the instruction replace is always a nop */ + memcpy(insnbuf + 2, branch, sizeof(branch)); for (epo = start; epo < end; epo++) { instr = (u8 *) epo + *epo; if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04) @@ -116,42 +118,20 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) if (thunk[0] == 0xc6 && thunk[1] == 0x00) /* exrl %r0,<target-br> */ br = thunk + (*(int *)(thunk + 2)) * 2; - else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 && - thunk[6] == 0x44 && thunk[7] == 0x00 && - (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 && - (thunk[1] & 0xf0) == (thunk[8] & 0xf0)) - /* larl %rx,<target br> + ex %r0,0(%rx) */ - br = thunk + (*(int *)(thunk + 2)) * 2; else continue; - /* Check for unconditional branch 0x07f? or 0x47f???? */ - if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0) + if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) continue; - - memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4); switch (type) { case BRCL_EXPOLINE: + /* brcl to thunk, replace with br + nop */ insnbuf[0] = br[0]; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brcl to b, replace with bc + nopr */ - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brcl to br, replace with bcr + nop */ - } break; case BRASL_EXPOLINE: + /* brasl to thunk, replace with basr + nop */ + insnbuf[0] = 0x0d; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brasl to b, replace with bas + nopr */ - insnbuf[0] = 0x4d; - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brasl to br, replace with basr + nop */ - insnbuf[0] = 0x0d; - } break; } diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index 48f472bf9290..52d4353188ad 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -15,9 +15,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (test_facility(156)) return sprintf(buf, "Mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) return sprintf(buf, "Mitigation: execute trampolines\n"); - if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) + if (__test_facility(82, alt_stfle_fac_list)) return sprintf(buf, "Mitigation: limited branch prediction\n"); return sprintf(buf, "Vulnerable\n"); } diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c new file mode 100644 index 000000000000..23ab9f02f278 --- /dev/null +++ b/arch/s390/kernel/numa.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NUMA support for s390 + * + * Implement NUMA core code. + * + * Copyright IBM Corp. 
2015 + */ + +#include <linux/kernel.h> +#include <linux/mmzone.h> +#include <linux/cpumask.h> +#include <linux/memblock.h> +#include <linux/node.h> +#include <asm/numa.h> + +struct pglist_data *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +void __init numa_setup(void) +{ + int nid; + + nodes_clear(node_possible_map); + node_set(0, node_possible_map); + node_set_online(0); + for (nid = 0; nid < MAX_NUMNODES; nid++) { + NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); + if (!NODE_DATA(nid)) + panic("%s: Failed to allocate %zu bytes align=0x%x\n", + __func__, sizeof(pg_data_t), 8); + } + NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; + NODE_DATA(0)->node_id = 0; +} diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 0a5e4bafb6ad..6e1824141b29 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -13,8 +13,10 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <asm/checksum.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> +#include <asm/maccess.h> +#include <asm/asm-offsets.h> /* * OS info structure has to be page aligned @@ -45,24 +47,26 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size) */ void os_info_entry_add(int nr, void *ptr, u64 size) { - os_info.entry[nr].addr = (u64)(unsigned long)ptr; + os_info.entry[nr].addr = __pa(ptr); os_info.entry[nr].size = size; os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0); os_info.csum = os_info_csum(&os_info); } /* - * Initialize OS info struture and set lowcore pointer + * Initialize OS info structure and set lowcore pointer */ void __init os_info_init(void) { - void *ptr = &os_info; + struct lowcore *abs_lc; os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; os_info.csum = os_info_csum(&os_info); - mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr); + abs_lc = get_abs_lowcore(); + abs_lc->os_info = __pa(&os_info); + put_abs_lowcore(abs_lc); } #ifdef CONFIG_CRASH_DUMP @@ -90,7 +94,7 @@ static void os_info_old_alloc(int nr, int align) goto fail; } buf_align = PTR_ALIGN(buf, align); - if (copy_oldmem_kernel(buf_align, (void *) addr, size)) { + if (copy_oldmem_kernel(buf_align, addr, size)) { msg = "copy failed"; goto fail_free; } @@ -121,17 +125,16 @@ static void os_info_old_init(void) if (os_info_init) return; - if (!OLDMEM_BASE) + if (!oldmem_data.start) goto fail; - if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) goto fail; if (addr == 0 || addr % PAGE_SIZE) goto fail; os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL); if (!os_info_old) goto fail; - if (copy_oldmem_kernel(os_info_old, (void *) addr, - sizeof(*os_info_old))) + if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old))) goto fail_free; if (os_info_old->magic != OS_INFO_MAGIC) goto fail_free; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 0eb1d1cc53a8..41ed6e0f0a2a 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,9 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2019 + * Copyright IBM Corp. 
2012, 2023 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> + * Thomas Richter <tmricht@linux.ibm.com> */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -14,7 +15,574 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/export.h> -#include <asm/cpu_mcf.h> +#include <linux/miscdevice.h> +#include <linux/perf_event.h> + +#include <asm/cpu_mf.h> +#include <asm/hwctrset.h> +#include <asm/debug.h> + +enum cpumf_ctr_set { + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 + +static inline void ctr_set_enable(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; +} + +static inline void ctr_set_disable(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); +} + +static inline void ctr_set_start(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; +} + +static inline void ctr_set_stop(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) +{ + switch (set) { + case CPUMF_CTR_SET_BASIC: + return stcctm(BASIC, range, dest); + case CPUMF_CTR_SET_USER: + return stcctm(PROBLEM_STATE, range, dest); + case CPUMF_CTR_SET_CRYPTO: + return stcctm(CRYPTO_ACTIVITY, range, dest); + case CPUMF_CTR_SET_EXT: + return stcctm(EXTENDED, range, dest); + case CPUMF_CTR_SET_MT_DIAG: + return stcctm(MT_DIAG_CLEARING, range, dest); + case CPUMF_CTR_SET_MAX: + return 3; + } + return 3; +} + +struct cpu_cf_events { + refcount_t refcnt; /* Reference count */ + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state; /* For perf_event_open SVC */ + u64 dev_state; /* For /dev/hwctr */ + unsigned int flags; + size_t used; /* Bytes used in data */ + size_t usedss; /* Bytes used in start/stop */ + unsigned char start[PAGE_SIZE]; /* Counter set at event add */ + unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ + unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ + unsigned int sets; /* # Counter set saved in memory */ +}; + +static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ +static debug_info_t *cf_dbg; + +/* + * The CPU Measurement query counter information instruction contains + * information which varies per machine generation, but is constant and + * does not change when running on a particular machine, such as counter + * first and second version number. This is needed to determine the size + * of counter sets. Extract this information at device driver initialization. + */ +static struct cpumf_ctr_info cpumf_ctr_info; + +struct cpu_cf_ptr { + struct cpu_cf_events *cpucf; +}; + +static struct cpu_cf_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct cpu_cf_ptr __percpu *cfptr; +} cpu_cf_root; + +/* + * Serialize event initialization and event removal. Both are called from + * user space in task context with perf_event_open() and close() + * system calls. 
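+ * (For illustration: without this serialization, two perf_event_open()
+ * calls racing on an idle PMU could both observe cpu_cf_root.refcnt as
+ * zero and each allocate the per CPU pointer array; with it, one
+ * allocates and the other merely takes a reference.)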
+ * + * This mutex serializes functions cpum_cf_alloc_cpu() called at event + * initialization via cpumf_pmu_event_init() and function cpum_cf_free_cpu() + * called at event removal via call back function hw_perf_event_destroy() + * when the event is deleted. They are serialized to enforce correct + * bookkeeping of pointer and reference counts anchored by + * struct cpu_cf_root and the access to cpu_cf_root::refcnt and the + * per CPU pointers stored in cpu_cf_root::cfptr. + */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Get pointer to per-cpu structure. + * + * Function get_cpu_cfhw() is called from + * - cfset_all_copy(): This function is protected by cpus_read_lock(), so + * CPU hotplug remove cannot happen. Event removal requires a close() + * first. + * + * Function this_cpu_cfhw() is called from perf common code functions: + * - pmu_{en|dis}able(), pmu_{add|del}() and pmu_{start|stop}(): + * All functions execute with interrupts disabled on that particular CPU. + * - cfset_ioctl_{on|off}, cfset_cpu_read(): see comment cfset_all_copy(). + * + * Therefore it is safe to access the CPU specific pointer to the event. + */ +static struct cpu_cf_events *get_cpu_cfhw(int cpu) +{ + struct cpu_cf_ptr __percpu *p = cpu_cf_root.cfptr; + + if (p) { + struct cpu_cf_ptr *q = per_cpu_ptr(p, cpu); + + return q->cpucf; + } + return NULL; +} + +static struct cpu_cf_events *this_cpu_cfhw(void) +{ + return get_cpu_cfhw(smp_processor_id()); +} + +/* Disable counter sets on dedicated CPU */ +static void cpum_cf_reset_cpu(void *flags) +{ + lcctl(0); +} + +/* Free per CPU data when the last event is removed. */ +static void cpum_cf_free_root(void) +{ + if (!refcount_dec_and_test(&cpu_cf_root.refcnt)) + return; + free_percpu(cpu_cf_root.cfptr); + cpu_cf_root.cfptr = NULL; + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + debug_sprintf_event(cf_dbg, 4, "%s root.refcnt %u cfptr %d\n", + __func__, refcount_read(&cpu_cf_root.refcnt), + !cpu_cf_root.cfptr); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int cpum_cf_alloc_root(void) +{ + int rc = 0; + + if (refcount_inc_not_zero(&cpu_cf_root.refcnt)) + return rc; + + /* The memory is already zeroed. */ + cpu_cf_root.cfptr = alloc_percpu(struct cpu_cf_ptr); + if (cpu_cf_root.cfptr) { + refcount_set(&cpu_cf_root.refcnt, 1); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + } else { + rc = -ENOMEM; + } + + return rc; +} + +/* Free CPU counter data structure for a PMU */ +static void cpum_cf_free_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + + mutex_lock(&pmc_reserve_mutex); + /* + * When invoked via CPU hotplug handler, there might be no events + * installed or that particular CPU might not have an + * event installed. This anchor pointer can be NULL! + */ + if (!cpu_cf_root.cfptr) + goto out; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + /* + * Might be NULL when called from the CPU hotplug handler if no event + * is installed on this CPU, only on other CPUs. + */ + if (!cpuhw) + goto out; + + if (refcount_dec_and_test(&cpuhw->refcnt)) { + kfree(cpuhw); + p->cpucf = NULL; + } + cpum_cf_free_root(); +out: + mutex_unlock(&pmc_reserve_mutex); +}
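For illustration, a short trace of the reference counting implemented by cpum_cf_free_cpu() above and cpum_cf_alloc_cpu() below; the CPU number is arbitrary and the sequence, two events opened and then closed on one CPU, is a sketch rather than part of the patch:

/*
 *	cpum_cf_alloc_cpu(3);	root.refcnt 0->1, cpuhw allocated, refcnt 1
 *	cpum_cf_alloc_cpu(3);	root.refcnt 1->2, cpuhw->refcnt 1->2
 *	cpum_cf_free_cpu(3);	cpuhw->refcnt 2->1, root.refcnt 2->1
 *	cpum_cf_free_cpu(3);	cpuhw freed; last root reference gone, so
 *				free_percpu(cfptr) and lcctl(0) on all CPUs
 */

+/* Allocate CPU counter data structure for a PMU. Serialized by pmc_reserve_mutex. */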
+static int cpum_cf_alloc_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + int rc; + + mutex_lock(&pmc_reserve_mutex); + rc = cpum_cf_alloc_root(); + if (rc) + goto unlock; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + + if (!cpuhw) { + cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL); + if (cpuhw) { + p->cpucf = cpuhw; + refcount_set(&cpuhw->refcnt, 1); + } else { + rc = -ENOMEM; + } + } else { + refcount_inc(&cpuhw->refcnt); + } + if (rc) { + /* + * Error in allocation of event, decrement anchor. Since + * cpu_cf_event is not created, its destroy() function is not + * invoked. Adjust the reference counter for the anchor. + */ + cpum_cf_free_root(); + } +unlock: + mutex_unlock(&pmc_reserve_mutex); + return rc; +} + +/* + * Create/delete per CPU data structures for /dev/hwctr interface and events + * created by perf_event_open(). + * If cpu is -1, track task on all available CPUs. This requires + * allocation of hardware data structures for all CPUs. This setup handles + * perf_event_open() with task context and /dev/hwctr interface. + * If cpu is not -1, install the event on this CPU only. This setup handles + * perf_event_open() with CPU context. + */ +static int cpum_cf_alloc(int cpu) +{ + cpumask_var_t mask; + int rc; + + if (cpu == -1) { + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + for_each_online_cpu(cpu) { + rc = cpum_cf_alloc_cpu(cpu); + if (rc) { + for_each_cpu(cpu, mask) + cpum_cf_free_cpu(cpu); + break; + } + cpumask_set_cpu(cpu, mask); + } + free_cpumask_var(mask); + } else { + rc = cpum_cf_alloc_cpu(cpu); + } + return rc; +} + +static void cpum_cf_free(int cpu) +{ + if (cpu == -1) { + for_each_online_cpu(cpu) + cpum_cf_free_cpu(cpu); + } else { + cpum_cf_free_cpu(cpu); + } +} + +#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ + +/* Counter sets are stored as a data stream in a page sized memory buffer and + * exported to user space via raw data attached to the event sample data. + * Each counter set starts with an eight byte header consisting of: + * - a two byte eye catcher (0xfeef) + * - a two byte counter set number + * - a two byte counter set size (indicates the number of counters in this set) + * - a two byte reserved value (must be zero) to make the header the same + * size as a counter value. + * All counter values are eight bytes in size. + * + * All counter sets are followed by a 64 byte trailer. + * The trailer consists of a: + * - flag field indicating valid fields when corresponding bit set + * - the counter facility first and second version number + * - the CPU speed if nonzero + * - the time stamp the counter sets have been collected + * - the time of day (TOD) base value + * - the machine type. + * + * The counter sets are saved when the process is prepared to be executed on a + * CPU and saved again when the process is going to be removed from a CPU. + * The difference of both counter sets is calculated and stored in the event + * sample data area. + */
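To make the layout concrete, a sketch for a machine whose basic counter set holds six counters, plus a helper restating the size rule used by cfdiag_getctrset() below (the helper name is illustrative and not part of the file):

/*
 *	0x00  cf_ctrset_entry: def = 0xfeef, set, ctr = 6, res1 = 0
 *	0x08  six u64 counter values
 *	0x38  cf_trailer_entry (64 bytes) terminates the stream
 */
static size_t example_ctrset_bytes(u16 counters)
{
	/* eight byte header plus one eight byte value per counter */
	return sizeof(struct cf_ctrset_entry) + counters * sizeof(u64);
}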
+struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 bytes) */ + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int set:16; /* 16-31 Counter set identifier */ + unsigned int ctr:16; /* 32-47 Number of stored counters */ + unsigned int res1:16; /* 48-63 Reserved */ +}; + +struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 bytes) */ + /* 0 - 7 */ + union { + struct { + unsigned int clock_base:1; /* TOD clock base set */ + unsigned int speed:1; /* CPU speed set */ + /* Measurement alerts */ + unsigned int mtda:1; /* Loss of MT ctr. data alert */ + unsigned int caca:1; /* Counter auth. change alert */ + unsigned int lcda:1; /* Loss of counter data alert */ + }; + unsigned long flags; /* 0-63 All indicators */ + }; + /* 8 - 15 */ + unsigned int cfvn:16; /* 64-79 Ctr First Version */ + unsigned int csvn:16; /* 80-95 Ctr Second Version */ + unsigned int cpu_speed:32; /* 96-127 CPU speed */ + /* 16 - 23 */ + unsigned long timestamp; /* 128-191 Timestamp (TOD) */ + /* 24 - 55 */ + union { + struct { + unsigned long progusage1; + unsigned long progusage2; + unsigned long progusage3; + unsigned long tod_base; + }; + unsigned long progusage[4]; + }; + /* 56 - 63 */ + unsigned int mach_type:16; /* Machine type */ + unsigned int res1:16; /* Reserved */ + unsigned int res2:32; /* Reserved */ +}; + +/* Create the trailer data at the end of a page. */ +static void cfdiag_trailer(struct cf_trailer_entry *te) +{ + struct cpuid cpuid; + + te->cfvn = cpumf_ctr_info.cfvn; /* Counter version numbers */ + te->csvn = cpumf_ctr_info.csvn; + + get_cpu_id(&cpuid); /* Machine type */ + te->mach_type = cpuid.machine; + te->cpu_speed = cfdiag_cpu_speed; + if (te->cpu_speed) + te->speed = 1; + te->clock_base = 1; /* Save clock base */ + te->tod_base = tod_clock_base.tod; + te->timestamp = get_tod_clock_fast(); +} + +/* + * The number of counters per counter set varies between machine generations, + * but is constant when running on a particular machine generation. + * Determine each counter set size at device driver initialization and + * retrieve it later. + */ +static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX]; +static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (cpumf_ctr_info.cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (cpumf_ctr_info.cfvn == 1) + ctrset_size = 6; + else if (cpumf_ctr_info.cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 16; + else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (cpumf_ctr_info.csvn == 1) + ctrset_size = 32; + else if (cpumf_ctr_info.csvn == 2) + ctrset_size = 48; + else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 128; + else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpumf_ctr_info.csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + cpumf_ctr_setsizes[ctrset] = ctrset_size; +} + +/* + * Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. + */ +static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset) +{ + return cpumf_ctr_setsizes[ctrset]; +} + +/* Read a counter set. 
The counter set number determines the counter set and + * the CPUM-CF first and second version number determine the number of + * available counters in each counter set. + * Each counter set starts with a header containing the counter set number and + * the number of eight byte counters. + * + * The function returns the number of bytes occupied by this counter set + * including the header. + * If there is no counter in the counter set, this counter set is useless and + * zero is returned in this case. + * + * Note that the counter sets may not be enabled or active and the stcctm + * instruction might return error 3. Depending on the error_ok value this is + * ok, for example when called from the cpumf_pmu_start() call back function. + */ +static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, + size_t room, bool error_ok) +{ + size_t ctrset_size, need = 0; + int rc = 3; /* Assume write failure */ + + ctrdata->def = CF_DIAG_CTRSET_DEF; + ctrdata->set = ctrset; + ctrdata->res1 = 0; + ctrset_size = cpum_cf_read_setsize(ctrset); + + if (ctrset_size) { /* Save data */ + need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); + if (need <= room) { + rc = ctr_stcctm(ctrset, ctrset_size, + (u64 *)(ctrdata + 1)); + } + if (rc != 3 || error_ok) + ctrdata->ctr = ctrset_size; + else + need = 0; + } + + return need; +} + +static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { + [CPUMF_CTR_SET_BASIC] = 0x02, + [CPUMF_CTR_SET_USER] = 0x04, + [CPUMF_CTR_SET_CRYPTO] = 0x08, + [CPUMF_CTR_SET_EXT] = 0x01, + [CPUMF_CTR_SET_MT_DIAG] = 0x20, +}; + +/* Read out all counter sets and save them in the provided data buffer. + * The last 64 bytes hold an artificial trailer entry. + */ +static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, + bool error_ok) +{ + struct cf_trailer_entry *trailer; + size_t offset = 0, done; + int i; + + memset(data, 0, sz); + sz -= sizeof(*trailer); /* Always room for trailer */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + struct cf_ctrset_entry *ctrdata = data + offset; + + if (!(auth & cpumf_ctr_ctl[i])) + continue; /* Counter set not authorized */ + + done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); + offset += done; + } + trailer = data + offset; + cfdiag_trailer(trailer); + return offset + sizeof(*trailer); +} + +/* Calculate the difference for each counter in a counter set. */ +static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) +{ + for (; --counters >= 0; ++pstart, ++pstop) + if (*pstop >= *pstart) + *pstop -= *pstart; + else + *pstop = *pstart - *pstop + 1; +} + +/* Scan the counter sets and calculate the difference of each counter + * in each set. The result is the increment of each counter during the + * period the counter set has been activated. + * + * Return true on success. Zero is returned when the start and stop + * header blocks of a counter set do not match. 
+ */ +static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) +{ + struct cf_trailer_entry *trailer_start, *trailer_stop; + struct cf_ctrset_entry *ctrstart, *ctrstop; + size_t offset = 0; + + auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; + do { + ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); + ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { + pr_err_once("cpum_cf_diag counter set compare error " + "in set %i\n", ctrstart->set); + return 0; + } + auth &= ~cpumf_ctr_ctl[ctrstart->set]; + if (ctrstart->def == CF_DIAG_CTRSET_DEF) { + cfdiag_diffctrset((u64 *)(ctrstart + 1), + (u64 *)(ctrstop + 1), ctrstart->ctr); + offset += ctrstart->ctr * sizeof(u64) + + sizeof(*ctrstart); + } + } while (ctrstart->def && auth); + + /* Save time_stamp from start of event in stop's trailer */ + trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); + trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); + trailer_stop->progusage[0] = trailer_start->timestamp; + + return 1; +} static enum cpumf_ctr_set get_counter_set(u64 event) { @@ -34,39 +602,35 @@ static enum cpumf_ctr_set get_counter_set(u64 event) return set; } -static int validate_ctr_version(const struct hw_perf_event *hwc) +static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set) { - struct cpu_cf_events *cpuhw; - int err = 0; u16 mtdiag_ctl; - - cpuhw = &get_cpu_var(cpu_cf_events); + int err = 0; /* check required version for counter sets */ - switch (hwc->config_base) { + switch (set) { case CPUMF_CTR_SET_BASIC: case CPUMF_CTR_SET_USER: - if (cpuhw->info.cfvn < 1) + if (cpumf_ctr_info.cfvn < 1) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_CRYPTO: - if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && - hwc->config > 79) || - (cpuhw->info.csvn >= 6 && hwc->config > 83)) + if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 && + config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_EXT: - if (cpuhw->info.csvn < 1) + if (cpumf_ctr_info.csvn < 1) err = -EOPNOTSUPP; - if ((cpuhw->info.csvn == 1 && hwc->config > 159) || - (cpuhw->info.csvn == 2 && hwc->config > 175) || - (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 - && hwc->config > 255) || - (cpuhw->info.csvn >= 6 && hwc->config > 287)) + if ((cpumf_ctr_info.csvn == 1 && config > 159) || + (cpumf_ctr_info.csvn == 2 && config > 175) || + (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 && + config > 255) || + (cpumf_ctr_info.csvn >= 6 && config > 287)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MT_DIAG: - if (cpuhw->info.csvn <= 3) + if (cpumf_ctr_info.csvn <= 3) err = -EOPNOTSUPP; /* * MT-diagnostic counters are read-only. The counter set @@ -81,35 +645,15 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) * counter set is enabled and active. 
*/ mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]; - if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && - (cpuhw->info.enable_ctl & mtdiag_ctl) && - (cpuhw->info.act_ctl & mtdiag_ctl))) + if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) && + (cpumf_ctr_info.enable_ctl & mtdiag_ctl) && + (cpumf_ctr_info.act_ctl & mtdiag_ctl))) err = -EOPNOTSUPP; break; + case CPUMF_CTR_SET_MAX: + err = -EOPNOTSUPP; } - put_cpu_var(cpu_cf_events); - return err; -} - -static int validate_ctr_auth(const struct hw_perf_event *hwc) -{ - struct cpu_cf_events *cpuhw; - u64 ctrs_state; - int err = 0; - - cpuhw = &get_cpu_var(cpu_cf_events); - - /* Check authorization for cpu counter sets. - * If the particular CPU counter set is not authorized, - * return with -ENOENT in order to fall back to other - * PMUs that might suffice the event request. - */ - ctrs_state = cpumf_ctr_ctl[hwc->config_base]; - if (!(ctrs_state & cpuhw->info.auth_ctl)) - err = -ENOENT; - - put_cpu_var(cpu_cf_events); return err; } @@ -120,20 +664,17 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc) */ static void cpumf_pmu_enable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int err; - if (cpuhw->flags & PMU_F_ENABLED) - return; - - err = lcctl(cpuhw->state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); + if (!cpuhw || (cpuhw->flags & PMU_F_ENABLED)) return; - } - cpuhw->flags |= PMU_F_ENABLED; + err = lcctl(cpuhw->state | cpuhw->dev_state); + if (err) + pr_err("Enabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags |= PMU_F_ENABLED; } /* @@ -143,39 +684,26 @@ static void cpumf_pmu_enable(struct pmu *pmu) */ static void cpumf_pmu_disable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); u64 inactive; + int err; - if (!(cpuhw->flags & PMU_F_ENABLED)) + if (!cpuhw || !(cpuhw->flags & PMU_F_ENABLED)) return; inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + inactive |= cpuhw->dev_state; err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - - cpuhw->flags &= ~PMU_F_ENABLED; + if (err) + pr_err("Disabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags &= ~PMU_F_ENABLED; } - -/* Number of perf events counting hardware events */ -static atomic_t num_events = ATOMIC_INIT(0); -/* Used to avoid races in calling reserve/release_cpumf_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - /* Release the PMU if event is the last perf event */ static void hw_perf_event_destroy(struct perf_event *event) { - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - __kernel_cpumcf_end(); - mutex_unlock(&pmc_reserve_mutex); - } + cpum_cf_free(event->cpu); } /* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ @@ -199,12 +727,17 @@ static const int cpumf_generic_events_user[] = { [PERF_COUNT_HW_BUS_CYCLES] = -1, }; +static int is_userspace_event(u64 ev) +{ + return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; +} + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; enum cpumf_ctr_set set; - int err 
= 0; u64 ev; switch (type) { @@ -221,21 +754,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) if (is_sampling_event(event)) /* No sampling support */ return -ENOENT; ev = attr->config; - /* Count user space (problem-state) only */ if (!attr->exclude_user && attr->exclude_kernel) { - if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_user[ev]; - - /* No support for kernel space counters only */ + /* + * Count user space (problem-state) only + * Handle events 32 and 33 as 0:u and 1:u + */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + } } else if (!attr->exclude_kernel && attr->exclude_user) { + /* No support for kernel space counters only */ return -EOPNOTSUPP; - - /* Count user and kernel space */ } else { - if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_basic[ev]; + /* Count user and kernel space, incl. events 32 + 33 */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } } break; @@ -260,36 +798,51 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) /* * Use the hardware perf event structure to store the * counter number in the 'config' member and the counter - * set number in the 'config_base'. The counter set number - * is then later used to enable/disable the counter(s). + * set number in the 'config_base' as bit mask. + * It is later used to enable/disable the counter(s). */ hwc->config = ev; - hwc->config_base = set; + hwc->config_base = cpumf_ctr_ctl[set]; break; case CPUMF_CTR_SET_MAX: /* The counter could not be associated to a counter set */ return -EINVAL; - }; + } /* Initialize for using the CPU-measurement counter facility */ - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; event->destroy = hw_perf_event_destroy; - /* Finally, validate version and authorization of the counter set */ - err = validate_ctr_auth(hwc); - if (!err) - err = validate_ctr_version(hwc); + /* + * Finally, validate version and authorization of the counter set. + * If the particular CPU counter set is not authorized, + * return with -ENOENT in order to fall back to other + * PMUs that might satisfy the event request. + */ + if (!(hwc->config_base & cpumf_ctr_info.auth_ctl)) + return -ENOENT; + return validate_ctr_version(hwc->config, set); +} - return err; +/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different + * attribute::type values: + * - PERF_TYPE_HARDWARE: + * - pmu->type: + * Handle both types of invocation identically. They address the same hardware. + * The result is different when event modifiers exclude_kernel and/or + * exclude_user are also set. 
+ */ +static int cpumf_pmu_event_type(struct perf_event *event) +{ + u64 ev = event->attr.config; + + if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev) + return PERF_TYPE_HARDWARE; + return PERF_TYPE_RAW; } static int cpumf_pmu_event_init(struct perf_event *event) @@ -301,7 +854,7 @@ static int cpumf_pmu_event_init(struct perf_event *event) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) /* Registered as unknown PMU */ - err = __hw_perf_event_init(event, PERF_TYPE_RAW); + err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); else return -ENOENT; @@ -361,18 +914,13 @@ static void cpumf_pmu_read(struct perf_event *event) static void cpumf_pmu_start(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; + int i; - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(hwc->config == -1)) + if (!(hwc->state & PERF_HES_STOPPED)) return; - if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; /* (Re-)enable and activate the counter set */ @@ -384,45 +932,95 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) * needs to be synchronized. At this point, the counter set can be in * the inactive or disabled state. */ - hw_perf_event_reset(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + cpuhw->usedss = cfdiag_getctr(cpuhw->start, + sizeof(cpuhw->start), + hwc->config_base, true); + } else { + hw_perf_event_reset(event); + } - /* increment refcount for this counter set */ - atomic_inc(&cpuhw->ctr_set[hwc->config_base]); + /* Increment refcount for counter sets */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if ((hwc->config_base & cpumf_ctr_ctl[i])) + atomic_inc(&cpuhw->ctr_set[i]); +} + +/* Create perf event sample with the counter sets as raw data. The sample + * is then pushed to the event subsystem and the function checks for + * possible event overflows. If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. 
+ */ +static int cfdiag_push_sample(struct perf_event *event, + struct cpu_cf_events *cpuhw) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = event->cpu; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = cpuhw->usedss; + raw.frag.data = cpuhw->stop; + perf_sample_save_raw_data(&data, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + if (overflow) + event->pmu->stop(event, 0); + + perf_event_update_userpage(event); + return overflow; } static void cpumf_pmu_stop(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) { /* Decrement reference count for this counter set and if this * is the last used counter in the set, clear activation * control and set the counter set state to inactive. */ - if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) - ctr_set_stop(&cpuhw->state, hwc->config_base); - event->hw.state |= PERF_HES_STOPPED; + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(hwc->config_base & cpumf_ctr_ctl[i])) + continue; + if (!atomic_dec_return(&cpuhw->ctr_set[i])) + ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); + } + hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event); - event->hw.state |= PERF_HES_UPTODATE; + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + local64_inc(&event->count); + cpuhw->usedss = cfdiag_getctr(cpuhw->stop, + sizeof(cpuhw->stop), + event->hw.config_base, + false); + if (cfdiag_diffctr(cpuhw, event->hw.config_base)) + cfdiag_push_sample(event, cpuhw); + } else { + hw_perf_event_update(event); + } + hwc->state |= PERF_HES_UPTODATE; } } static int cpumf_pmu_add(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - /* Check authorization for the counter set to which this - * counter belongs. - * For group events transaction, the authorization check is - * done in cpumf_pmu_commit_txn(). - */ - if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD)) - if (validate_ctr_auth(&event->hw)) - return -ENOENT; + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); ctr_set_enable(&cpuhw->state, event->hw.config_base); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -430,14 +1028,13 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) if (flags & PERF_EF_START) cpumf_pmu_start(event, PERF_EF_RELOAD); - perf_event_update_userpage(event); - return 0; } static void cpumf_pmu_del(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + int i; cpumf_pmu_stop(event, PERF_EF_UPDATE); @@ -449,112 +1046,905 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * clear enable control and resets all counters in a set. Therefore, * cpumf_pmu_start() always has to reenable a counter set. 
*/ - if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) - ctr_set_disable(&cpuhw->state, event->hw.config_base); - - perf_event_update_userpage(event); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if (!atomic_read(&cpuhw->ctr_set[i])) + ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); } +/* Performance monitoring unit for s390x */ +static struct pmu cpumf_pmu = { + .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .event_init = cpumf_pmu_event_init, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cpumf_pmu_read, +}; + +static struct cfset_session { /* CPUs and counter set bit mask */ + struct list_head head; /* Head of list of active processes */ +} cfset_session = { + .head = LIST_HEAD_INIT(cfset_session.head) +}; + +static refcount_t cfset_opencnt = REFCOUNT_INIT(0); /* Access count */ /* - * Start group events scheduling transaction. - * Set flags to perform a single test at commit time. + * Synchronize access to device /dev/hwctr. This mutex protects against + * concurrent access to functions cfset_open() and cfset_release(). + * Same for CPU hotplug add and remove events triggering + * cpum_cf_online_cpu() and cpum_cf_offline_cpu(). + * It also serializes concurrent device ioctl access from multiple + * processes accessing /dev/hwctr. * - * We only support PERF_PMU_TXN_ADD transactions. Save the - * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD - * transactions. + * The mutex protects concurrent access to the /dev/hwctr session management + * struct cfset_session and reference counting variable cfset_opencnt. */ -static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) +static DEFINE_MUTEX(cfset_ctrset_mutex); + +/* + * CPU hotplug handles only the /dev/hwctr device. + * For perf_event_open() the CPU hotplug handling is done in kernel common + * code: + * - CPU add: Nothing is done since a file descriptor cannot be created + * and returned to the user. + * - CPU delete: Handled by common code via pmu_disable(), pmu_stop() and + * pmu_delete(). The event itself is removed when the file descriptor is + * closed. + */ +static int cfset_online_cpu(unsigned int cpu); + +static int cpum_cf_online_cpu(unsigned int cpu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int rc = 0; + + /* + * Ignore notification for perf_event_open(). + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + rc = cpum_cf_alloc_cpu(cpu); + if (!rc) + cfset_online_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return rc; +} + +static int cfset_offline_cpu(unsigned int cpu); + +static int cpum_cf_offline_cpu(unsigned int cpu) +{ + /* + * During task exit processing of grouped perf events triggered by CPU + * hotplug processing, pmu_disable() is called as part of perf context + * removal process. Therefore do not trigger event removal now for + * perf_event_open() created events. Perf common code triggers event + * destruction when the event file descriptor is closed. + * + * Handle only /dev/hwctr device sessions. 
+ */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + cfset_offline_cpu(cpu); + cpum_cf_free_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +/* Return true if store counter set multiple instruction is available */ +static inline int stccm_avail(void) +{ + return test_facility(142); +} + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_cf_events *cpuhw; + + if (!(alert & CPU_MF_INT_CF_MASK)) + return; - WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + inc_irq_stat(IRQEXT_CMC); - cpuhw->txn_flags = txn_flags; - if (txn_flags & ~PERF_PMU_TXN_ADD) + /* + * Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. + */ + cpuhw = this_cpu_cfhw(); + if (!cpuhw) return; - perf_pmu_disable(pmu); - cpuhw->tx_state = cpuhw->state; + /* counter authorization change alert */ + if (alert & CPU_MF_INT_CF_CACA) + qctri(&cpumf_ctr_info); + + /* loss of counter data alert */ + if (alert & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); } -/* - * Stop and cancel a group events scheduling tranctions. - * Assumes cpumf_pmu_del() is called for each successful added - * cpumf_pmu_add() during the transaction. +static int cfset_init(void); +static int __init cpumf_pmu_init(void) +{ + int rc; + + /* Extract counter measurement facility information */ + if (!cpum_cf_avail() || qctri(&cpumf_ctr_info)) + return -ENODEV; + + /* Determine and store counter set sizes for later reference */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + cpum_cf_make_setsize(rc); + + /* + * Clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters + */ + system_ctl_clear_bit(0, CR0_CPUMF_EXTRACTION_AUTH_BIT); + + /* register handler for measurement-alert interruptions */ + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc); + return rc; + } + + /* Setup s390dbf facility */ + cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + if (!cf_dbg) { + pr_err("Registration of s390dbf(cpum_cf) failed\n"); + rc = -ENOMEM; + goto out1; + } + debug_register_view(cf_dbg, &debug_sprintf_view); + + cpumf_pmu.attr_groups = cpumf_cf_event_group(); + rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); + if (rc) { + pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + goto out2; + } else if (stccm_avail()) { /* Setup counter set device */ + cfset_init(); + } + + rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "perf/s390/cf:online", + cpum_cf_online_cpu, cpum_cf_offline_cpu); + return rc; + +out2: + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); +out1: + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert); + return rc; +} + +/* Support for the CPU Measurement Facility counter set extraction using + * device /dev/hwctr. This allows user space programs to extract complete + * counter set via normal file operations. 
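+ */

Since this introduces a whole ioctl protocol, a minimal user-space sketch may help. It is a sketch only: the S390_HWCTR_* macros, S390_HWCTR_START_VERSION and the struct layouts are assumed to come from the UAPI header <asm/hwctrset.h>, the field names follow the uses visible in this file, the set mask 0x02 (basic set) is taken from cpumf_ctr_ctl[], and error handling is trimmed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/hwctrset.h>

int main(void)
{
	struct s390_ctrset_read *buf = NULL;
	struct s390_ctrset_start start;
	__u64 mask = 1;				/* CPU 0 only */
	int fd;

	fd = open("/dev/hwctr", O_RDWR);
	if (fd < 0)
		return 1;
	memset(&start, 0, sizeof(start));
	start.version = S390_HWCTR_START_VERSION;
	start.counter_sets = 0x02;		/* basic counter set */
	start.cpumask_len = sizeof(mask);
	start.cpumask = &mask;
	if (ioctl(fd, S390_HWCTR_START, &start) == 0) {
		/* START reports the buffer size needed for READ */
		buf = malloc(start.data_bytes);
		if (buf && ioctl(fd, S390_HWCTR_READ, buf) == 0)
			printf("counter data for %llu CPU(s)\n",
			       (unsigned long long)buf->no_cpus);
		free(buf);
		ioctl(fd, S390_HWCTR_STOP, 0);
	}
	close(fd);
	return 0;
}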
+struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ + unsigned int sets; /* Counter set bit mask */ + atomic_t cpus_ack; /* # CPUs successfully executed func */ +}; + +struct cfset_request { /* CPUs and counter set bit mask */ + unsigned long ctrset; /* Bit mask of counter set to read */ + cpumask_t mask; /* CPU mask to read from */ + struct list_head node; /* Chain to cfset_session.head */ +}; + +static void cfset_session_init(void) +{ + INIT_LIST_HEAD(&cfset_session.head); +} + +/* Remove current request from global bookkeeping. Maintain a counter set bit + * mask on a per CPU basis. + * Done in process context under mutex protection. */ -static void cpumf_pmu_cancel_txn(struct pmu *pmu) +static void cfset_session_del(struct cfset_request *p) { - unsigned int txn_flags; - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + list_del(&p->node); +} - WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ +/* Add current request to global bookkeeping. Maintain a counter set bit mask + * on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_add(struct cfset_request *p) +{ + list_add(&p->node, &cfset_session.head); +} - txn_flags = cpuhw->txn_flags; - cpuhw->txn_flags = 0; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; +/* The /dev/hwctr device access uses PMU_F_IN_USE to mark that the device + * access path is currently in use. + * The cpu_cf_events::dev_state is used to denote counter sets in use by this + * interface. It is always or'ed in. If this interface is not active, its + * value is zero and no additional counter sets will be included. + * + * The cpu_cf_events::state is used by the perf_event_open SVC and remains + * unchanged. + * + * perf_pmu_enable() and perf_pmu_disable() and their callbacks + * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the + * performance measurement subsystem to enable the per process + * CPU Measurement counter facility. + * The XXX_enable() and XXX_disable() functions are used to turn off + * the x86 performance monitoring interrupt (PMI) during scheduling. + * s390 uses these calls to temporarily stop and resume the active CPU + * counter sets during scheduling. + * + * We do allow concurrent use of the perf_event_open() SVC and the /dev/hwctr + * device. The perf_event_open() SVC interface makes a lot of effort + * to only run the counters while the calling process is actively scheduled + * to run. + * When the /dev/hwctr interface is also used at the same time, the counter + * sets will keep running, even when the process is scheduled off a CPU. + * However this is not a problem and does not lead to wrong counter values + * for the perf_event_open() SVC. The current counter value will be recorded + * during schedule-in. At schedule-out time the current counter value is + * extracted again and the delta is calculated and added to the event. + */
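A compact sketch of how the two state words combine, using ctr_set_enable(), ctr_set_start() and the cpumf_ctr_ctl[] values defined earlier in this file; the numbers follow from the 16-bit distance between the enable and activation bit fields (sketch only, not part of the patch):

static void example_combined_lcctl(void)
{
	u64 state = 0, dev_state = 0;

	/* perf_event_open() event counting the basic set (0x02) */
	ctr_set_enable(&state, cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC]);
	ctr_set_start(&state, cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC]);

	/* /dev/hwctr session reading the crypto set (0x08) */
	ctr_set_enable(&dev_state, cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO]);
	ctr_set_start(&dev_state, cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO]);

	/* state == 0x00020002, dev_state == 0x00080008 */
	lcctl(state | dev_state);	/* 0x000a000a: both sets enabled and active */
}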
+/* Stop all counter sets via ioctl interface */ +static void cfset_ioctl_off(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + /* Check if any counter set used by /dev/hwctr */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) { + if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { + ctr_set_disable(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + ctr_set_stop(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + } + } + /* Keep perf_event_open counter sets */ + rc = lcctl(cpuhw->dev_state | cpuhw->state); + if (rc) + pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + if (!cpuhw->dev_state) + cpuhw->flags &= ~PMU_F_IN_USE; +} + +/* Start counter sets on particular CPU */ +static void cfset_ioctl_on(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->flags |= PMU_F_IN_USE; + ctr_set_enable(&cpuhw->dev_state, p->sets); + ctr_set_start(&cpuhw->dev_state, p->sets); + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_inc(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ + if (!rc) + atomic_inc(&p->cpus_ack); + else + pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", + cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); +} + +static void cfset_release_cpu(void *p) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + int rc; + + cpuhw->dev_state = 0; + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); +} + +/* This modifies the process CPU mask to adapt it to the currently online + * CPUs. Offline CPUs cannot be addressed. This call terminates the access + * and is usually followed by close() or a new ioctl(..., START, ...) which + * creates a new request structure. + */ +static void cfset_all_stop(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + }; - WARN_ON(cpuhw->tx_state != cpuhw->state); + cpumask_and(&req->mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); +} - perf_pmu_enable(pmu); +/* Release function is also called when the application is terminated without + * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. + */ +static int cfset_release(struct inode *inode, struct file *file) +{ + mutex_lock(&cfset_ctrset_mutex); + /* Open followed by close/exit has no private_data */ + if (file->private_data) { + cfset_all_stop(file->private_data); + cfset_session_del(file->private_data); + kfree(file->private_data); + file->private_data = NULL; + } + if (refcount_dec_and_test(&cfset_opencnt)) { /* Last close */ + on_each_cpu(cfset_release_cpu, NULL, 1); + cpum_cf_free(-1); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; } /* - * Commit the group events scheduling transaction. On success, the - * transaction is closed. On error, the transaction is kept open - * until cpumf_pmu_cancel_txn() is called. + * Open via /dev/hwctr device. Allocate all per CPU resources on the first + * open of the device. The last close releases all per CPU resources. + * Parallel perf_event_open system calls also use per CPU resources. + * These invocations are handled via reference counting on the per CPU data + * structures. 
*/ -static int cpumf_pmu_commit_txn(struct pmu *pmu) +static int cfset_open(struct inode *inode, struct file *file) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - u64 state; + int rc = 0; + + if (!perfmon_capable()) + return -EPERM; + file->private_data = NULL; + + mutex_lock(&cfset_ctrset_mutex); + if (!refcount_inc_not_zero(&cfset_opencnt)) { /* First open */ + rc = cpum_cf_alloc(-1); + if (!rc) { + cfset_session_init(); + refcount_set(&cfset_opencnt, 1); + } + } + mutex_unlock(&cfset_ctrset_mutex); - WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + /* nonseekable_open() never fails */ + return rc ?: nonseekable_open(inode, file); +} - if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) { - cpuhw->txn_flags = 0; - return 0; +static int cfset_all_start(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + .cpus_ack = ATOMIC_INIT(0), + }; + cpumask_var_t mask; + int rc = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); + if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + rc = -EIO; } + free_cpumask_var(mask); + return rc; +} - /* check if the updated state can be scheduled */ - state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); - state >>= CPUMF_LCCTL_ENABLE_SHIFT; - if ((state & cpuhw->info.auth_ctl) != state) - return -ENOENT; +/* Return the maximum required space for all possible CPUs in case one + * CPU will be onlined during the START, READ, STOP cycles. + * To find out the size of the counter sets, any one CPU will do. They + * all have the same counter sets. + */ +static size_t cfset_needspace(unsigned int sets) +{ + size_t bytes = 0; + int i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(sets & cpumf_ctr_ctl[i])) + continue; + bytes += cpum_cf_read_setsize(i) * sizeof(u64) + + sizeof(((struct s390_ctrset_setdata *)0)->set) + + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); + } + bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * + (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); + return bytes; +} + +static int cfset_all_copy(unsigned long arg, cpumask_t *mask) +{ + struct s390_ctrset_read __user *ctrset_read; + unsigned int cpu, cpus, rc = 0; + void __user *uptr; + + ctrset_read = (struct s390_ctrset_read __user *)arg; + uptr = ctrset_read->data; + for_each_cpu(cpu, mask) { + struct cpu_cf_events *cpuhw = get_cpu_cfhw(cpu); + struct s390_ctrset_cpudata __user *ctrset_cpudata; + + ctrset_cpudata = uptr; + rc = put_user(cpu, &ctrset_cpudata->cpu_nr); + rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); + rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, + cpuhw->used); + if (rc) { + rc = -EFAULT; + goto out; + } + uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; + cond_resched(); + } + cpus = cpumask_weight(mask); + if (put_user(cpus, &ctrset_read->no_cpus)) + rc = -EFAULT; +out: + return rc; +} - cpuhw->txn_flags = 0; - perf_pmu_enable(pmu); +static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, + int ctrset_size, size_t room) +{ + size_t need = 0; + int rc = -1; + + need = sizeof(*p) + sizeof(u64) * ctrset_size; + if (need <= room) { + p->set = cpumf_ctr_ctl[ctrset]; + p->no_cnts = ctrset_size; + rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); + if (rc == 3) /* Nothing stored */ + 
need = 0; + } + return need; +} + +/* Read all counter sets. */ +static void cfset_cpu_read(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int set, set_size; + size_t space; + + /* No data saved yet */ + cpuhw->used = 0; + cpuhw->sets = 0; + memset(cpuhw->data, 0, sizeof(cpuhw->data)); + + /* Scan the counter sets */ + for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { + struct s390_ctrset_setdata *sp = (void *)cpuhw->data + + cpuhw->used; + + if (!(p->sets & cpumf_ctr_ctl[set])) + continue; /* Counter set not in list */ + set_size = cpum_cf_read_setsize(set); + space = sizeof(cpuhw->data) - cpuhw->used; + space = cfset_cpuset_read(sp, set, set_size, space); + if (space) { + cpuhw->used += space; + cpuhw->sets += 1; + } + } +} + +static int cfset_all_read(unsigned long arg, struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p; + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + p.sets = req->ctrset; + cpumask_and(mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); + rc = cfset_all_copy(arg, mask); + free_cpumask_var(mask); + return rc; +} + +static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) +{ + int ret = -ENODATA; + + if (req && req->ctrset) + ret = cfset_all_read(arg, req); + return ret; +} + +static long cfset_ioctl_stop(struct file *file) +{ + struct cfset_request *req = file->private_data; + int ret = -ENXIO; + + if (req) { + cfset_all_stop(req); + cfset_session_del(req); + kfree(req); + file->private_data = NULL; + ret = 0; + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg, struct file *file) +{ + struct s390_ctrset_start __user *ustart; + struct s390_ctrset_start start; + struct cfset_request *preq; + void __user *umask; + unsigned int len; + int ret = 0; + size_t need; + + if (file->private_data) + return -EBUSY; + ustart = (struct s390_ctrset_start __user *)arg; + if (copy_from_user(&start, ustart, sizeof(start))) + return -EFAULT; + if (start.version != S390_HWCTR_START_VERSION) + return -EINVAL; + if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | + cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | + cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | + cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | + cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) + return -EINVAL; /* Invalid counter set */ + if (!start.counter_sets) + return -EINVAL; /* No counter set at all? */ + + preq = kzalloc(sizeof(*preq), GFP_KERNEL); + if (!preq) + return -ENOMEM; + cpumask_clear(&preq->mask); + len = min_t(u64, start.cpumask_len, cpumask_size()); + umask = (void __user *)start.cpumask; + if (copy_from_user(&preq->mask, umask, len)) { + kfree(preq); + return -EFAULT; + } + if (cpumask_empty(&preq->mask)) { + kfree(preq); + return -EINVAL; + } + need = cfset_needspace(start.counter_sets); + if (put_user(need, &ustart->data_bytes)) { + kfree(preq); + return -EFAULT; + } + preq->ctrset = start.counter_sets; + ret = cfset_all_start(preq); + if (!ret) { + cfset_session_add(preq); + file->private_data = preq; + } else { + kfree(preq); + } + return ret; +} + +/* Entry point to the /dev/hwctr device interface. + * The ioctl system call supports three subcommands: + * S390_HWCTR_START: Start the specified counter sets on a CPU list. The + * counter set keeps running until explicitly stopped. Returns the number + * of bytes needed to store the counter values. 
If another S390_HWCTR_START + * ioctl subcommand is called without a previous S390_HWCTR_STOP + * command on the same file descriptor, -EBUSY is returned. + * S390_HWCTR_READ: Read the counter set values from the specified CPU list + * given with the S390_HWCTR_START command. + * S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the + * previous S390_HWCTR_START subcommand. + */ +static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret; + + cpus_read_lock(); + mutex_lock(&cfset_ctrset_mutex); + switch (cmd) { + case S390_HWCTR_START: + ret = cfset_ioctl_start(arg, file); + break; + case S390_HWCTR_STOP: + ret = cfset_ioctl_stop(file); + break; + case S390_HWCTR_READ: + ret = cfset_ioctl_read(arg, file->private_data); + break; + default: + ret = -ENOTTY; + break; + } + mutex_unlock(&cfset_ctrset_mutex); + cpus_read_unlock(); + return ret; +} + +static const struct file_operations cfset_fops = { + .owner = THIS_MODULE, + .open = cfset_open, + .release = cfset_release, + .unlocked_ioctl = cfset_ioctl, + .compat_ioctl = cfset_ioctl, + .llseek = no_llseek +}; + +static struct miscdevice cfset_dev = { + .name = S390_HWCTR_DEVICE, + .minor = MISC_DYNAMIC_MINOR, + .fops = &cfset_fops, + .mode = 0666, +}; + +/* Hotplug add of a CPU. Scan through all active processes and add + * that CPU to the list of CPUs supplied with ioctl(..., START, ...). + */ +static int cfset_online_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; + + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &rp->mask); + } + } return 0; } -/* Performance monitoring unit for s390x */ -static struct pmu cpumf_pmu = { +/* Hotplug remove of a CPU. Scan through all active processes and clear + * that CPU from the list of CPUs supplied with ioctl(..., START, ...). + * Adjust reference counts. + */ +static int cfset_offline_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; + + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &rp->mask); + } + } + return 0; +} + +static void cfdiag_read(struct perf_event *event) +{ +} + +static int get_authctrsets(void) +{ + unsigned long auth = 0; + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i]) + auth |= cpumf_ctr_ctl[i]; + } + return auth; +} + +/* Setup the event. Test for authorized counter sets and only include counter + * sets which are authorized at the time of the setup. Including unauthorized + * counter sets results in a specification exception (and panic). + */ +static int cfdiag_event_init2(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = 0; + + /* Set sample_period to indicate sampling */ + event->hw.config = attr->config; + event->hw.sample_period = attr->sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + local64_set(&event->count, 0); + event->hw.last_period = event->hw.sample_period; + + /* Add all authorized counter sets to config_base. The + * hardware init function is either called per-cpu or just once + * for all CPUs (event->cpu == -1).
This depends on whether + * counting is started for all CPUs or on a per-workload basis where + * the perf event moves from one CPU to another. + * Checking the authorization on any CPU is fine as the hardware + * applies the same authorization settings to all CPUs. + */ + event->hw.config_base = get_authctrsets(); + + /* No authorized counter sets, nothing to count/sample */ + if (!event->hw.config_base) + err = -EINVAL; + + return err; +} + +static int cfdiag_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = -ENOENT; + + if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || + event->attr.type != event->pmu->type) + goto out; + + /* Raw events are used to access counters directly, + * hence do not permit excludes. + * This event is useless without PERF_SAMPLE_RAW to return counter set + * values as raw data. + */ + if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || + !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { + err = -EOPNOTSUPP; + goto out; + } + + /* Initialize for using the CPU-measurement counter facility */ + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; + event->destroy = hw_perf_event_destroy; + + err = cfdiag_event_init2(event); + if (unlikely(err)) + event->destroy(event); +out: + return err; +} + +/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used + * to collect the complete counter sets for a scheduled process. The + * complete counter sets are attached as raw data to the artificial event. + * This results in complete counter sets being available when a process is + * scheduled. The data contains the delta of every counter while the + * process was running. + */ +CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); + +static struct attribute *cfdiag_events_attr[] = { + CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), + NULL, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cfdiag_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cfdiag_events_group = { + .name = "events", + .attrs = cfdiag_events_attr, +}; +static struct attribute_group cfdiag_format_group = { + .name = "format", + .attrs = cfdiag_format_attr, +}; +static const struct attribute_group *cfdiag_attr_groups[] = { + &cfdiag_events_group, + &cfdiag_format_group, + NULL, +}; + +/* Performance monitoring unit for event CF_DIAG. Since this event + * is also started and stopped via the perf_event_open() system call, use + * the same event enable/disable callback functions. They do not + * have a pointer to the perf_event structure as first parameter. + * + * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. + * Reuse them and distinguish the event (always the first parameter) via + * the 'config' member. + */ +static struct pmu cf_diag = { .task_ctx_nr = perf_sw_context, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .event_init = cfdiag_event_init, .pmu_enable = cpumf_pmu_enable, .pmu_disable = cpumf_pmu_disable, - .event_init = cpumf_pmu_event_init, .add = cpumf_pmu_add, .del = cpumf_pmu_del, .start = cpumf_pmu_start, .stop = cpumf_pmu_stop, - .read = cpumf_pmu_read, - .start_txn = cpumf_pmu_start_txn, - .commit_txn = cpumf_pmu_commit_txn, - .cancel_txn = cpumf_pmu_cancel_txn, + .read = cfdiag_read, + + .attr_groups = cfdiag_attr_groups }; -static int __init cpumf_pmu_init(void) +/* Calculate memory needed to store all counter sets together with header and + * trailer data.
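+ * For example, assuming cfvn >= 3 and csvn == 6, the five counter sets + * hold 6 + 2 + 20 + 160 + 48 = 236 counters (using the set sizes from + * the removed cf_diag_ctrset_size() table shown further down). The + * worst case is then 236 * 8 bytes of counter data plus 5 * 8 bytes of + * set headers plus the 64 byte trailer, i.e. 1992 bytes, which fits + * into one 4KB page.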
The calculation is independent of the counter set authorization, which + * can vary depending on the configuration. + */ +static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) +{ + size_t max_size = sizeof(struct cf_trailer_entry); + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + size_t size = cpum_cf_read_setsize(i); + + if (size) + max_size += size * sizeof(u64) + + sizeof(struct cf_ctrset_entry); + } + return max_size; +} + +/* Get the CPU speed, try sampling facility first and CPU attributes second. */ +static void cfdiag_get_cpu_speed(void) +{ + unsigned long mhz; + + if (cpum_sf_avail()) { /* Sampling facility first */ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) { + cfdiag_cpu_speed = si.cpu_speed; + return; + } + } + + /* Fallback: extract the static part of the CPU speed. Used in case + * the CPU Measurement Sampling Facility is turned off. + */ + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; +} + +static int cfset_init(void) { + size_t need; int rc; - if (!kernel_cpumcf_avail()) - return -ENODEV; + cfdiag_get_cpu_speed(); + /* Make sure the counter set data fits into predefined buffer. */ + need = cfdiag_maxsize(&cpumf_ctr_info); + if (need > sizeof(((struct cpu_cf_events *)0)->start)) { + pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", + need); + return -ENOMEM; + } - cpumf_pmu.attr_groups = cpumf_cf_event_group(); - rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); - if (rc) - pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + rc = misc_register(&cfset_dev); + if (rc) { + pr_err("Registration of /dev/%s failed rc=%i\n", + cfset_dev.name, rc); + goto out; + } + + rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); + if (rc) { + misc_deregister(&cfset_dev); + pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", + rc); + } +out: + return rc; } -subsys_initcall(cpumf_pmu_init); + +device_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c deleted file mode 100644 index 3bced89caffb..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ /dev/null @@ -1,201 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * CPU-Measurement Counter Facility Support - Common Layer - * - * Copyright IBM Corp.
2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_common" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> - -/* Per-CPU event structure for the counter facility */ -DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { - .ctr_set = { - [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0), - }, - .alert = ATOMIC64_INIT(0), - .state = 0, - .flags = 0, - .txn_flags = 0, -}; -/* Indicator whether the CPU-Measurement Counter Facility Support is ready */ -static bool cpum_cf_initalized; - -/* CPU-measurement alerts for the counter facility */ -static void cpumf_measurement_alert(struct ext_code ext_code, - unsigned int alert, unsigned long unused) -{ - struct cpu_cf_events *cpuhw; - - if (!(alert & CPU_MF_INT_CF_MASK)) - return; - - inc_irq_stat(IRQEXT_CMC); - cpuhw = this_cpu_ptr(&cpu_cf_events); - - /* Measurement alerts are shared and might happen when the PMU - * is not reserved. Ignore these alerts in this case. */ - if (!(cpuhw->flags & PMU_F_RESERVED)) - return; - - /* counter authorization change alert */ - if (alert & CPU_MF_INT_CF_CACA) - qctri(&cpuhw->info); - - /* loss of counter data alert */ - if (alert & CPU_MF_INT_CF_LCDA) - pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); - - /* loss of MT counter data alert */ - if (alert & CPU_MF_INT_CF_MTDA) - pr_warn("CPU[%i] MT counter data was lost\n", - smp_processor_id()); - - /* store alert for special handling by in-kernel users */ - atomic64_or(alert, &cpuhw->alert); -} - -#define PMC_INIT 0 -#define PMC_RELEASE 1 -static void cpum_cf_setup_cpu(void *flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - switch (*((int *) flags)) { - case PMC_INIT: - memset(&cpuhw->info, 0, sizeof(cpuhw->info)); - qctri(&cpuhw->info); - cpuhw->flags |= PMU_F_RESERVED; - break; - - case PMC_RELEASE: - cpuhw->flags &= ~PMU_F_RESERVED; - break; - } - - /* Disable CPU counter sets */ - lcctl(0); -} - -bool kernel_cpumcf_avail(void) -{ - return cpum_cf_initalized; -} -EXPORT_SYMBOL(kernel_cpumcf_avail); - - -/* Reserve/release functions for sharing perf hardware */ -static DEFINE_SPINLOCK(cpumcf_owner_lock); -static void *cpumcf_owner; - -/* Initialize the CPU-measurement counter facility */ -int __kernel_cpumcf_begin(void) -{ - int flags = PMC_INIT; - int err = 0; - - spin_lock(&cpumcf_owner_lock); - if (cpumcf_owner) - err = -EBUSY; - else - cpumcf_owner = __builtin_return_address(0); - spin_unlock(&cpumcf_owner_lock); - if (err) - return err; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; -} -EXPORT_SYMBOL(__kernel_cpumcf_begin); - -/* Obtain the CPU-measurement alerts for the counter facility */ -unsigned long kernel_cpumcf_alert(int clear) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - unsigned long alert; - - alert = atomic64_read(&cpuhw->alert); - if (clear) - atomic64_set(&cpuhw->alert, 0); - - return alert; -} -EXPORT_SYMBOL(kernel_cpumcf_alert); - -/* Release the CPU-measurement counter facility */ -void __kernel_cpumcf_end(void) -{ - int flags = PMC_RELEASE; - - 
on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - spin_lock(&cpumcf_owner_lock); - cpumcf_owner = NULL; - spin_unlock(&cpumcf_owner_lock); -} -EXPORT_SYMBOL(__kernel_cpumcf_end); - -static int cpum_cf_setup(unsigned int cpu, int flags) -{ - local_irq_disable(); - cpum_cf_setup_cpu(&flags); - local_irq_enable(); - return 0; -} - -static int cpum_cf_online_cpu(unsigned int cpu) -{ - return cpum_cf_setup(cpu, PMC_INIT); -} - -static int cpum_cf_offline_cpu(unsigned int cpu) -{ - return cpum_cf_setup(cpu, PMC_RELEASE); -} - -static int __init cpum_cf_init(void) -{ - int rc; - - if (!cpum_cf_avail()) - return -ENODEV; - - /* clear bit 15 of cr0 to unauthorize problem-state to - * extract measurement counters */ - ctl_clear_bit(0, 48); - - /* register handler for measurement-alert interruptions */ - rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, - cpumf_measurement_alert); - if (rc) { - pr_err("Registering for CPU-measurement alerts " - "failed with rc=%i\n", rc); - return rc; - } - - rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, - "perf/s390/cf:online", - cpum_cf_online_cpu, cpum_cf_offline_cpu); - if (!rc) - cpum_cf_initalized = true; - - return rc; -} -early_initcall(cpum_cf_init); diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c deleted file mode 100644 index e949ab832ed7..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ /dev/null @@ -1,705 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support for s390x - CPU-measurement Counter Sets - * - * Copyright IBM Corp. 2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - * Thomas Richer <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_diag" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/processor.h> - -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> -#include <asm/timex.h> -#include <asm/debug.h> - -#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ - -static unsigned int cf_diag_cpu_speed; -static debug_info_t *cf_diag_dbg; - -struct cf_diag_csd { /* Counter set data per CPU */ - size_t used; /* Bytes used in data/start */ - unsigned char start[PAGE_SIZE]; /* Counter set at event start */ - unsigned char data[PAGE_SIZE]; /* Counter set at event delete */ -}; -static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd); - -/* Counter sets are stored as data stream in a page sized memory buffer and - * exported to user space via raw data attached to the event sample data. - * Each counter set starts with an eight byte header consisting of: - * - a two byte eye catcher (0xfeef) - * - a one byte counter set number - * - a two byte counter set size (indicates the number of counters in this set) - * - a three byte reserved value (must be zero) to make the header the same - * size as a counter value. - * All counter values are eight byte in size. - * - * All counter sets are followed by a 64 byte trailer. - * The trailer consists of a: - * - flag field indicating valid fields when corresponding bit set - * - the counter facility first and second version number - * - the CPU speed if nonzero - * - the time stamp the counter sets have been collected - * - the time of day (TOD) base value - * - the machine type. 
- * - * The counter sets are saved when the process is prepared to be executed on a - * CPU and saved again when the process is going to be removed from a CPU. - * The difference of both counter sets are calculated and stored in the event - * sample data area. - */ - -struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ - unsigned int def:16; /* 0-15 Data Entry Format */ - unsigned int set:16; /* 16-31 Counter set identifier */ - unsigned int ctr:16; /* 32-47 Number of stored counters */ - unsigned int res1:16; /* 48-63 Reserved */ -}; - -struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ - /* 0 - 7 */ - union { - struct { - unsigned int clock_base:1; /* TOD clock base set */ - unsigned int speed:1; /* CPU speed set */ - /* Measurement alerts */ - unsigned int mtda:1; /* Loss of MT ctr. data alert */ - unsigned int caca:1; /* Counter auth. change alert */ - unsigned int lcda:1; /* Loss of counter data alert */ - }; - unsigned long flags; /* 0-63 All indicators */ - }; - /* 8 - 15 */ - unsigned int cfvn:16; /* 64-79 Ctr First Version */ - unsigned int csvn:16; /* 80-95 Ctr Second Version */ - unsigned int cpu_speed:32; /* 96-127 CPU speed */ - /* 16 - 23 */ - unsigned long timestamp; /* 128-191 Timestamp (TOD) */ - /* 24 - 55 */ - union { - struct { - unsigned long progusage1; - unsigned long progusage2; - unsigned long progusage3; - unsigned long tod_base; - }; - unsigned long progusage[4]; - }; - /* 56 - 63 */ - unsigned int mach_type:16; /* Machine type */ - unsigned int res1:16; /* Reserved */ - unsigned int res2:32; /* Reserved */ -}; - -/* Create the trailer data at the end of a page. */ -static void cf_diag_trailer(struct cf_trailer_entry *te) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cpuid cpuid; - - te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ - te->csvn = cpuhw->info.csvn; - - get_cpu_id(&cpuid); /* Machine type */ - te->mach_type = cpuid.machine; - te->cpu_speed = cf_diag_cpu_speed; - if (te->cpu_speed) - te->speed = 1; - te->clock_base = 1; /* Save clock base */ - memcpy(&te->tod_base, &tod_clock_base[1], 8); - store_tod_clock((__u64 *)&te->timestamp); -} - -/* - * Change the CPUMF state to active. - * Enable and activate the CPU-counter sets according - * to the per-cpu control state. - */ -static void cf_diag_enable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (cpuhw->flags & PMU_F_ENABLED) - return; - - err = lcctl(cpuhw->state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags |= PMU_F_ENABLED; -} - -/* - * Change the CPUMF state to inactive. - * Disable and enable (inactive) the CPU-counter sets according - * to the per-cpu control state. 
- */ -static void cf_diag_disable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - u64 inactive; - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (!(cpuhw->flags & PMU_F_ENABLED)) - return; - - inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); - err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags &= ~PMU_F_ENABLED; -} - -/* Number of perf events counting hardware events */ -static atomic_t cf_diag_events = ATOMIC_INIT(0); - -/* Release the PMU if event is the last perf event */ -static void cf_diag_perf_event_destroy(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d cf_diag_events %d\n", - __func__, event, event->cpu, - atomic_read(&cf_diag_events)); - if (atomic_dec_return(&cf_diag_events) == 0) - __kernel_cpumcf_end(); -} - -/* Setup the event. Test for authorized counter sets and only include counter - * sets which are authorized at the time of the setup. Including unauthorized - * counter sets result in specification exception (and panic). - */ -static int __hw_perf_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - struct cpu_cf_events *cpuhw; - enum cpumf_ctr_set i; - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__, - event, event->cpu); - - event->hw.config = attr->config; - event->hw.config_base = 0; - - /* Add all authorized counter sets to config_base. The - * the hardware init function is either called per-cpu or just once - * for all CPUS (event->cpu == -1). This depends on the whether - * counting is started for all CPUs or on a per workload base where - * the perf event moves from one CPU to another CPU. - * Checking the authorization on any CPU is fine as the hardware - * applies the same authorization settings to all CPUs. - */ - cpuhw = &get_cpu_var(cpu_cf_events); - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) - event->hw.config_base |= cpumf_ctr_ctl[i]; - put_cpu_var(cpu_cf_events); - - /* No authorized counter sets, nothing to count/sample */ - if (!event->hw.config_base) { - err = -EINVAL; - goto out; - } - - /* Set sample_period to indicate sampling */ - event->hw.sample_period = attr->sample_period; - local64_set(&event->hw.period_left, event->hw.sample_period); - event->hw.last_period = event->hw.sample_period; -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n", - __func__, err, event->hw.config_base); - return err; -} - -static int cf_diag_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = -ENOENT; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d config %#llx type:%u " - "sample_type %#llx cf_diag_events %d\n", __func__, - event, event->cpu, attr->config, event->pmu->type, - attr->sample_type, atomic_read(&cf_diag_events)); - - if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || - event->attr.type != event->pmu->type) - goto out; - - /* Raw events are used to access counters directly, - * hence do not permit excludes. - * This event is usesless without PERF_SAMPLE_RAW to return counter set - * values as raw data. 
- */ - if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || - !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { - err = -EOPNOTSUPP; - goto out; - } - - /* Initialize for using the CPU-measurement counter facility */ - if (atomic_inc_return(&cf_diag_events) == 1) { - if (__kernel_cpumcf_begin()) { - atomic_dec(&cf_diag_events); - err = -EBUSY; - goto out; - } - } - event->destroy = cf_diag_perf_event_destroy; - - err = __hw_perf_event_init(event); - if (unlikely(err)) - event->destroy(event); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_read(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event); -} - -/* Return the maximum possible counter set size (in number of 8 byte counters) - * depending on type and model number. - */ -static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info) -{ - size_t ctrset_size = 0; - - switch (ctrset) { - case CPUMF_CTR_SET_BASIC: - if (info->cfvn >= 1) - ctrset_size = 6; - break; - case CPUMF_CTR_SET_USER: - if (info->cfvn == 1) - ctrset_size = 6; - else if (info->cfvn >= 3) - ctrset_size = 2; - break; - case CPUMF_CTR_SET_CRYPTO: - if (info->csvn >= 1 && info->csvn <= 5) - ctrset_size = 16; - else if (info->csvn == 6) - ctrset_size = 20; - break; - case CPUMF_CTR_SET_EXT: - if (info->csvn == 1) - ctrset_size = 32; - else if (info->csvn == 2) - ctrset_size = 48; - else if (info->csvn >= 3 && info->csvn <= 5) - ctrset_size = 128; - else if (info->csvn == 6) - ctrset_size = 160; - break; - case CPUMF_CTR_SET_MT_DIAG: - if (info->csvn > 3) - ctrset_size = 48; - break; - case CPUMF_CTR_SET_MAX: - break; - } - - return ctrset_size; -} - -/* Calculate memory needed to store all counter sets together with header and - * trailer data. This is independend of the counter set authorization which - * can vary depending on the configuration. - */ -static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info) -{ - size_t max_size = sizeof(struct cf_trailer_entry); - enum cpumf_ctr_set i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cf_diag_ctrset_size(i, info); - - if (size) - max_size += size * sizeof(u64) + - sizeof(struct cf_ctrset_entry); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__, - max_size); - - return max_size; -} - -/* Read a counter set. The counter set number determines which counter set and - * the CPUM-CF first and second version number determine the number of - * available counters in this counter set. - * Each counter set starts with header containing the counter set number and - * the number of 8 byte counters. - * - * The functions returns the number of bytes occupied by this counter set - * including the header. - * If there is no counter in the counter set, this counter set is useless and - * zero is returned on this case. 
- */ -static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, - size_t room) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - size_t ctrset_size, need = 0; - int rc = 3; /* Assume write failure */ - - ctrdata->def = CF_DIAG_CTRSET_DEF; - ctrdata->set = ctrset; - ctrdata->res1 = 0; - ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info); - - if (ctrset_size) { /* Save data */ - need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); - if (need <= room) - rc = ctr_stcctm(ctrset, ctrset_size, - (u64 *)(ctrdata + 1)); - if (rc != 3) - ctrdata->ctr = ctrset_size; - else - need = 0; - } - - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc %d\n", - __func__, ctrset, ctrset_size, cpuhw->info.cfvn, - cpuhw->info.csvn, need, rc); - return need; -} - -/* Read out all counter sets and save them in the provided data buffer. - * The last 64 byte host an artificial trailer entry. - */ -static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth) -{ - struct cf_trailer_entry *trailer; - size_t offset = 0, done; - int i; - - memset(data, 0, sz); - sz -= sizeof(*trailer); /* Always room for trailer */ - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - struct cf_ctrset_entry *ctrdata = data + offset; - - if (!(auth & cpumf_ctr_ctl[i])) - continue; /* Counter set not authorized */ - - done = cf_diag_getctrset(ctrdata, i, sz - offset); - offset += done; - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d offset %zu done %zu\n", - __func__, i, offset, done); - } - trailer = data + offset; - cf_diag_trailer(trailer); - return offset + sizeof(*trailer); -} - -/* Calculate the difference for each counter in a counter set. */ -static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters) -{ - for (; --counters >= 0; ++pstart, ++pstop) - if (*pstop >= *pstart) - *pstop -= *pstart; - else - *pstop = *pstart - *pstop; -} - -/* Scan the counter sets and calculate the difference of each counter - * in each set. The result is the increment of each counter during the - * period the counter set has been activated. - * - * Return true on success. - */ -static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth) -{ - struct cf_trailer_entry *trailer_start, *trailer_stop; - struct cf_ctrset_entry *ctrstart, *ctrstop; - size_t offset = 0; - - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { - ctrstart = (struct cf_ctrset_entry *)(csd->start + offset); - ctrstop = (struct cf_ctrset_entry *)(csd->data + offset); - - if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { - pr_err("cpum_cf_diag counter set compare error " - "in set %i\n", ctrstart->set); - return 0; - } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; - if (ctrstart->def == CF_DIAG_CTRSET_DEF) { - cf_diag_diffctrset((u64 *)(ctrstart + 1), - (u64 *)(ctrstop + 1), ctrstart->ctr); - offset += ctrstart->ctr * sizeof(u64) + - sizeof(*ctrstart); - } - debug_sprintf_event(cf_diag_dbg, 6, - "%s set %d ctr %d offset %zu auth %lx\n", - __func__, ctrstart->set, ctrstart->ctr, - offset, auth); - } while (ctrstart->def && auth); - - /* Save time_stamp from start of event in stop's trailer */ - trailer_start = (struct cf_trailer_entry *)(csd->start + offset); - trailer_stop = (struct cf_trailer_entry *)(csd->data + offset); - trailer_stop->progusage[0] = trailer_start->timestamp; - - return 1; -} - -/* Create perf event sample with the counter sets as raw data. 
The sample - * is then pushed to the event subsystem and the function checks for - * possible event overflows. If an event overflow occurs, the PMU is - * stopped. - * - * Return non-zero if an event overflow occurred. - */ -static int cf_diag_push_sample(struct perf_event *event, - struct cf_diag_csd *csd) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - perf_sample_data_init(&data, 0, event->hw.last_period); - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - - if (event->attr.sample_type & PERF_SAMPLE_CPU) - data.cpu_entry.cpu = event->cpu; - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = csd->used; - raw.frag.data = csd->data; - raw.size = csd->used; - data.raw = &raw; - } - - overflow = perf_event_overflow(event, &data, ®s); - debug_sprintf_event(cf_diag_dbg, 6, - "%s event %p cpu %d sample_type %#llx raw %d " - "ov %d\n", __func__, event, event->cpu, - event->attr.sample_type, raw.size, overflow); - if (overflow) - event->pmu->stop(event, 0); - - perf_event_update_userpage(event); - return overflow; -} - -static void cf_diag_start(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - /* (Re-)enable and activate all counter sets */ - lcctl(0); /* Reset counter sets */ - hwc->state = 0; - ctr_set_multiple_enable(&cpuhw->state, hwc->config_base); - lcctl(cpuhw->state); /* Enable counter sets */ - csd->used = cf_diag_getctr(csd->start, sizeof(csd->start), - event->hw.config_base); - ctr_set_multiple_start(&cpuhw->state, hwc->config_base); - /* Function cf_diag_enable() starts the counter sets. 
*/ -} - -static void cf_diag_stop(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - - /* Deactivate all counter sets */ - ctr_set_multiple_stop(&cpuhw->state, hwc->config_base); - local64_inc(&event->count); - csd->used = cf_diag_getctr(csd->data, sizeof(csd->data), - event->hw.config_base); - if (cf_diag_diffctr(csd, event->hw.config_base)) - cf_diag_push_sample(event, csd); - hwc->state |= PERF_HES_STOPPED; -} - -static int cf_diag_add(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x cpuhw %p\n", - __func__, event, event->cpu, flags, cpuhw); - - if (cpuhw->flags & PMU_F_IN_USE) { - err = -EAGAIN; - goto out; - } - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - cpuhw->flags |= PMU_F_IN_USE; - if (flags & PERF_EF_START) - cf_diag_start(event, PERF_EF_RELOAD); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_del(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x\n", - __func__, event, event->cpu, flags); - - cf_diag_stop(event, PERF_EF_UPDATE); - ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base); - ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base); - cpuhw->flags &= ~PMU_F_IN_USE; -} - -CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); - -static struct attribute *cf_diag_events_attr[] = { - CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), - NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *cf_diag_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group cf_diag_events_group = { - .name = "events", - .attrs = cf_diag_events_attr, -}; -static struct attribute_group cf_diag_format_group = { - .name = "format", - .attrs = cf_diag_format_attr, -}; -static const struct attribute_group *cf_diag_attr_groups[] = { - &cf_diag_events_group, - &cf_diag_format_group, - NULL, -}; - -/* Performance monitoring unit for s390x */ -static struct pmu cf_diag = { - .task_ctx_nr = perf_sw_context, - .pmu_enable = cf_diag_enable, - .pmu_disable = cf_diag_disable, - .event_init = cf_diag_event_init, - .add = cf_diag_add, - .del = cf_diag_del, - .start = cf_diag_start, - .stop = cf_diag_stop, - .read = cf_diag_read, - - .attr_groups = cf_diag_attr_groups -}; - -/* Get the CPU speed, try sampling facility first and CPU attributes second. */ -static void cf_diag_get_cpu_speed(void) -{ - if (cpum_sf_avail()) { /* Sampling facility first */ - struct hws_qsi_info_block si; - - memset(&si, 0, sizeof(si)); - if (!qsi(&si)) { - cf_diag_cpu_speed = si.cpu_speed; - return; - } - } - - if (test_facility(34)) { /* CPU speed extract static part */ - unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); - - if (mhz != -1UL) - cf_diag_cpu_speed = mhz & 0xffffffff; - } -} - -/* Initialize the counter set PMU to generate complete counter set data as - * event raw data. This relies on the CPU Measurement Counter Facility device - * already being loaded and initialized. 
- */ -static int __init cf_diag_init(void) -{ - struct cpumf_ctr_info info; - size_t need; - int rc; - - if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info)) - return -ENODEV; - cf_diag_get_cpu_speed(); - - /* Make sure the counter set data fits into predefined buffer. */ - need = cf_diag_ctrset_maxsize(&info); - if (need > sizeof(((struct cf_diag_csd *)0)->start)) { - pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", - need); - return -ENOMEM; - } - - /* Setup s390dbf facility */ - cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); - if (!cf_diag_dbg) { - pr_err("Registration of s390dbf(cpum_cf_diag) failed\n"); - return -ENOMEM; - } - debug_register_view(cf_diag_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); - if (rc) { - debug_unregister_view(cf_diag_dbg, &debug_sprintf_view); - debug_unregister(cf_diag_dbg); - pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", - rc); - } - return rc; -} -arch_initcall(cf_diag_init); diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 8b33e03e47b8..0d64aafd158f 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -238,6 +238,134 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); 
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 
0x00ab); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7); +CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); + static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS), @@ -286,7 +414,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -516,6 +644,141 @@ static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z15, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, 
L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z15, DFLT_CC), + CPUMF_EVENT_PTR(cf_z15, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z16, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV), + 
CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z16, SORTL), + CPUMF_EVENT_PTR(cf_z16, DFLT_CC), + CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -596,8 +859,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 1 ... 5: csvn = cpumcf_svn_12345_pmu_event_attr; break; - case 6: - csvn = cpumcf_svn_6_pmu_event_attr; + case 6 ... 7: + csvn = cpumcf_svn_67_pmu_event_attr; break; default: csvn = none; @@ -624,9 +887,15 @@ __init const struct attribute_group **cpumf_cf_event_group(void) break; case 0x3906: case 0x3907: + model = cpumcf_z14_pmu_event_attr; + break; case 0x8561: case 0x8562: - model = cpumcf_z14_pmu_event_attr; + model = cpumcf_z15_pmu_event_attr; + break; + case 0x3931: + case 0x3932: + model = cpumcf_z16_pmu_event_attr; break; default: model = none; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index c07fdcd73726..06efad5b4f93 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -22,6 +22,7 @@ #include <asm/irq.h> #include <asm/debug.h> #include <asm/timex.h> +#include <linux/io.h> /* Minimum number of sample-data-block-tables: * At least one table is required for the sampling buffer structure. 
@@ -42,7 +43,7 @@ #define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8) static inline int require_table_link(const void *sdbt) { - return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; + return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; } /* Minimum and maximum sampling buffer sizes: @@ -99,6 +100,57 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); /* Debug feature */ static debug_info_t *sfdbg; +/* Sampling control helper functions */ +static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, + unsigned long freq) +{ + return (USEC_PER_SEC / freq) * qsi->cpu_speed; +} + +static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, + unsigned long rate) +{ + return USEC_PER_SEC * qsi->cpu_speed / rate; +} + +/* Return TOD timestamp contained in a trailer entry */ +static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) +{ + /* TOD in STCKE format */ + if (te->header.t) + return *((unsigned long long *)&te->timestamp[1]); + + /* TOD in STCK format */ + return *((unsigned long long *)&te->timestamp[0]); +} + +/* Return pointer to trailer entry of a sample data block */ +static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) +{ + void *ret; + + ret = (void *)v; + ret += PAGE_SIZE; + ret -= sizeof(struct hws_trailer_entry); + + return ret; +} + +/* + * Return true if the entry in the sample data block table (sdbt) + * is a link to the next sdbt + */ +static inline int is_link_entry(unsigned long *s) +{ + return *s & 0x1UL ? 1 : 0; +} + +/* Return pointer to the linked sdbt */ +static inline unsigned long *get_next_sdbt(unsigned long *s) +{ + return phys_to_virt(*s & ~0x1UL); +} + /* * sf_disable() - Switch off sampling facility */ @@ -140,7 +192,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb) if (is_link_entry(curr)) { curr = get_next_sdbt(curr); if (sdbt) - free_page((unsigned long) sdbt); + free_page((unsigned long)sdbt); /* If the origin is reached, sampling buffer is freed */ if (curr == sfb->sdbt) @@ -150,7 +202,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb) } else { /* Process SDB pointer */ if (*curr) { - free_page(*curr); + free_page((unsigned long)phys_to_virt(*curr)); curr++; } } @@ -163,17 +215,18 @@ static void free_sampling_buffer(struct sf_buffer *sfb) static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags) { - unsigned long sdb, *trailer; + struct hws_trailer_entry *te; + unsigned long sdb; /* Allocate and initialize sample-data-block */ sdb = get_zeroed_page(gfp_flags); if (!sdb) return -ENOMEM; - trailer = trailer_entry_ptr(sdb); - *trailer = SDB_TE_ALERT_REQ_MASK; + te = trailer_entry_ptr(sdb); + te->header.a = 1; /* Link SDB into the sample-data-block-table */ - *sdbt = sdb; + *sdbt = virt_to_phys((void *)sdb); return 0; } @@ -225,14 +278,14 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, for (i = 0; i < num_sdb; i++) { /* Allocate a new SDB-table if it is full. 
*/ if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(gfp_flags); + new = (unsigned long *)get_zeroed_page(gfp_flags); if (!new) { rc = -ENOMEM; break; } sfb->num_sdbt++; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys((void *)new) + 1; tail_prev = tail; tail = new; } @@ -251,7 +304,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, */ if (tail_prev) { sfb->num_sdbt--; - free_page((unsigned long) new); + free_page((unsigned long)new); tail = tail_prev; } break; @@ -262,7 +315,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, } /* Link sampling buffer to its origin */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; debug_sprintf_event(sfdbg, 4, "%s: new buffer" @@ -290,7 +343,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) return -EINVAL; /* Allocate the sample-data-block-table origin */ - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) return -ENOMEM; sfb->num_sdb = 0; @@ -300,7 +353,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) * realloc_sampling_buffer() invocation. */ sfb->tail = sfb->sdbt; - *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1; + *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1; /* Allocate requested number of sample-data-blocks */ rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); @@ -372,28 +425,33 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw) static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { - unsigned long n_sdb, freq, factor; + unsigned long n_sdb, freq; size_t sample_size; /* Calculate sampling buffers using 4K pages * - * 1. Determine the sample data size which depends on the used - * sampling functions, for example, basic-sampling or - * basic-sampling with diagnostic-sampling. + * 1. The sampling size is 32 bytes for basic sampling. This size + * is the same for all machine types. Diagnostic + * sampling uses the auxiliary data buffer setup which provides the + * memory for SDBs using Linux common code auxiliary trace + * setup. * - * 2. Use the sampling frequency as input. The sampling buffer is - * designed for almost one second. This can be adjusted through - * the "factor" variable. - * In any case, alloc_sampling_buffer() sets the Alert Request + * 2. Function alloc_sampling_buffer() sets the Alert Request * Control indicator to trigger a measurement-alert to harvest - * sample-data-blocks (sdb). + * sample-data-blocks (SDB). This is done per SDB. This + * measurement alert interrupt fires quickly enough to handle + * one SDB; on very high frequencies and workloads there might + * be 2 to 3 SDBs available for sample processing. + * Currently there is no need to set up an alert request on every + * n-th page. This would be counterproductive, as one IRQ would then + * trigger a very high number of samples to be processed at once. * - * 3. Compute the number of sample-data-blocks and ensure a minimum - * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not - * exceed a "calculated" maximum. The symbolic maximum is - * designed for basic-sampling only and needs to be increased if - * diagnostic-sampling is active. - * See also the remarks for these symbolic constants. + * 3. Use the sampling frequency as input. + * Compute the number of SDBs and ensure a minimum + * of CPUM_SF_MIN_SDB. 
Depending on the frequency, add some more + * SDBs to handle a higher sampling rate. + * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples + * (one SDB) for every 10000 Hz frequency increment. * * 4. Compute the number of sample-data-block-tables (SDBT) and * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up @@ -401,10 +459,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) */ sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); - factor = 1; - n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size)); - if (n_sdb < CPUM_SF_MIN_SDB) - n_sdb = CPUM_SF_MIN_SDB; + n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); /* If there is already a sampling buffer allocated, it is very likely * that the sampling facility is enabled too. If the event to be @@ -539,11 +594,10 @@ static DEFINE_MUTEX(pmc_reserve_mutex); #define PMC_FAILURE 2 static void setup_pmc_cpu(void *flags) { - int err; struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); + int err = 0; - err = 0; - switch (*((int *) flags)) { + switch (*((int *)flags)) { case PMC_INIT: memset(cpusf, 0, sizeof(*cpusf)); err = qsi(&cpusf->qsi); @@ -551,28 +605,18 @@ static void setup_pmc_cpu(void *flags) break; cpusf->flags |= PMU_F_RESERVED; err = sf_disable(); - if (err) - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - debug_sprintf_event(sfdbg, 5, - "%s: initialized: cpuhw %p\n", __func__, - cpusf); break; case PMC_RELEASE: cpusf->flags &= ~PMU_F_RESERVED; err = sf_disable(); - if (err) { - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - } else + if (!err) deallocate_buffers(cpusf); - debug_sprintf_event(sfdbg, 5, - "%s: released: cpuhw %p\n", __func__, - cpusf); break; } - if (err) - *((int *) flags) |= PMC_FAILURE; + if (err) { + *((int *)flags) |= PMC_FAILURE; + pr_err("Switching off the sampling facility failed with rc %i\n", err); + } } static void release_pmc_hardware(void) @@ -669,8 +713,9 @@ static void cpumsf_output_event_pid(struct perf_event *event, /* Protect callchain buffers, tasks */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); + if (perf_output_begin(&handle, data, event, header.size)) goto out; /* Update the process ID (see also kernel/events/core.c) */ @@ -832,10 +877,6 @@ static int __hw_perf_event_init(struct perf_event *event) SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE; } - /* Check and set other sampling flags */ - if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) - SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; - err = __hw_perf_event_init_rate(event, &si); if (err) goto out; @@ -879,12 +920,21 @@ out: return err; } +static bool is_callchain_event(struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER | + PERF_SAMPLE_STACK_USER); +} + static int cpumsf_pmu_event_init(struct perf_event *event) { int err; /* No support for taken branch sampling */ - if (has_branch_stack(event)) + /* No support for callchain, stacks and registers */ + if (has_branch_stack(event) || is_callchain_event(event)) return -EOPNOTSUPP; switch (event->attr.type) { @@ -908,10 +958,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event) return -ENOENT; } - /* Check online status of the CPU to which the event is pinned */ - if 
(event->cpu >= 0 && !cpu_online(event->cpu)) - return -ENODEV; - /* Force reset of idle/hv excludes regardless of what the * user requested. */ @@ -971,8 +1017,7 @@ static void cpumsf_pmu_enable(struct pmu *pmu) err = lsctl(&cpuhw->lsctl); if (err) { cpuhw->flags &= ~PMU_F_ENABLED; - pr_err("Loading sampling controls failed: op %i err %i\n", - 1, err); + pr_err("Loading sampling controls failed: op 1 err %i\n", err); return; } @@ -1006,8 +1051,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) err = lsctl(&inactive); if (err) { - pr_err("Loading sampling controls failed: op %i err %i\n", - 2, err); + pr_err("Loading sampling controls failed: op 2 err %i\n", err); return; } @@ -1164,11 +1208,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, struct hws_trailer_entry *te; struct hws_basic_entry *sample; - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); - sample = (struct hws_basic_entry *) *sdbt; - while ((unsigned long *) sample < (unsigned long *) te) { + te = trailer_entry_ptr((unsigned long)sdbt); + sample = (struct hws_basic_entry *)sdbt; + while ((unsigned long *)sample < (unsigned long *)te) { /* Check for an empty sample */ - if (!sample->def) + if (!sample->def || sample->LS) break; /* Update perf event period */ @@ -1195,7 +1239,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, "%s: Found unknown" " sampling data entry: te->f %i" " basic.def %#4x (%p)\n", __func__, - te->f, sample->def, sample); + te->header.f, sample->def, sample); /* Sample slot is not yet written or other record. * * This condition can occur if the buffer was reused @@ -1206,7 +1250,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * that are not full. Stop processing if the first * invalid format was detected. */ - if (!te->f) + if (!te->header.f) break; } @@ -1224,18 +1268,16 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * The sampling buffer position are retrieved and saved in the TEAR_REG * register of the specified perf event. * - * Only full sample-data-blocks are processed. Specify the flash_all flag - * to also walk through partially filled sample-data-blocks. It is ignored - * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag - * enforces the processing of full sample-data-blocks only (trailer entries - * with the block-full-indicator bit set). + * Only full sample-data-blocks are processed. Specify the flush_all flag + * to also walk through partially filled sample-data-blocks. 
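The allocate_buffers() change above replaces the old factor-based sizing with a simple linear rule: n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000), i.e. one extra SDB (roughly 100 basic samples) per 10000 Hz of sampling frequency. A stand-alone sketch of the arithmetic (the value of CPUM_SF_MIN_SDB is an assumption for illustration):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define CPUM_SF_MIN_SDB		15	/* assumed minimum, see perf_cpum_sf.c */

int main(void)
{
	/* One extra SDB (about 100 basic samples) per 10000 Hz */
	for (unsigned long freq = 10000; freq <= 80000; freq *= 2)
		printf("freq %6lu Hz -> n_sdb %lu\n", freq,
		       CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000));
	return 0;
}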
*/ static void hw_perf_event_update(struct perf_event *event, int flush_all) { + unsigned long long event_overflow, sampl_overflow, num_sdb; + union hws_trailer_header old, prev, new; struct hw_perf_event *hwc = &event->hw; struct hws_trailer_entry *te; - unsigned long *sdbt; - unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags; + unsigned long *sdbt, sdb; int done; /* @@ -1245,50 +1287,52 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) if (SAMPL_DIAG_MODE(&event->hw)) return; - if (flush_all && SDB_FULL_BLOCKS(hwc)) - flush_all = 0; - - sdbt = (unsigned long *) TEAR_REG(hwc); + sdbt = (unsigned long *)TEAR_REG(hwc); done = event_overflow = sampl_overflow = num_sdb = 0; while (!done) { /* Get the trailer entry of the sample-data-block */ - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); + sdb = (unsigned long)phys_to_virt(*sdbt); + te = trailer_entry_ptr(sdb); /* Leave loop if no more work to do (block full indicator) */ - if (!te->f) { + if (!te->header.f) { done = 1; if (!flush_all) break; } /* Check the sample overflow count */ - if (te->overflow) + if (te->header.overflow) /* Account sample overflows and, if a particular limit * is reached, extend the sampling buffer. * For details, see sfb_account_overflows(). */ - sampl_overflow += te->overflow; + sampl_overflow += te->header.overflow; /* Timestamps are valid for full sample-data-blocks only */ - debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx " + debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx " "overflow %llu timestamp %#llx\n", - __func__, (unsigned long)sdbt, te->overflow, - (te->f) ? trailer_timestamp(te) : 0ULL); + __func__, sdb, (unsigned long)sdbt, + te->header.overflow, + (te->header.f) ? trailer_timestamp(te) : 0ULL); /* Collect all samples from a single sample-data-block and * flag if an (perf) event overflow happened. If so, the PMU * is stopped and remaining samples will be discarded. */ - hw_collect_samples(event, sdbt, &event_overflow); + hw_collect_samples(event, (unsigned long *)sdb, &event_overflow); num_sdb++; /* Reset trailer (using compare-double-and-swap) */ + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; - te_flags |= SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - te->flags, te->overflow, - te_flags, 0ULL)); + old.val = prev.val; + new.val = prev.val; + new.f = 0; + new.a = 1; + new.overflow = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); /* Advance to next sample-data-block */ sdbt++; @@ -1303,18 +1347,28 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) */ if (flush_all && done) break; - - /* If an event overflow happened, discard samples by - * processing any remaining sample-data-blocks. - */ - if (event_overflow) - flush_all = 1; } /* Account sample overflows in the event hardware structure */ if (sampl_overflow) OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) + sampl_overflow, 1 + num_sdb); + + /* perf_event_overflow() and perf_event_account_interrupt() limit + * the interrupt rate to an upper limit. Roughly 1000 samples per + * task tick. + * Hitting this limit results in a large number + * of throttled PERF_RECORD_THROTTLE entries and the samples + * are dropped. + * Slightly increase the interval to avoid hitting this limit. 
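The trailer reset above must not lose an overflow count or a full indicator that the hardware sets concurrently, hence the 16-byte compare-and-swap retry loop around cmpxchg128(). A generic sketch of the same pattern using the GCC/Clang __atomic builtins on __int128 (field names are illustrative, not the real hws_trailer_header; building it may need libatomic or -mcx16 depending on the target):

#include <stdbool.h>
#include <stdint.h>

union hdr {
	unsigned __int128 val;		/* whole 16-byte header */
	struct {
		uint64_t flags;		/* full/alert bits live here */
		uint64_t overflow;	/* lost-sample count */
	};
} __attribute__((aligned(16)));		/* 128-bit CAS needs 16-byte alignment */

/* Clear the full bit, re-arm the alert bit and zero the overflow count,
 * retrying until no concurrent update (e.g. by hardware) intervened. */
static void reset_trailer(union hdr *te, uint64_t full_bit, uint64_t alert_bit)
{
	union hdr old, new;

	old.val = __atomic_load_n(&te->val, __ATOMIC_RELAXED);
	do {
		new = old;
		new.flags &= ~full_bit;
		new.flags |= alert_bit;
		new.overflow = 0;
	} while (!__atomic_compare_exchange_n(&te->val, &old.val, new.val,
					      false, __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));
}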
+ */ + if (event_overflow) { + SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); + debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", + __func__, + DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); + } + if (sampl_overflow || event_overflow) debug_sprintf_event(sfdbg, 4, "%s: " "overflows: sample %llu event %llu" @@ -1323,10 +1377,26 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) OVERFLOW_REG(hwc), num_sdb); } -#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) -#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0) -#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) -#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) +static inline unsigned long aux_sdb_index(struct aux_buffer *aux, + unsigned long i) +{ + return i % aux->sfb.num_sdb; +} + +static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end) +{ + return end >= start ? end - start + 1 : 0; +} + +static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->alert_mark); +} + +static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->empty_mark); +} /* * Get trailer entry by index of SDB. @@ -1336,9 +1406,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, { unsigned long sdb; - index = AUX_SDB_INDEX(aux, index); + index = aux_sdb_index(aux, index); sdb = aux->sdb_index[index]; - return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + return trailer_entry_ptr(sdb); } /* @@ -1360,10 +1430,10 @@ static void aux_output_end(struct perf_output_handle *handle) if (!aux) return; - range_scan = AUX_SDB_NUM_ALERT(aux); + range_scan = aux_sdb_num_alert(aux); for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - if (!(te->flags & SDB_TE_BUFFER_FULL_MASK)) + if (!te->header.f) break; } /* i is num of SDBs which are full */ @@ -1371,9 +1441,10 @@ static void aux_output_end(struct perf_output_handle *handle) /* Remove alert indicators in the buffer */ te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags &= ~SDB_TE_ALERT_REQ_MASK; + te->header.a = 0; - debug_sprintf_event(sfdbg, 6, "%s: collect %#lx SDBs\n", __func__, i); + debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n", + __func__, i, range_scan, aux->head); } /* @@ -1389,9 +1460,7 @@ static int aux_output_begin(struct perf_output_handle *handle, struct aux_buffer *aux, struct cpu_hw_sf *cpuhw) { - unsigned long range; - unsigned long i, range_scan, idx; - unsigned long head, base, offset; + unsigned long range, i, range_scan, idx, head, base, offset; struct hws_trailer_entry *te; if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) @@ -1406,14 +1475,18 @@ static int aux_output_begin(struct perf_output_handle *handle, * SDBs between aux->head and aux->empty_mark are already ready * for new data. range_scan is num of SDBs not within them. 
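The aux_sdb_* helpers above describe the AUX buffer as a ring of SDBs tracked by three monotonically increasing marks; only the index into the ring wraps, the marks themselves never do. A small sketch of this bookkeeping, with the mark placement used by aux_output_begin():

#include <stdio.h>

/* Monotonic marks as in struct aux_buffer; indices wrap, marks do not. */
struct marks {
	unsigned long head;	   /* first SDB perf may still read */
	unsigned long alert_mark;  /* SDB whose trailer raises the IRQ */
	unsigned long empty_mark;  /* last SDB known to be empty */
	unsigned long num_sdb;	   /* ring size */
};

static unsigned long sdb_index(struct marks *m, unsigned long i)
{
	return i % m->num_sdb;				/* aux_sdb_index() */
}

static unsigned long sdb_num(unsigned long start, unsigned long end)
{
	return end >= start ? end - start + 1 : 0;	/* aux_sdb_num() */
}

int main(void)
{
	struct marks m = { .head = 250, .num_sdb = 64 };
	unsigned long range = 32;

	/* Mirrors aux_output_begin(): alert in the middle of the range */
	m.alert_mark = m.head + range / 2 - 1;
	m.empty_mark = m.head + range - 1;
	printf("head %lu -> ring slot %lu, alert %lu, empty %lu\n",
	       m.head, sdb_index(&m, m.head), m.alert_mark, m.empty_mark);
	printf("SDBs up to alert: %lu\n", sdb_num(m.head, m.alert_mark));
	return 0;
}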
*/ - if (range > AUX_SDB_NUM_EMPTY(aux)) { - range_scan = range - AUX_SDB_NUM_EMPTY(aux); + debug_sprintf_event(sfdbg, 6, + "%s: range %ld head %ld alert %ld empty %ld\n", + __func__, range, aux->head, aux->alert_mark, + aux->empty_mark); + if (range > aux_sdb_num_empty(aux)) { + range_scan = range - aux_sdb_num_empty(aux); idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; - te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK; - te->overflow = 0; + te->header.f = 0; + te->header.a = 0; + te->header.overflow = 0; } /* Save the position of empty SDBs */ aux->empty_mark = aux->head + range - 1; @@ -1422,24 +1495,20 @@ static int aux_output_begin(struct perf_output_handle *handle, /* Set alert indicator */ aux->alert_mark = aux->head + range/2 - 1; te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags = te->flags | SDB_TE_ALERT_REQ_MASK; + te->header.a = 1; /* Reset hardware buffer head */ - head = AUX_SDB_INDEX(aux, aux->head); + head = aux_sdb_index(aux, aux->head); base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; offset = head % CPUM_SF_SDB_PER_TABLE; - cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); - cpuhw->lsctl.dear = aux->sdb_index[head]; + cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); - debug_sprintf_event(sfdbg, 6, "%s: " - "head->alert_mark->empty_mark (num_alert, range)" - "[%#lx -> %#lx -> %#lx] (%#lx, %#lx) " - "tear index %#lx, tear %#lx dear %#lx\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " + "index %ld tear %#lx dear %#lx\n", __func__, aux->head, aux->alert_mark, aux->empty_mark, - AUX_SDB_NUM_ALERT(aux), range, head / CPUM_SF_SDB_PER_TABLE, - cpuhw->lsctl.tear, - cpuhw->lsctl.dear); + cpuhw->lsctl.tear, cpuhw->lsctl.dear); return 0; } @@ -1453,15 +1522,16 @@ static int aux_output_begin(struct perf_output_handle *handle, static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; + union hws_trailer_header old, prev, new; struct hws_trailer_entry *te; te = aux_sdb_trailer(aux, alert_index); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - orig_overflow = te->overflow; - *overflow = orig_overflow; - if (orig_flags & SDB_TE_BUFFER_FULL_MASK) { + old.val = prev.val; + new.val = prev.val; + *overflow = old.overflow; + if (old.f) { /* * SDB is already set by hardware. 
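To program the hardware, aux_output_begin() has to translate the flat SDB index at aux->head into an SDBT page plus a slot within that page, because the TEAR register points into the table while DEAR points at the SDB itself. The split is plain div/mod arithmetic; a sketch (the 511 slots-per-table figure assumes 4 KB tables whose last slot is the link entry):

#define SDB_PER_TABLE 511UL	/* 4 KB table: 511 SDB slots + 1 link slot */

/* Split a flat SDB index into (table, slot) the way aux_output_begin()
 * does before programming the TEAR/DEAR hardware registers. */
static void tear_position(unsigned long head,
			  unsigned long *table, unsigned long *slot)
{
	*table = head / SDB_PER_TABLE;	/* which SDBT page */
	*slot = head % SDB_PER_TABLE;	/* 8-byte entry inside it */
}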
* Abort and try to set somewhere @@ -1469,10 +1539,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, */ return false; } - new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 1; + new.overflow = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); return true; } @@ -1501,11 +1571,15 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; - unsigned long i, range_scan, idx; + unsigned long i, range_scan, idx, idx_old; + union hws_trailer_header old, prev, new; + unsigned long long orig_overflow; struct hws_trailer_entry *te; - if (range <= AUX_SDB_NUM_EMPTY(aux)) + debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " + "empty %ld\n", __func__, range, aux->head, + aux->alert_mark, aux->empty_mark); + if (range <= aux_sdb_num_empty(aux)) /* * No need to scan. All SDBs in range are marked as empty. * Just set alert indicator. Should check race with hardware @@ -1526,27 +1600,32 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, * Start scanning from one SDB behind empty_mark. If the new alert * indicator fall into this range, set it. */ - range_scan = range - AUX_SDB_NUM_EMPTY(aux); - idx = aux->empty_mark + 1; + range_scan = range - aux_sdb_num_empty(aux); + idx_old = idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - orig_overflow = te->overflow; - new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK; + old.val = prev.val; + new.val = prev.val; + orig_overflow = old.overflow; + new.f = 0; + new.overflow = 0; if (idx == aux->alert_mark) - new_flags |= SDB_TE_ALERT_REQ_MASK; + new.a = 1; else - new_flags &= ~SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); *overflow += orig_overflow; } /* Update empty_mark to new position */ aux->empty_mark = aux->head + range - 1; + debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld " + "empty %ld\n", __func__, range_scan, idx_old, + idx - 1, aux->empty_mark); return true; } @@ -1567,10 +1646,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) return; /* Inform user space new data arrived */ - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; + debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, + size >> PAGE_SHIFT); perf_aux_output_end(handle, size); - num_sdb = aux->sfb.num_sdb; + num_sdb = aux->sfb.num_sdb; while (!done) { /* Get an output handle */ aux = perf_aux_output_begin(handle, cpuhw->event); @@ -1578,9 +1659,6 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) pr_err("The AUX buffer with %lu pages for the " "diagnostic-sampling mode is full\n", num_sdb); - debug_sprintf_event(sfdbg, 1, - "%s: AUX buffer used up\n", - __func__); break; } if (WARN_ON_ONCE(!aux)) @@ -1602,14 +1680,14 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) size = range << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); pr_err("Sample data caused the AUX buffer with %lu " - 
"pages to overflow\n", num_sdb); - debug_sprintf_event(sfdbg, 1, "%s: head %#lx range %#lx " - "overflow %#llx\n", __func__, + "pages to overflow\n", aux->sfb.num_sdb); + debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld " + "overflow %lld\n", __func__, aux->head, range, overflow); } else { - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); - debug_sprintf_event(sfdbg, 6, "%s: head %#lx alert %#lx " + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " "already full, try another\n", __func__, aux->head, aux->alert_mark); @@ -1617,11 +1695,9 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) } if (done) - debug_sprintf_event(sfdbg, 6, "%s: aux_reset_buffer " - "[%#lx -> %#lx -> %#lx] (%#lx, %#lx)\n", - __func__, aux->head, aux->alert_mark, - aux->empty_mark, AUX_SDB_NUM_ALERT(aux), - range); + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " + "empty %ld\n", __func__, aux->head, + aux->alert_mark, aux->empty_mark); } /* @@ -1644,19 +1720,18 @@ static void aux_buffer_free(void *data) kfree(aux->sdb_index); kfree(aux); - debug_sprintf_event(sfdbg, 4, "%s: free " - "%lu SDBTs\n", __func__, num_sdbt); + debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt); } static void aux_sdb_init(unsigned long sdb) { struct hws_trailer_entry *te; - te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + te = trailer_entry_ptr(sdb); /* Save clock base */ te->clock_base = 1; - memcpy(&te->progusage2, &tod_clock_base[1], 8); + te->progusage2 = tod_clock_base.tod; } /* @@ -1697,13 +1772,13 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, } /* Allocate aux_buffer struct for the event */ - aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL); + aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL); if (!aux) goto no_aux; sfb = &aux->sfb; /* Allocate sdbt_index for fast reference */ - n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE; + n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE); aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL); if (!aux->sdbt_index) goto no_sdbt_index; @@ -1715,7 +1790,7 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, /* Allocate the first SDBT */ sfb->num_sdbt = 0; - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; @@ -1727,23 +1802,23 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ for (i = 0; i < nr_pages; i++, tail++) { if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(GFP_KERNEL); + new = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!new) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys(new) + 1; tail = new; } /* Tail is the entry in a SDBT */ - *tail = (unsigned long)pages[i]; + *tail = virt_to_phys(pages[i]); aux->sdb_index[i] = (unsigned long)pages[i]; aux_sdb_init((unsigned long)pages[i]); } sfb->num_sdb = nr_pages; /* Link the last entry in the SDBT to the first SDBT */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; /* @@ -1753,8 +1828,8 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ aux->empty_mark = sfb->num_sdb - 1; - debug_sprintf_event(sfdbg, 4, 
"%s: setup %lu SDBTs and %lu SDBs\n", - __func__, sfb->num_sdbt, sfb->num_sdb); + debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__, + sfb->num_sdbt, sfb->num_sdb); return aux; @@ -1776,7 +1851,7 @@ static void cpumsf_pmu_read(struct perf_event *event) /* Nothing to do ... updates are interrupt-driven */ } -/* Check if the new sampling period/freqeuncy is appropriate. +/* Check if the new sampling period/frequency is appropriate. * * Return non-zero on error and zero on passed checks. */ @@ -1883,9 +1958,9 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) cpuhw->lsctl.h = 1; cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); if (!SAMPL_DIAG_MODE(&event->hw)) { - cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; - cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; - TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt); + cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt; + TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt; } /* Ensure sampling functions are in the disabled state. If disabled, @@ -2202,4 +2277,4 @@ out: } arch_initcall(init_cpum_sampling_pmu); -core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0640); +core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 1e75cc983546..dfa77da2fd2e 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -15,7 +15,10 @@ #include <linux/export.h> #include <linux/seq_file.h> #include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/compat.h> #include <linux/sysfs.h> +#include <asm/stacktrace.h> #include <asm/irq.h> #include <asm/cpu_mf.h> #include <asm/lowcore.h> @@ -23,27 +26,6 @@ #include <asm/sysinfo.h> #include <asm/unwind.h> -const char *perf_pmu_name(void) -{ - if (cpum_cf_avail() || cpum_sf_avail()) - return "CPU-Measurement Facilities (CPU-MF)"; - return "pmu"; -} -EXPORT_SYMBOL(perf_pmu_name); - -int perf_num_counters(void) -{ - int num = 0; - - if (cpum_cf_avail()) - num += PERF_CPUM_CF_MAX_CTR; - if (cpum_sf_avail()) - num += PERF_CPUM_SF_MAX_CTR; - - return num; -} -EXPORT_SYMBOL(perf_num_counters); - static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) { struct stack_frame *stack = (struct stack_frame *) regs->gprs[15]; @@ -51,7 +33,7 @@ static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) if (!stack) return NULL; - return (struct kvm_s390_sie_block *) stack->empty1[0]; + return (struct kvm_s390_sie_block *)stack->sie_control_block; } static bool is_in_guest(struct pt_regs *regs) @@ -233,6 +215,44 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } } +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (is_compat_task()) + return; + perf_callchain_store(entry, instruction_pointer(regs)); + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (entry->nr < entry->max_stack) { + if (__get_user(sp, &sf->back_chain)) + break; + if (__get_user(ip, &sf->gprs[8])) + break; + if (ip & 0x1) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. 
+ */ + if (first && !(regs->gprs[14] & 0x1)) + ip = regs->gprs[14]; + else + break; + } + perf_callchain_store(entry, ip); + /* Sanity check: ABI requires SP to be 8-byte aligned. */ + if (!sp || sp & 0x7) + break; + sf = (void __user *)sp; + first = false; + } + pagefault_enable(); +} + /* Perf definitions for PMU event attributes in sysfs */ ssize_t cpumf_events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c new file mode 100644 index 000000000000..bf8a672b15a4 --- /dev/null +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -0,0 +1,773 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_crypto" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +static debug_info_t *cfm_dbg; +static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */ + /* extracted with QPACI instruction */ + +DEFINE_STATIC_KEY_FALSE(pai_key); + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +struct paicrypt_map { + unsigned long *page; /* Page for CPU to store counters */ + struct pai_userdata *save; /* Page to store non-zero counters */ + unsigned int active_events; /* # of PAI crypto users */ + refcount_t refcnt; /* Reference count mapped buffers */ + enum paievt_mode mode; /* Type of event */ + struct perf_event *event; /* Perf event for sampling */ +}; + +struct paicrypt_mapptr { + struct paicrypt_map *mapptr; +}; + +static struct paicrypt_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct paicrypt_mapptr __percpu *mapptr; +} paicrypt_root; + +/* Free per CPU data when the last event is removed. */ +static void paicrypt_root_free(void) +{ + if (refcount_dec_and_test(&paicrypt_root.refcnt)) { + free_percpu(paicrypt_root.mapptr); + paicrypt_root.mapptr = NULL; + } + debug_sprintf_event(cfm_dbg, 5, "%s root.refcount %d\n", __func__, + refcount_read(&paicrypt_root.refcnt)); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int paicrypt_root_alloc(void) +{ + if (!refcount_inc_not_zero(&paicrypt_root.refcnt)) { + /* The memory is already zeroed. */ + paicrypt_root.mapptr = alloc_percpu(struct paicrypt_mapptr); + if (!paicrypt_root.mapptr) + return -ENOMEM; + refcount_set(&paicrypt_root.refcnt, 1); + } + return 0; +} + +/* Release the PMU if event is the last perf event */ +static DEFINE_MUTEX(pai_reserve_mutex); + +/* Adjust usage counters and remove allocated memory when all users are + * gone. 
+ */ +static void paicrypt_event_destroy(struct perf_event *event) +{ + struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, + event->cpu); + struct paicrypt_map *cpump = mp->mapptr; + + cpump->event = NULL; + static_branch_dec(&pai_key); + mutex_lock(&pai_reserve_mutex); + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d" + " mode %d refcnt %u\n", __func__, + event->attr.config, event->cpu, + cpump->active_events, cpump->mode, + refcount_read(&cpump->refcnt)); + if (refcount_dec_and_test(&cpump->refcnt)) { + debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", + __func__, (unsigned long)cpump->page, + cpump->save); + free_page((unsigned long)cpump->page); + kvfree(cpump->save); + kfree(cpump); + mp->mapptr = NULL; + } + paicrypt_root_free(); + mutex_unlock(&pai_reserve_mutex); +} + +static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) +{ + if (kernel) + nr += PAI_CRYPTO_MAXCTR; + return page[nr]; +} + +/* Read the counter values. Return value from location in CMP. For event + * CRYPTO_ALL sum up all events. + */ +static u64 paicrypt_getdata(struct perf_event *event, bool kernel) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + u64 sum = 0; + int i; + + if (event->attr.config != PAI_CRYPTO_BASE) { + return paicrypt_getctr(cpump->page, + event->attr.config - PAI_CRYPTO_BASE, + kernel); + } + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = paicrypt_getctr(cpump->page, i, kernel); + + if (!val) + continue; + sum += val; + } + return sum; +} + +static u64 paicrypt_getall(struct perf_event *event) +{ + u64 sum = 0; + + if (!event->attr.exclude_kernel) + sum += paicrypt_getdata(event, true); + if (!event->attr.exclude_user) + sum += paicrypt_getdata(event, false); + + return sum; +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for crypto events. + * + * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is safe to block. + * When the event initialization function fails, no other callback will + * be invoked. + * + * Allocate the memory for the event. + */ +static struct paicrypt_map *paicrypt_busy(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + struct paicrypt_map *cpump = NULL; + struct paicrypt_mapptr *mp; + int rc; + + mutex_lock(&pai_reserve_mutex); + + /* Allocate root node */ + rc = paicrypt_root_alloc(); + if (rc) + goto unlock; + + /* Allocate node for this event */ + mp = per_cpu_ptr(paicrypt_root.mapptr, event->cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paicrypt_map allocated? */ + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) { + rc = -ENOMEM; + goto free_root; + } + } + + if (a->sample_period) { /* Sampling requested */ + if (cpump->mode != PAI_MODE_NONE) + rc = -EBUSY; /* ... sampling/counting active */ + } else { /* Counting requested */ + if (cpump->mode == PAI_MODE_SAMPLING) + rc = -EBUSY; /* ... and sampling active */ + } + /* + * This error case triggers when there is a conflict: + * Either sampling requested and counting already active, or vice + * versa. Therefore the struct paicrypt_map for this CPU is + * needed or the error could not have occurred. Only adjust root + * node refcount. 
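paicrypt_getctr() above relies on the counter page being split into two halves of PAI_CRYPTO_MAXCTR 8-byte slots, user counters first and kernel counters after them, so the kernel instance of counter nr lives at index nr + PAI_CRYPTO_MAXCTR. A sketch of that addressing (the constant's value is an assumption for illustration):

#include <stdint.h>

#define PAI_CRYPTO_MAXCTR 256	/* assumed slots per half, see asm/pai.h */

/* The counter page: page[0..MAXCTR-1] user counters,
 * page[MAXCTR..2*MAXCTR-1] kernel counters. */
static uint64_t read_ctr(const uint64_t *page, unsigned int nr, int kernel)
{
	if (kernel)
		nr += PAI_CRYPTO_MAXCTR;	/* jump to kernel half */
	return page[nr];
}

/* Counting both address spaces means summing the two halves, as
 * paicrypt_getall() does for events without exclude_* bits set. */
static uint64_t read_ctr_all(const uint64_t *page, unsigned int nr)
{
	return read_ctr(page, nr, 0) + read_ctr(page, nr, 1);
}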
+ */ + if (rc) + goto free_root; + + /* Allocate memory for counter page and counter extraction. + * Only the first counting event has to allocate a page. + */ + if (cpump->page) { + refcount_inc(&cpump->refcnt); + goto unlock; + } + + rc = -ENOMEM; + cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!cpump->page) + goto free_paicrypt_map; + cpump->save = kvmalloc_array(paicrypt_cnt + 1, + sizeof(struct pai_userdata), GFP_KERNEL); + if (!cpump->save) { + free_page((unsigned long)cpump->page); + cpump->page = NULL; + goto free_paicrypt_map; + } + + /* Set mode and reference count */ + rc = 0; + refcount_set(&cpump->refcnt, 1); + cpump->mode = a->sample_period ? PAI_MODE_SAMPLING : PAI_MODE_COUNTING; + mp->mapptr = cpump; + debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d" + " mode %d refcnt %u page %#lx save %p rc %d\n", + __func__, a->sample_period, cpump->active_events, + cpump->mode, refcount_read(&cpump->refcnt), + (unsigned long)cpump->page, cpump->save, rc); + goto unlock; + +free_paicrypt_map: + kfree(cpump); + mp->mapptr = NULL; +free_root: + paicrypt_root_free(); + +unlock: + mutex_unlock(&pai_reserve_mutex); + return rc ? ERR_PTR(rc) : cpump; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paicrypt_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + struct paicrypt_map *cpump; + + /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI crypto event must be in valid range */ + if (a->config < PAI_CRYPTO_BASE || + a->config > PAI_CRYPTO_BASE + paicrypt_cnt) + return -EINVAL; + /* Allow only CPU wide operation, no process context for now. */ + if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1) + return -ENOENT; + /* Allow only CRYPTO_ALL for sampling. */ + if (a->sample_period && a->config != PAI_CRYPTO_BASE) + return -EINVAL; + + cpump = paicrypt_busy(event); + if (IS_ERR(cpump)) + return PTR_ERR(cpump); + + event->destroy = paicrypt_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which contain the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + static_branch_inc(&pai_key); + return 0; +} + +static void paicrypt_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paicrypt_getall(event); + local64_set(&event->hw.prev_count, new); + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +} + +static void paicrypt_start(struct perf_event *event, int flags) +{ + u64 sum; + + /* Event initialization sets last_tag to 0. When later on the events + * are deleted and re-added, do not reset the event count value to zero. + * Events are added, deleted and re-added when 2 or more events + * are active at the same time. 
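The delta computation in paicrypt_read() above handles a counter that wrapped past 2^64 - 1: when new < prev, the distance is the remainder up to -1ULL plus the wrapped part plus one. In plain unsigned 64-bit arithmetic both branches collapse into a single subtraction, as this small check demonstrates:

#include <assert.h>
#include <stdint.h>

static uint64_t delta(uint64_t prev, uint64_t new)
{
	/* Same result as the two-branch form in paicrypt_read():
	 * (prev <= new) ? new - prev : (-1ULL - prev) + new + 1 */
	return new - prev;	/* unsigned arithmetic wraps mod 2^64 */
}

int main(void)
{
	assert(delta(100, 250) == 150);		/* no wrap */
	assert(delta(UINT64_MAX - 1, 3) == 5);	/* wrapped counter */
	return 0;
}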
+ */ + if (!event->attr.sample_period) { /* Counting */ + if (!event->hw.last_tag) { + event->hw.last_tag = 1; + sum = paicrypt_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + } + } else { /* Sampling */ + perf_sched_cb_inc(event->pmu); + } +} + +static int paicrypt_add(struct perf_event *event, int flags) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + unsigned long ccd; + + if (++cpump->active_events == 1) { + ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; + WRITE_ONCE(S390_lowcore.ccd, ccd); + local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + } + cpump->event = event; + if (flags & PERF_EF_START) + paicrypt_start(event, PERF_EF_RELOAD); + event->hw.state = 0; + return 0; +} + +static void paicrypt_stop(struct perf_event *event, int flags) +{ + if (!event->attr.sample_period) /* Counting */ + paicrypt_read(event); + else /* Sampling */ + perf_sched_cb_dec(event->pmu); + event->hw.state = PERF_HES_STOPPED; +} + +static void paicrypt_del(struct perf_event *event, int flags) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + + paicrypt_stop(event, PERF_EF_UPDATE); + if (--cpump->active_events == 0) { + local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + WRITE_ONCE(S390_lowcore.ccd, 0); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page, + bool exclude_user, bool exclude_kernel) +{ + int i, outidx = 0; + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = 0; + + if (!exclude_kernel) + val += paicrypt_getctr(page, i, true); + if (!exclude_user) + val += paicrypt_getctr(page, i, false); + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(struct pai_userdata); +} + +static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, + struct perf_event *event) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) { + data.cpu_entry.cpu = smp_processor_id(); + data.cpu_entry.reserved = 0; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + perf_sample_save_raw_data(&data, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Clear lowcore page after read */ + memset(cpump->page, 0, PAGE_SIZE); + return overflow; +} + +/* Check if there is data to be saved on schedule out of a task. 
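The raw payload built by paicrypt_copy() above is a packed array of (counter number, value) records covering only the counters that were non-zero. A consumer-side sketch that decodes such a PERF_SAMPLE_RAW payload (the struct mirrors pai_userdata; how the buffer is obtained is up to the reader tool):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct pai_userdata {
	uint16_t num;	/* counter number, 1-based */
	uint64_t value;	/* summed user+kernel count */
} __attribute__((packed));

/* Walk the raw sample area record by record. */
static void decode_raw(const void *buf, size_t rawsize)
{
	const struct pai_userdata *rec = buf;
	size_t n = rawsize / sizeof(*rec);

	for (size_t i = 0; i < n; i++)
		printf("counter %u = %llu\n", rec[i].num,
		       (unsigned long long)rec[i].value);
}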
*/ +static int paicrypt_have_sample(void) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + struct perf_event *event = cpump->event; + size_t rawsize; + int rc = 0; + + if (!event) /* No event active */ + return 0; + rawsize = paicrypt_copy(cpump->save, cpump->page, + cpump->event->attr.exclude_user, + cpump->event->attr.exclude_kernel); + if (rawsize) /* At least one counter incremented */ + rc = paicrypt_push_sample(rawsize, cpump, event); + return rc; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event CRYPTO_ALL is allowed. + */ +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paicrypt_have_sample(); +} + +/* Attribute definitions for paicrypt interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instruction returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counter identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_invalid_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. 
*/ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + [78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = 
"PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + int i; + + for (i = 0; i < num; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_CRYPTO_BASE + num; + pa->attr.attr.name = paicrypt_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. 
*/ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i); + return ret; + } + } + attrs[i] = NULL; + paicrypt_events_group.attrs = attrs; + return 0; +} + +static int __init paicrypt_init(void) +{ + struct qpaci_info_block ib; + int rc; + + if (!test_facility(196)) + return 0; + + qpaci(&ib); + paicrypt_cnt = ib.num_cc; + if (paicrypt_cnt == 0) + return 0; + if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) + paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1; + + rc = attr_event_init(); /* Export known PAI crypto events */ + if (rc) { + pr_err("Creation of PMU pai_crypto /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!cfm_dbg) { + pr_err("Registration of s390dbf pai_crypto failed\n"); + return -ENOMEM; + } + debug_register_view(cfm_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paicrypt, "pai_crypto", -1); + if (rc) { + pr_err("Registering the pai_crypto PMU failed with rc=%i\n", + rc); + debug_unregister_view(cfm_dbg, &debug_sprintf_view); + debug_unregister(cfm_dbg); + return rc; + } + return 0; +} + +device_initcall(paicrypt_init); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c new file mode 100644 index 000000000000..af7f2b538c8f --- /dev/null +++ b/arch/s390/kernel/perf_pai_ext.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Extension + * Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_ext" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ +#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ + +static debug_info_t *paiext_dbg; +static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. 
+ */
+struct paiext_cb {		/* PAI extension 1 control block */
+	u64 header;		/* Not used */
+	u64 reserved1;
+	u64 acc;		/* Addr to analytics counter control block */
+	u8 reserved2[488];
+} __packed;
+
+struct paiext_map {
+	unsigned long *area;		/* Area for CPU to store counters */
+	struct pai_userdata *save;	/* Area to store non-zero counters */
+	enum paievt_mode mode;		/* Type of event */
+	unsigned int active_events;	/* # of PAI Extension users */
+	refcount_t refcnt;
+	struct perf_event *event;	/* Perf event for sampling */
+	struct paiext_cb *paiext_cb;	/* PAI extension control block area */
+};
+
+struct paiext_mapptr {
+	struct paiext_map *mapptr;
+};
+
+static struct paiext_root {		/* Anchor to per CPU data */
+	refcount_t refcnt;		/* Overall active events */
+	struct paiext_mapptr __percpu *mapptr;
+} paiext_root;
+
+/* Free per CPU data when the last event is removed. */
+static void paiext_root_free(void)
+{
+	if (refcount_dec_and_test(&paiext_root.refcnt)) {
+		free_percpu(paiext_root.mapptr);
+		paiext_root.mapptr = NULL;
+	}
+}
+
+/* On initialization of first event also allocate per CPU data dynamically.
+ * Start with an array of pointers, the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int paiext_root_alloc(void)
+{
+	if (!refcount_inc_not_zero(&paiext_root.refcnt)) {
+		/* The memory is already zeroed. */
+		paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
+		if (!paiext_root.mapptr) {
+			/* Returning without refcnt adjustment is ok. The
+			 * error code is handled by paiext_alloc() which
+			 * decrements refcnt when an event cannot be
+			 * created.
+			 */
+			return -ENOMEM;
+		}
+		refcount_set(&paiext_root.refcnt, 1);
+	}
+	return 0;
+}
+
+/* Protects against concurrent increments of the sampler and counter
+ * members and prohibits concurrent execution of counting and sampling
+ * events.
+ * Ensures that the analytics counter block is deallocated only when the
+ * sampling and counting counts on that cpu are zero.
+ * For details see paiext_alloc().
+ */
+static DEFINE_MUTEX(paiext_reserve_mutex);
+
+/* Free all memory allocated for event counting/sampling setup */
+static void paiext_free(struct paiext_mapptr *mp)
+{
+	kfree(mp->mapptr->area);
+	kfree(mp->mapptr->paiext_cb);
+	kvfree(mp->mapptr->save);
+	kfree(mp->mapptr);
+	mp->mapptr = NULL;
+}
+
+/* Release the PMU if event is the last perf event */
+static void paiext_event_destroy(struct perf_event *event)
+{
+	struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+	struct paiext_map *cpump = mp->mapptr;
+
+	mutex_lock(&paiext_reserve_mutex);
+	cpump->event = NULL;
+	if (refcount_dec_and_test(&cpump->refcnt))	/* Last reference gone */
+		paiext_free(mp);
+	paiext_root_free();
+	mutex_unlock(&paiext_reserve_mutex);
+	debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
+			    event->cpu, mp->mapptr);
+
+}
+
+/* Used to avoid races in checking concurrent access of counting and
+ * sampling for pai_extension events.
+ *
+ * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is safe to block.
+ * When the event initialization function fails, no other callback will
+ * be invoked.
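+ *
+ * Admission rules as implemented in paiext_alloc() (sketch):
+ *	existing \ new	counting	sampling
+ *	none		ok		ok
+ *	counting	ok		-EBUSY
+ *	sampling	-EBUSY		-EBUSY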
+ *
+ * Allocate the memory for the event.
+ */
+static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
+{
+	struct paiext_mapptr *mp;
+	struct paiext_map *cpump;
+	int rc;
+
+	mutex_lock(&paiext_reserve_mutex);
+
+	rc = paiext_root_alloc();
+	if (rc)
+		goto unlock;
+
+	mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+	cpump = mp->mapptr;
+	if (!cpump) {			/* Paiext_map allocated? */
+		rc = -ENOMEM;
+		cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+		if (!cpump)
+			goto undo;
+
+		/* Allocate memory for counter area and counter extraction.
+		 * These are
+		 * - a 512 byte block and requires 512 byte boundary alignment.
+		 * - a 1 KB block and requires 1 KB boundary alignment.
+		 * Only the first counting event has to allocate the area.
+		 *
+		 * Note: This works with commit 59bb47985c1d by default.
+		 * Backporting this to kernels without this commit might
+		 * need adjustment.
+		 */
+		mp->mapptr = cpump;
+		cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
+		cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
+		cpump->save = kvmalloc_array(paiext_cnt + 1,
+					     sizeof(struct pai_userdata),
+					     GFP_KERNEL);
+		if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
+			paiext_free(mp);
+			goto undo;
+		}
+		refcount_set(&cpump->refcnt, 1);
+		cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
+					       : PAI_MODE_COUNTING;
+	} else {
+		/* Multiple invocation, check what is active.
+		 * Supported are multiple counter events or only one sampling
+		 * event concurrently at any one time.
+		 */
+		if (cpump->mode == PAI_MODE_SAMPLING ||
+		    (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
+			rc = -EBUSY;
+			goto undo;
+		}
+		refcount_inc(&cpump->refcnt);
+	}
+
+	rc = 0;
+	cpump->event = event;
+
+undo:
+	if (rc) {
+		/* Error in allocation of event, decrement anchor. Since
+		 * the event is not created, its destroy() function is never
+		 * invoked. Adjust the reference counter for the anchor.
+		 */
+		paiext_root_free();
+	}
+unlock:
+	mutex_unlock(&paiext_reserve_mutex);
+	/* If rc is non-zero, no increment of counter/sampler was done. */
+	return rc;
+}
+
+/* The PAI extension 1 control block supports up to 128 entries. Return
+ * the index within PAIE1_CB given the event number. Also validate event
+ * number.
+ */
+static int paiext_event_valid(struct perf_event *event)
+{
+	u64 cfg = event->attr.config;
+
+	if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) {
+		/* Offset NNPA in paiext_cb */
+		event->hw.config_base = offsetof(struct paiext_cb, acc);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/* Might be called on a different CPU than the one the event is intended for. */
+static int paiext_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *a = &event->attr;
+	int rc;
+
+	/* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
+	if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+		return -ENOENT;
+	/* PAI extension event must be valid and in supported range */
+	rc = paiext_event_valid(event);
+	if (rc)
+		return rc;
+	/* Allow only CPU wide operation, no process context for now. */
+	if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
+		return -ENOENT;
+	/* Allow only event NNPA_ALL for sampling.
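+ * An illustrative invocation from the perf tool, using the event names
+ * exported in sysfs (exact syntax depends on the perf version):
+ *	perf stat -e pai_ext/NNPA_MUL/ -a -- sleep 1	# counting
+ *	perf record -e pai_ext/NNPA_ALL/ -a -- sleep 1	# sampling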
 */
+	if (a->sample_period && a->config != PAI_NNPA_BASE)
+		return -EINVAL;
+	/* Prohibit exclude_user event selection */
+	if (a->exclude_user)
+		return -EINVAL;
+
+	rc = paiext_alloc(a, event);
+	if (rc)
+		return rc;
+	event->destroy = paiext_event_destroy;
+
+	if (a->sample_period) {
+		a->sample_period = 1;
+		a->freq = 0;
+		/* Register for paiext_sched_task() to be called */
+		event->attach_state |= PERF_ATTACH_SCHED_CB;
+		/* Add raw data which are the memory mapped counters */
+		a->sample_type |= PERF_SAMPLE_RAW;
+		/* Turn off inheritance */
+		a->inherit = 0;
+	}
+
+	return 0;
+}
+
+static u64 paiext_getctr(unsigned long *area, int nr)
+{
+	return area[nr];
+}
+
+/* Read the counter values. Return value from location in buffer. For event
+ * NNPA_ALL sum up all events.
+ */
+static u64 paiext_getdata(struct perf_event *event)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	u64 sum = 0;
+	int i;
+
+	if (event->attr.config != PAI_NNPA_BASE)
+		return paiext_getctr(cpump->area,
+				     event->attr.config - PAI_NNPA_BASE);
+
+	for (i = 1; i <= paiext_cnt; i++)
+		sum += paiext_getctr(cpump->area, i);
+
+	return sum;
+}
+
+static u64 paiext_getall(struct perf_event *event)
+{
+	return paiext_getdata(event);
+}
+
+static void paiext_read(struct perf_event *event)
+{
+	u64 prev, new, delta;
+
+	prev = local64_read(&event->hw.prev_count);
+	new = paiext_getall(event);
+	local64_set(&event->hw.prev_count, new);
+	delta = new - prev;
+	local64_add(delta, &event->count);
+}
+
+static void paiext_start(struct perf_event *event, int flags)
+{
+	u64 sum;
+
+	if (!event->attr.sample_period) {	/* Counting */
+		if (!event->hw.last_tag) {
+			event->hw.last_tag = 1;
+			sum = paiext_getall(event);	/* Get current value */
+			local64_set(&event->hw.prev_count, sum);
+		}
+	} else {				/* Sampling */
+		perf_sched_cb_inc(event->pmu);
+	}
+}
+
+static int paiext_add(struct perf_event *event, int flags)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	struct paiext_cb *pcb = cpump->paiext_cb;
+
+	if (++cpump->active_events == 1) {
+		S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
+		pcb->acc = virt_to_phys(cpump->area) | 0x1;
+		/* Enable CPU instruction lookup for PAIE1 control block */
+		local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
+		debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+				    __func__, S390_lowcore.aicd, pcb->acc);
+	}
+	cpump->event = event;
+	if (flags & PERF_EF_START)
+		paiext_start(event, PERF_EF_RELOAD);
+	event->hw.state = 0;
+	return 0;
+}
+
+static void paiext_stop(struct perf_event *event, int flags)
+{
+	if (!event->attr.sample_period)	/* Counting */
+		paiext_read(event);
+	else				/* Sampling */
+		perf_sched_cb_dec(event->pmu);
+	event->hw.state = PERF_HES_STOPPED;
+}
+
+static void paiext_del(struct perf_event *event, int flags)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	struct paiext_cb *pcb = cpump->paiext_cb;
+
+	paiext_stop(event, PERF_EF_UPDATE);
+	if (--cpump->active_events == 0) {
+		/* Disable CPU instruction lookup for PAIE1 control block */
+		local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
+		pcb->acc = 0;
+		S390_lowcore.aicd = 0;
+		debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+				    __func__, S390_lowcore.aicd, pcb->acc);
+	}
+}
+
+/* Create raw data and save it in buffer. Returns number of bytes copied.
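+ * For example (sketch): if only counters 2 and 5 are nonzero, two packed
+ * pai_userdata records { .num = 2, .value = v2 } { .num = 5, .value = v5 }
+ * are written and 2 * sizeof(struct pai_userdata) = 20 bytes is returned.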
+ * Saves only positive counter entries of the form
+ * 2 bytes: Number of counter
+ * 8 bytes: Value of counter
+ */
+static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area)
+{
+	int i, outidx = 0;
+
+	for (i = 1; i <= paiext_cnt; i++) {
+		u64 val = paiext_getctr(area, i);
+
+		if (val) {
+			userdata[outidx].num = i;
+			userdata[outidx].value = val;
+			outidx++;
+		}
+	}
+	return outidx * sizeof(*userdata);
+}
+
+/* Write sample when one or more counter values are nonzero.
+ *
+ * Note: The functions paiext_sched_task() and paiext_push_sample() are not
+ * invoked after function paiext_del() has been called because of function
+ * perf_sched_cb_dec().
+ * The functions paiext_sched_task() and paiext_push_sample() are only
+ * called when sampling is active. Function perf_sched_cb_inc()
+ * has been invoked to install function paiext_sched_task() as callback
+ * to run at context switch time (see paiext_add()).
+ *
+ * This causes function perf_event_context_sched_out() and
+ * perf_event_context_sched_in() to check whether the PMU has installed a
+ * sched_task() callback. That callback is not active after paiext_del()
+ * returns and has deleted the event on that CPU.
+ */
+static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
+			      struct perf_event *event)
+{
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	int overflow;
+
+	/* Setup perf sample */
+	memset(&regs, 0, sizeof(regs));
+	memset(&raw, 0, sizeof(raw));
+	memset(&data, 0, sizeof(data));
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	if (event->attr.sample_type & PERF_SAMPLE_TID) {
+		data.tid_entry.pid = task_tgid_nr(current);
+		data.tid_entry.tid = task_pid_nr(current);
+	}
+	if (event->attr.sample_type & PERF_SAMPLE_TIME)
+		data.time = event->clock();
+	if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+		data.id = event->id;
+	if (event->attr.sample_type & PERF_SAMPLE_CPU)
+		data.cpu_entry.cpu = smp_processor_id();
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.frag.size = rawsize;
+		raw.frag.data = cpump->save;
+		perf_sample_save_raw_data(&data, &raw);
+	}
+
+	overflow = perf_event_overflow(event, &data, &regs);
+	perf_event_update_userpage(event);
+	/* Clear lowcore area after read */
+	memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
+	return overflow;
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static int paiext_have_sample(void)
+{
+	struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+	struct paiext_map *cpump = mp->mapptr;
+	struct perf_event *event = cpump->event;
+	size_t rawsize;
+	int rc = 0;
+
+	if (!event)
+		return 0;
+	rawsize = paiext_copy(cpump->save, cpump->area);
+	if (rawsize)			/* Incremented counters */
+		rc = paiext_push_sample(rawsize, cpump, event);
+	return rc;
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event NNPA_ALL is allowed.
+ */
+static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+	/* We started with a clean page on event installation. So read out
+	 * results on schedule_out and if page was dirty, clear values.
+	 */
+	if (!sched_in)
+		paiext_have_sample();
+}
+
+/* Attribute definitions for PAI extension 1 interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation.
Use the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instruction returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported; there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1800 + offset in mapped kernel page.
+ * All CPU Measurement Facility counter identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Numbers 0x1000 to 0x103e are used for PAI cryptography
+ * counters.
+ * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *paiext_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group paiext_events_group = {
+	.name = "events",
+	.attrs = NULL,			/* Filled in attr_event_init() */
+};
+
+static struct attribute_group paiext_format_group = {
+	.name = "format",
+	.attrs = paiext_format_attr,
+};
+
+static const struct attribute_group *paiext_attr_groups[] = {
+	&paiext_events_group,
+	&paiext_format_group,
+	NULL,
+};
+
+/* Performance monitoring unit for mapped counters */
+static struct pmu paiext = {
+	.task_ctx_nr = perf_invalid_context,
+	.event_init = paiext_event_init,
+	.add = paiext_add,
+	.del = paiext_del,
+	.start = paiext_start,
+	.stop = paiext_stop,
+	.read = paiext_read,
+	.sched_task = paiext_sched_task,
+	.attr_groups = paiext_attr_groups,
+};
+
+/* List of symbolic PAI extension 1 NNPA counter names. */
+static const char * const paiext_ctrnames[] = {
+	[0] = "NNPA_ALL",
+	[1] = "NNPA_ADD",
+	[2] = "NNPA_SUB",
+	[3] = "NNPA_MUL",
+	[4] = "NNPA_DIV",
+	[5] = "NNPA_MIN",
+	[6] = "NNPA_MAX",
+	[7] = "NNPA_LOG",
+	[8] = "NNPA_EXP",
+	[9] = "NNPA_IBM_RESERVED_9",
+	[10] = "NNPA_RELU",
+	[11] = "NNPA_TANH",
+	[12] = "NNPA_SIGMOID",
+	[13] = "NNPA_SOFTMAX",
+	[14] = "NNPA_BATCHNORM",
+	[15] = "NNPA_MAXPOOL2D",
+	[16] = "NNPA_AVGPOOL2D",
+	[17] = "NNPA_LSTMACT",
+	[18] = "NNPA_GRUACT",
+	[19] = "NNPA_CONVOLUTION",
+	[20] = "NNPA_MATMUL_OP",
+	[21] = "NNPA_MATMUL_OP_BCAST23",
+	[22] = "NNPA_SMALLBATCH",
+	[23] = "NNPA_LARGEDIM",
+	[24] = "NNPA_SMALLTENSOR",
+	[25] = "NNPA_1MFRAME",
+	[26] = "NNPA_2GFRAME",
+	[27] = "NNPA_ACCESSEXCEPT",
+};
+
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+	struct perf_pmu_events_attr *pa;
+	struct device_attribute *dap;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		dap = container_of(attrs[i], struct device_attribute, attr);
+		pa = container_of(dap, struct perf_pmu_events_attr, attr);
+		kfree(pa);
+	}
+	kfree(attrs);
+}
+
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+	struct perf_pmu_events_attr *pa;
+
+	pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+	if (!pa)
+		return -ENOMEM;
+
+	sysfs_attr_init(&pa->attr.attr);
+	pa->id = PAI_NNPA_BASE + num;
+	pa->attr.attr.name = paiext_ctrnames[num];
+	pa->attr.attr.mode = 0444;
+	pa->attr.show = cpumf_events_sysfs_show;
+	pa->attr.store = NULL;
+	attrs[num] = &pa->attr.attr;
+	return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly.
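+ * The id stored in each attribute follows the 0x1800 + offset scheme
+ * described above, e.g. NNPA_ALL maps to event=0x1800 and NNPA_ADD to
+ * event=0x1801 (illustrative values derived from PAI_NNPA_BASE).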
*/ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i - 1); + return ret; + } + } + attrs[i] = NULL; + paiext_events_group.attrs = attrs; + return 0; +} + +static int __init paiext_init(void) +{ + struct qpaci_info_block ib; + int rc = -ENOMEM; + + if (!test_facility(197)) + return 0; + + qpaci(&ib); + paiext_cnt = ib.num_nnpa; + if (paiext_cnt >= PAI_NNPA_MAXCTR) + paiext_cnt = PAI_NNPA_MAXCTR; + if (!paiext_cnt) + return 0; + + rc = attr_event_init(); + if (rc) { + pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!paiext_dbg) { + pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); + rc = -ENOMEM; + goto out_init; + } + debug_register_view(paiext_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); + if (rc) { + pr_err("Registration of " KMSG_COMPONENT " PMU failed with " + "rc=%i\n", rc); + goto out_pmu; + } + + return 0; + +out_pmu: + debug_unregister_view(paiext_dbg, &debug_sprintf_view); + debug_unregister(paiext_dbg); +out_init: + attr_event_free(paiext_events_group.attrs, + ARRAY_SIZE(paiext_ctrnames) + 1); + return rc; +} + +device_initcall(paiext_init); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index 4352a504f235..3d93656bd948 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -20,8 +20,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) return 0; idx -= PERF_REG_S390_FP0; - fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) - : current->thread.fpu.fprs[idx]; + if (cpu_has_vx()) + fp = *(freg_t *)(current->thread.fpu.vxrs + idx); + else + fp = current->thread.fpu.fprs[idx]; return fp.ui; } @@ -53,8 +55,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { /* * Use the regs from the first interruption and let diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S deleted file mode 100644 index 59dee9d3bebf..000000000000 --- a/arch/s390/kernel/pgm_check.S +++ /dev/null @@ -1,147 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Program check table. - * - * Copyright IBM Corp. 2012 - */ - -#include <linux/linkage.h> - -#define PGM_CHECK(handler) .quad handler -#define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler) - -/* - * The program check table contains exactly 128 (0x00-0x7f) entries. Each - * line defines the function to be called corresponding to the program check - * interruption code. 
- */ -.section .rodata, "a" -ENTRY(pgm_check_table) -PGM_CHECK_DEFAULT /* 00 */ -PGM_CHECK(illegal_op) /* 01 */ -PGM_CHECK(privileged_op) /* 02 */ -PGM_CHECK(execute_exception) /* 03 */ -PGM_CHECK(do_protection_exception) /* 04 */ -PGM_CHECK(addressing_exception) /* 05 */ -PGM_CHECK(specification_exception) /* 06 */ -PGM_CHECK(data_exception) /* 07 */ -PGM_CHECK(overflow_exception) /* 08 */ -PGM_CHECK(divide_exception) /* 09 */ -PGM_CHECK(overflow_exception) /* 0a */ -PGM_CHECK(divide_exception) /* 0b */ -PGM_CHECK(hfp_overflow_exception) /* 0c */ -PGM_CHECK(hfp_underflow_exception) /* 0d */ -PGM_CHECK(hfp_significance_exception) /* 0e */ -PGM_CHECK(hfp_divide_exception) /* 0f */ -PGM_CHECK(do_dat_exception) /* 10 */ -PGM_CHECK(do_dat_exception) /* 11 */ -PGM_CHECK(translation_exception) /* 12 */ -PGM_CHECK(special_op_exception) /* 13 */ -PGM_CHECK_DEFAULT /* 14 */ -PGM_CHECK(operand_exception) /* 15 */ -PGM_CHECK_DEFAULT /* 16 */ -PGM_CHECK_DEFAULT /* 17 */ -PGM_CHECK(transaction_exception) /* 18 */ -PGM_CHECK_DEFAULT /* 19 */ -PGM_CHECK_DEFAULT /* 1a */ -PGM_CHECK(vector_exception) /* 1b */ -PGM_CHECK(space_switch_exception) /* 1c */ -PGM_CHECK(hfp_sqrt_exception) /* 1d */ -PGM_CHECK_DEFAULT /* 1e */ -PGM_CHECK_DEFAULT /* 1f */ -PGM_CHECK_DEFAULT /* 20 */ -PGM_CHECK_DEFAULT /* 21 */ -PGM_CHECK_DEFAULT /* 22 */ -PGM_CHECK_DEFAULT /* 23 */ -PGM_CHECK_DEFAULT /* 24 */ -PGM_CHECK_DEFAULT /* 25 */ -PGM_CHECK_DEFAULT /* 26 */ -PGM_CHECK_DEFAULT /* 27 */ -PGM_CHECK_DEFAULT /* 28 */ -PGM_CHECK_DEFAULT /* 29 */ -PGM_CHECK_DEFAULT /* 2a */ -PGM_CHECK_DEFAULT /* 2b */ -PGM_CHECK_DEFAULT /* 2c */ -PGM_CHECK_DEFAULT /* 2d */ -PGM_CHECK_DEFAULT /* 2e */ -PGM_CHECK_DEFAULT /* 2f */ -PGM_CHECK_DEFAULT /* 30 */ -PGM_CHECK_DEFAULT /* 31 */ -PGM_CHECK_DEFAULT /* 32 */ -PGM_CHECK_DEFAULT /* 33 */ -PGM_CHECK_DEFAULT /* 34 */ -PGM_CHECK_DEFAULT /* 35 */ -PGM_CHECK_DEFAULT /* 36 */ -PGM_CHECK_DEFAULT /* 37 */ -PGM_CHECK(do_dat_exception) /* 38 */ -PGM_CHECK(do_dat_exception) /* 39 */ -PGM_CHECK(do_dat_exception) /* 3a */ -PGM_CHECK(do_dat_exception) /* 3b */ -PGM_CHECK_DEFAULT /* 3c */ -PGM_CHECK_DEFAULT /* 3d */ -PGM_CHECK_DEFAULT /* 3e */ -PGM_CHECK_DEFAULT /* 3f */ -PGM_CHECK_DEFAULT /* 40 */ -PGM_CHECK_DEFAULT /* 41 */ -PGM_CHECK_DEFAULT /* 42 */ -PGM_CHECK_DEFAULT /* 43 */ -PGM_CHECK_DEFAULT /* 44 */ -PGM_CHECK_DEFAULT /* 45 */ -PGM_CHECK_DEFAULT /* 46 */ -PGM_CHECK_DEFAULT /* 47 */ -PGM_CHECK_DEFAULT /* 48 */ -PGM_CHECK_DEFAULT /* 49 */ -PGM_CHECK_DEFAULT /* 4a */ -PGM_CHECK_DEFAULT /* 4b */ -PGM_CHECK_DEFAULT /* 4c */ -PGM_CHECK_DEFAULT /* 4d */ -PGM_CHECK_DEFAULT /* 4e */ -PGM_CHECK_DEFAULT /* 4f */ -PGM_CHECK_DEFAULT /* 50 */ -PGM_CHECK_DEFAULT /* 51 */ -PGM_CHECK_DEFAULT /* 52 */ -PGM_CHECK_DEFAULT /* 53 */ -PGM_CHECK_DEFAULT /* 54 */ -PGM_CHECK_DEFAULT /* 55 */ -PGM_CHECK_DEFAULT /* 56 */ -PGM_CHECK_DEFAULT /* 57 */ -PGM_CHECK_DEFAULT /* 58 */ -PGM_CHECK_DEFAULT /* 59 */ -PGM_CHECK_DEFAULT /* 5a */ -PGM_CHECK_DEFAULT /* 5b */ -PGM_CHECK_DEFAULT /* 5c */ -PGM_CHECK_DEFAULT /* 5d */ -PGM_CHECK_DEFAULT /* 5e */ -PGM_CHECK_DEFAULT /* 5f */ -PGM_CHECK_DEFAULT /* 60 */ -PGM_CHECK_DEFAULT /* 61 */ -PGM_CHECK_DEFAULT /* 62 */ -PGM_CHECK_DEFAULT /* 63 */ -PGM_CHECK_DEFAULT /* 64 */ -PGM_CHECK_DEFAULT /* 65 */ -PGM_CHECK_DEFAULT /* 66 */ -PGM_CHECK_DEFAULT /* 67 */ -PGM_CHECK_DEFAULT /* 68 */ -PGM_CHECK_DEFAULT /* 69 */ -PGM_CHECK_DEFAULT /* 6a */ -PGM_CHECK_DEFAULT /* 6b */ -PGM_CHECK_DEFAULT /* 6c */ -PGM_CHECK_DEFAULT /* 6d */ -PGM_CHECK_DEFAULT /* 6e */ -PGM_CHECK_DEFAULT /* 6f */ -PGM_CHECK_DEFAULT /* 70 
*/ -PGM_CHECK_DEFAULT /* 71 */ -PGM_CHECK_DEFAULT /* 72 */ -PGM_CHECK_DEFAULT /* 73 */ -PGM_CHECK_DEFAULT /* 74 */ -PGM_CHECK_DEFAULT /* 75 */ -PGM_CHECK_DEFAULT /* 76 */ -PGM_CHECK_DEFAULT /* 77 */ -PGM_CHECK_DEFAULT /* 78 */ -PGM_CHECK_DEFAULT /* 79 */ -PGM_CHECK_DEFAULT /* 7a */ -PGM_CHECK_DEFAULT /* 7b */ -PGM_CHECK_DEFAULT /* 7c */ -PGM_CHECK_DEFAULT /* 7d */ -PGM_CHECK_DEFAULT /* 7e */ -PGM_CHECK_DEFAULT /* 7f */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6ccef5f29761..4e3b366589fb 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -29,8 +29,9 @@ #include <linux/random.h> #include <linux/export.h> #include <linux/init_task.h> +#include <linux/entry-common.h> +#include <linux/io.h> #include <asm/cpu_mf.h> -#include <asm/io.h> #include <asm/processor.h> #include <asm/vtimer.h> #include <asm/exec.h> @@ -43,9 +44,22 @@ #include <asm/unwind.h> #include "entry.h" -asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); +void ret_from_fork(void) asm("ret_from_fork"); -extern void kernel_thread_starter(void); +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs) +{ + void (*func)(void *arg); + + schedule_tail(prev); + + if (!user_mode(regs)) { + /* Kernel thread */ + func = (void *)regs->gprs[9]; + func((void *)regs->gprs[10]); + } + clear_pt_regs_flag(regs, PIF_SYSCALL); + syscall_exit_to_user_mode(regs); +} void flush_thread(void) { @@ -75,14 +89,28 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ save_fpu_regs(); - memcpy(dst, src, arch_task_struct_size); + *dst = *src; dst->thread.fpu.regs = dst->thread.fpu.fprs; + + /* + * Don't transfer over the runtime instrumentation or the guarded + * storage control block pointers. These fields are cleared here instead + * of in copy_thread() to avoid premature freeing of associated memory + * on fork() failure. Wait to clear the RI flag because ->stack still + * refers to the source thread. + */ + dst->thread.ri_cb = NULL; + dst->thread.gs_cb = NULL; + dst->thread.gs_bc_cb = NULL; + return 0; } -int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long new_stackp = args->stack; + unsigned long tls = args->tls; struct fake_frame { struct stack_frame sf; @@ -94,7 +122,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, /* Save access registers to new thread structure. 
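+	 * (A note on the fake frame initialized below: stack_frame.gprs[]
+	 * holds %r6-%r15, hence the gprs[11 - 6] style indices. The slots
+	 * park &frame->childregs in %r11, the new task pointer in %r12,
+	 * ret_from_fork in %r14 and the frame itself as stack pointer in
+	 * %r15, so that ret_from_fork can hand prev and regs to
+	 * __ret_from_fork() shown above.)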
*/ save_access_regs(&p->thread.acrs[0]); /* start new process with ar4 pointing to the correct address space */ - p->thread.mm_segment = get_fs(); /* Don't copy debug registers */ memset(&p->thread.per_user, 0, sizeof(p->thread.per_user)); memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); @@ -106,26 +133,26 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, p->thread.system_timer = 0; p->thread.hardirq_timer = 0; p->thread.softirq_timer = 0; + p->thread.last_break = 1; frame->sf.back_chain = 0; + frame->sf.gprs[11 - 6] = (unsigned long)&frame->childregs; + frame->sf.gprs[12 - 6] = (unsigned long)p; /* new return point is ret_from_fork */ - frame->sf.gprs[8] = (unsigned long) ret_from_fork; + frame->sf.gprs[14 - 6] = (unsigned long)ret_from_fork; /* fake return stack for resume(), don't go back to schedule */ - frame->sf.gprs[9] = (unsigned long) frame; + frame->sf.gprs[15 - 6] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); - frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; - frame->childregs.psw.addr = - (unsigned long) kernel_thread_starter; - frame->childregs.gprs[9] = new_stackp; /* function */ - frame->childregs.gprs[10] = arg; - frame->childregs.gprs[11] = (unsigned long) do_exit; + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.gprs[9] = (unsigned long)args->fn; + frame->childregs.gprs[10] = (unsigned long)args->fn_arg; frame->childregs.orig_gpr2 = -1; - + frame->childregs.last_break = 1; return 0; } frame->childregs = *current_pt_regs(); @@ -133,13 +160,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, frame->childregs.flags = 0; if (new_stackp) frame->childregs.gprs[15] = new_stackp; - - /* Don't copy runtime instrumentation info */ - p->thread.ri_cb = NULL; + /* + * Clear the runtime instrumentation flag after the above childregs + * copy. The CB pointer was already cleared in arch_dup_task_struct(). + */ frame->childregs.psw.mask &= ~PSW_MASK_RI; - /* Don't copy guarded storage control block */ - p->thread.gs_cb = NULL; - p->thread.gs_bc_cb = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { @@ -150,39 +175,27 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, p->thread.acrs[1] = (unsigned int)tls; } } + /* + * s390 stores the svc return address in arch_data when calling + * sigreturn()/restart_syscall() via vdso. 1 means no valid address + * stored. + */ + p->restart_block.arch_data = 1; return 0; } -asmlinkage void execve_tail(void) +void execve_tail(void) { current->thread.fpu.fpc = 0; asm volatile("sfpc %0" : : "d" (0)); } -/* - * fill in the FPU structure for a core dump. 
- */ -int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs) -{ - save_fpu_regs(); - fpregs->fpc = current->thread.fpu.fpc; - fpregs->pad = 0; - if (MACHINE_HAS_VX) - convert_vx_to_fp((freg_t *)&fpregs->fprs, - current->thread.fpu.vxrs); - else - memcpy(&fpregs->fprs, current->thread.fpu.fprs, - sizeof(fpregs->fprs)); - return 1; -} -EXPORT_SYMBOL(dump_fpu); - -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long ip = 0; - if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p)) + if (!task_stack_page(p)) return 0; if (!try_get_task_stack(p)) @@ -209,13 +222,13 @@ unsigned long get_wchan(struct task_struct *p) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } static inline unsigned long brk_rnd(void) { - return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; + return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT; } unsigned long arch_randomize_brk(struct mm_struct *mm) @@ -225,16 +238,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) ret = PAGE_ALIGN(mm->brk + brk_rnd()); return (ret > mm->brk) ? ret : mm->brk; } - -void set_fs_fixup(void) -{ - struct pt_regs *regs = current_pt_regs(); - static bool warned; - - set_fs(USER_DS); - if (warned) - return; - WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code); - show_registers(regs); - warned = true; -} diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 6ebc2117c66c..65c1464eea4f 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -8,9 +8,9 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/stop_machine.h> -#include <linux/cpufeature.h> #include <linux/bitops.h> #include <linux/kernel.h> +#include <linux/random.h> #include <linux/sched/mm.h> #include <linux/init.h> #include <linux/seq_file.h> @@ -23,8 +23,12 @@ #include <asm/elf.h> #include <asm/lowcore.h> #include <asm/param.h> +#include <asm/sclp.h> #include <asm/smp.h> +unsigned long __read_mostly elf_hwcap; +char elf_platform[ELF_PLATFORM_SIZE]; + struct cpu_info { unsigned int cpu_mhz_dynamic; unsigned int cpu_mhz_static; @@ -91,23 +95,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, current); } -/* - * cpu_have_feature - Test CPU features on module initialization - */ -int cpu_have_feature(unsigned int num) -{ - return elf_hwcap & (1UL << num); -} -EXPORT_SYMBOL(cpu_have_feature); - static void show_facilities(struct seq_file *m) { unsigned int bit; - long *facilities; - facilities = (long *)&S390_lowcore.stfle_fac_list; seq_puts(m, "facilities :"); - for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT) + for_each_set_bit_inv(bit, (long *)&stfle_fac_list, MAX_FACILITY_BIT) seq_printf(m, " %d", bit); seq_putc(m, '\n'); } @@ -115,15 +108,33 @@ static void show_facilities(struct seq_file *m) static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { - "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs", - "vxe2", "vxp", "sort", "dflt" - }; - static const char * const int_hwcap_str[] = { - "sie" + [HWCAP_NR_ESAN3] = "esan3", + [HWCAP_NR_ZARCH] = "zarch", + [HWCAP_NR_STFLE] = "stfle", + [HWCAP_NR_MSA] = "msa", + [HWCAP_NR_LDISP] = "ldisp", + [HWCAP_NR_EIMM] = "eimm", + [HWCAP_NR_DFP] = "dfp", + [HWCAP_NR_HPAGE] = 
"edat", + [HWCAP_NR_ETF3EH] = "etf3eh", + [HWCAP_NR_HIGH_GPRS] = "highgprs", + [HWCAP_NR_TE] = "te", + [HWCAP_NR_VXRS] = "vx", + [HWCAP_NR_VXRS_BCD] = "vxd", + [HWCAP_NR_VXRS_EXT] = "vxe", + [HWCAP_NR_GS] = "gs", + [HWCAP_NR_VXRS_EXT2] = "vxe2", + [HWCAP_NR_VXRS_PDE] = "vxp", + [HWCAP_NR_SORT] = "sort", + [HWCAP_NR_DFLT] = "dflt", + [HWCAP_NR_VXRS_PDE2] = "vxp2", + [HWCAP_NR_NNPA] = "nnpa", + [HWCAP_NR_PCI_MIO] = "pcimio", + [HWCAP_NR_SIE] = "sie", }; int i, cpu; + BUILD_BUG_ON(ARRAY_SIZE(hwcap_str) != HWCAP_NR_MAX); seq_printf(m, "vendor_id : IBM/S390\n" "# processors : %i\n" "bogomips per cpu: %lu.%02lu\n", @@ -134,9 +145,6 @@ static void show_cpu_summary(struct seq_file *m, void *v) for (i = 0; i < ARRAY_SIZE(hwcap_str); i++) if (hwcap_str[i] && (elf_hwcap & (1UL << i))) seq_printf(m, "%s ", hwcap_str[i]); - for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++) - if (int_hwcap_str[i] && (int_hwcap & (1UL << i))) - seq_printf(m, "%s ", int_hwcap_str[i]); seq_puts(m, "\n"); show_facilities(m); show_cacheinfo(m); @@ -151,10 +159,155 @@ static void show_cpu_summary(struct seq_file *m, void *v) } } +static int __init setup_hwcaps(void) +{ + /* instructions named N3, "backported" to esa-mode */ + elf_hwcap |= HWCAP_ESAN3; + + /* z/Architecture mode active */ + elf_hwcap |= HWCAP_ZARCH; + + /* store-facility-list-extended */ + if (test_facility(7)) + elf_hwcap |= HWCAP_STFLE; + + /* message-security assist */ + if (test_facility(17)) + elf_hwcap |= HWCAP_MSA; + + /* long-displacement */ + if (test_facility(19)) + elf_hwcap |= HWCAP_LDISP; + + /* extended-immediate */ + elf_hwcap |= HWCAP_EIMM; + + /* extended-translation facility 3 enhancement */ + if (test_facility(22) && test_facility(30)) + elf_hwcap |= HWCAP_ETF3EH; + + /* decimal floating point & perform floating point operation */ + if (test_facility(42) && test_facility(44)) + elf_hwcap |= HWCAP_DFP; + + /* huge page support */ + if (MACHINE_HAS_EDAT1) + elf_hwcap |= HWCAP_HPAGE; + + /* 64-bit register support for 31-bit processes */ + elf_hwcap |= HWCAP_HIGH_GPRS; + + /* transactional execution */ + if (MACHINE_HAS_TE) + elf_hwcap |= HWCAP_TE; + + /* vector */ + if (test_facility(129)) { + elf_hwcap |= HWCAP_VXRS; + if (test_facility(134)) + elf_hwcap |= HWCAP_VXRS_BCD; + if (test_facility(135)) + elf_hwcap |= HWCAP_VXRS_EXT; + if (test_facility(148)) + elf_hwcap |= HWCAP_VXRS_EXT2; + if (test_facility(152)) + elf_hwcap |= HWCAP_VXRS_PDE; + if (test_facility(192)) + elf_hwcap |= HWCAP_VXRS_PDE2; + } + + if (test_facility(150)) + elf_hwcap |= HWCAP_SORT; + + if (test_facility(151)) + elf_hwcap |= HWCAP_DFLT; + + if (test_facility(165)) + elf_hwcap |= HWCAP_NNPA; + + /* guarded storage */ + if (MACHINE_HAS_GS) + elf_hwcap |= HWCAP_GS; + + if (MACHINE_HAS_PCI_MIO) + elf_hwcap |= HWCAP_PCI_MIO; + + /* virtualization support */ + if (sclp.has_sief2) + elf_hwcap |= HWCAP_SIE; + + return 0; +} +arch_initcall(setup_hwcaps); + +static int __init setup_elf_platform(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + add_device_randomness(&cpu_id, sizeof(cpu_id)); + switch (cpu_id.machine) { + default: /* Use "z10" as default. 
*/ + strcpy(elf_platform, "z10"); + break; + case 0x2817: + case 0x2818: + strcpy(elf_platform, "z196"); + break; + case 0x2827: + case 0x2828: + strcpy(elf_platform, "zEC12"); + break; + case 0x2964: + case 0x2965: + strcpy(elf_platform, "z13"); + break; + case 0x3906: + case 0x3907: + strcpy(elf_platform, "z14"); + break; + case 0x8561: + case 0x8562: + strcpy(elf_platform, "z15"); + break; + case 0x3931: + case 0x3932: + strcpy(elf_platform, "z16"); + break; + } + return 0; +} +arch_initcall(setup_elf_platform); + +static void show_cpu_topology(struct seq_file *m, unsigned long n) +{ +#ifdef CONFIG_SCHED_TOPOLOGY + seq_printf(m, "physical id : %d\n", topology_physical_package_id(n)); + seq_printf(m, "core id : %d\n", topology_core_id(n)); + seq_printf(m, "book id : %d\n", topology_book_id(n)); + seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); + seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); + seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n)); + seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n))); + seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n)); +#endif /* CONFIG_SCHED_TOPOLOGY */ +} + +static void show_cpu_ids(struct seq_file *m, unsigned long n) +{ + struct cpuid *id = &per_cpu(cpu_info.cpu_id, n); + + seq_printf(m, "version : %02X\n", id->version); + seq_printf(m, "identification : %06X\n", id->ident); + seq_printf(m, "machine : %04X\n", id->machine); +} + static void show_cpu_mhz(struct seq_file *m, unsigned long n) { struct cpu_info *c = per_cpu_ptr(&cpu_info, n); + if (!machine_has_cpu_mhz) + return; seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic); seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static); } @@ -165,12 +318,13 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n) static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; + unsigned long first = cpumask_first(cpu_online_mask); - if (!n) + if (n == first) show_cpu_summary(m, v); - if (!machine_has_cpu_mhz) - return 0; seq_printf(m, "\ncpu number : %ld\n", n); + show_cpu_topology(m, n); + show_cpu_ids(m, n); show_cpu_mhz(m, n); return 0; } @@ -179,12 +333,14 @@ static inline void *c_update(loff_t *pos) { if (*pos) *pos = cpumask_next(*pos - 1, cpu_online_mask); + else + *pos = cpumask_first(cpu_online_mask); return *pos < nr_cpu_ids ? 
(void *)*pos + 1 : NULL; } static void *c_start(struct seq_file *m, loff_t *pos) { - get_online_cpus(); + cpus_read_lock(); return c_update(pos); } @@ -196,7 +352,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { - put_online_cpus(); + cpus_read_unlock(); } const struct seq_operations cpuinfo_op = { @@ -205,21 +361,3 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -int s390_isolate_bp(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp); - -int s390_isolate_bp_guest(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP_GUEST); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp_guest); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 58faa12542a1..f1897a8bb221 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -7,6 +7,7 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ +#include "asm/ptrace.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> @@ -20,18 +21,16 @@ #include <linux/signal.h> #include <linux/elf.h> #include <linux/regset.h> -#include <linux/tracehook.h> #include <linux/seccomp.h> #include <linux/compat.h> #include <trace/syscall.h> #include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/switch_to.h> #include <asm/runtime_instr.h> #include <asm/facility.h> +#include <asm/fpu/api.h> #include "entry.h" @@ -39,20 +38,24 @@ #include "compat_ptrace.h" #endif -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); struct thread_struct *thread = &task->thread; - struct per_regs old, new; union ctlreg0 cr0_old, cr0_new; union ctlreg2 cr2_old, cr2_new; int cr0_changed, cr2_changed; - - __ctl_store(cr0_old.val, 0, 0); - __ctl_store(cr2_old.val, 2, 2); + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } old, new; + + local_ctl_store(0, &cr0_old.reg); + local_ctl_store(2, &cr2_old.reg); cr0_new = cr0_old; cr2_new = cr2_old; /* Take care of the enable/disable of transactional execution. */ @@ -80,38 +83,38 @@ void update_cr_regs(struct task_struct *task) cr0_changed = cr0_new.val != cr0_old.val; cr2_changed = cr2_new.val != cr2_old.val; if (cr0_changed) - __ctl_load(cr0_new.val, 0, 0); + local_ctl_load(0, &cr0_new.reg); if (cr2_changed) - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); /* Copy user specified PER registers */ - new.control = thread->per_user.control; - new.start = thread->per_user.start; - new.end = thread->per_user.end; + new.control.val = thread->per_user.control; + new.start.val = thread->per_user.start; + new.end.val = thread->per_user.end; /* merge TIF_SINGLE_STEP into user specified PER registers. 
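	 * PER (Program Event Recording) is driven by control registers 9-11:
	 * CR9 holds the event mask, CR10 and CR11 the start and end of the
	 * monitored address range. Single-stepping is emulated below by
	 * requesting instruction-fetch events over the whole address space,
	 * in effect: control |= PER_EVENT_IFETCH; start = 0; end = -1UL;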
*/ if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) || test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) { if (test_tsk_thread_flag(task, TIF_BLOCK_STEP)) - new.control |= PER_EVENT_BRANCH; + new.control.val |= PER_EVENT_BRANCH; else - new.control |= PER_EVENT_IFETCH; - new.control |= PER_CONTROL_SUSPENSION; - new.control |= PER_EVENT_TRANSACTION_END; + new.control.val |= PER_EVENT_IFETCH; + new.control.val |= PER_CONTROL_SUSPENSION; + new.control.val |= PER_EVENT_TRANSACTION_END; if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) - new.control |= PER_EVENT_IFETCH; - new.start = 0; - new.end = -1UL; + new.control.val |= PER_EVENT_IFETCH; + new.start.val = 0; + new.end.val = -1UL; } /* Take care of the PER enablement bit in the PSW. */ - if (!(new.control & PER_EVENT_MASK)) { + if (!(new.control.val & PER_EVENT_MASK)) { regs->psw.mask &= ~PSW_MASK_PER; return; } regs->psw.mask |= PSW_MASK_PER; - __ctl_store(old, 9, 11); + __local_ctl_store(9, 11, old.regs); if (memcmp(&new, &old, sizeof(struct per_regs)) != 0) - __ctl_load(new, 9, 11); + __local_ctl_load(9, 11, new.regs); } void user_enable_single_step(struct task_struct *task) @@ -142,7 +145,7 @@ void ptrace_disable(struct task_struct *task) memset(&task->thread.per_user, 0, sizeof(task->thread.per_user)); memset(&task->thread.per_event, 0, sizeof(task->thread.per_event)); clear_tsk_thread_flag(task, TIF_SINGLE_STEP); - clear_pt_regs_flag(task_pt_regs(task), PIF_PER_TRAP); + clear_tsk_thread_flag(task, TIF_PER_TRAP); task->thread.per_flags = 0; } @@ -151,38 +154,36 @@ void ptrace_disable(struct task_struct *task) static inline unsigned long __peek_user_per(struct task_struct *child, addr_t addr) { - struct per_struct_kernel *dummy = NULL; - - if (addr == (addr_t) &dummy->cr9) + if (addr == offsetof(struct per_struct_kernel, cr9)) /* Control bits of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy->cr10) + else if (addr == offsetof(struct per_struct_kernel, cr10)) /* Start address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy->cr11) + else if (addr == offsetof(struct per_struct_kernel, cr11)) /* End address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? -1UL : child->thread.per_user.end; - else if (addr == (addr_t) &dummy->bits) + else if (addr == offsetof(struct per_struct_kernel, bits)) /* Single-step bit. */ return test_thread_flag(TIF_SINGLE_STEP) ? (1UL << (BITS_PER_LONG - 1)) : 0; - else if (addr == (addr_t) &dummy->starting_addr) + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) /* Start address of the user specified per set. */ return child->thread.per_user.start; - else if (addr == (addr_t) &dummy->ending_addr) + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) /* End address of the user specified per set. 
*/ return child->thread.per_user.end; - else if (addr == (addr_t) &dummy->perc_atmid) + else if (addr == offsetof(struct per_struct_kernel, perc_atmid)) /* PER code, ATMID and AI of the last PER trap */ return (unsigned long) child->thread.per_event.cause << (BITS_PER_LONG - 16); - else if (addr == (addr_t) &dummy->address) + else if (addr == offsetof(struct per_struct_kernel, address)) /* Address of the last PER trap */ return child->thread.per_event.address; - else if (addr == (addr_t) &dummy->access_id) + else if (addr == offsetof(struct per_struct_kernel, access_id)) /* Access id of the last PER trap */ return (unsigned long) child->thread.per_event.paid << (BITS_PER_LONG - 8); @@ -200,73 +201,72 @@ static inline unsigned long __peek_user_per(struct task_struct *child, */ static unsigned long __peek_user(struct task_struct *child, addr_t addr) { - struct user *dummy = NULL; addr_t offset, tmp; - if (addr < (addr_t) &dummy->regs.acrs) { + if (addr < offsetof(struct user, regs.acrs)) { /* * psw and gprs are stored on the stack */ tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr); - if (addr == (addr_t) &dummy->regs.psw.mask) { + if (addr == offsetof(struct user, regs.psw.mask)) { /* Return a clean psw mask. */ tmp &= PSW_MASK_USER | PSW_MASK_RI; tmp |= PSW_USER_BITS; } - } else if (addr < (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy->regs.acrs; + offset = addr - offsetof(struct user, regs.acrs); /* * Very special case: old & broken 64 bit gdb reading * from acrs[15]. Result is a 64 bit value. Read the * 32 bit acrs[15] value and shift it by 32. Sick... */ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) tmp = ((unsigned long) child->thread.acrs[15]) << 32; else tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset); - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ tmp = (addr_t) task_pt_regs(child)->orig_gpr2; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent reads of padding hole between * orig_gpr2 and fp_regs on s390. */ tmp = 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ tmp = child->thread.fpu.fpc; tmp <<= BITS_PER_LONG - 32; - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + if (cpu_has_vx()) tmp = *(addr_t *) ((addr_t) child->thread.fpu.vxrs + 2*offset); else tmp = *(addr_t *) ((addr_t) child->thread.fpu.fprs + offset); - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. 
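+	 * (Throughout this file the old '(addr_t) &dummy->field' idiom,
+	 * which computed field offsets by arithmetic on a NULL dummy
+	 * pointer, is replaced by offsetof(); both yield the same offset,
+	 * but offsetof() avoids the undefined behavior.)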
+	 */
-	addr -= (addr_t) &dummy->regs.per_info;
+	addr -= offsetof(struct user, regs.per_info);
	tmp = __peek_user_per(child, addr);
	} else
@@ -285,8 +285,8 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
	 * an alignment of 4. Programmers from hell...
	 */
	mask = __ADDR_MASK;
-	if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
-	    addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+	if (addr >= offsetof(struct user, regs.acrs) &&
+	    addr < offsetof(struct user, regs.orig_gpr2))
		mask = 3;
	if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
		return -EIO;
@@ -298,8 +298,6 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
static inline void __poke_user_per(struct task_struct *child,
				   addr_t addr, addr_t data)
{
-	struct per_struct_kernel *dummy = NULL;
-
	/*
	 * There are only three fields in the per_info struct that the
	 * debugger user can write to.
@@ -312,14 +310,14 @@ static inline void __poke_user_per(struct task_struct *child,
	 * addresses are used only if single stepping is not in effect.
	 * Writes to any other field in per_info are ignored.
	 */
-	if (addr == (addr_t) &dummy->cr9)
+	if (addr == offsetof(struct per_struct_kernel, cr9))
		/* PER event mask of the user specified per set. */
		child->thread.per_user.control =
			data & (PER_EVENT_MASK | PER_CONTROL_MASK);
-	else if (addr == (addr_t) &dummy->starting_addr)
+	else if (addr == offsetof(struct per_struct_kernel, starting_addr))
		/* Starting address of the user specified per set. */
		child->thread.per_user.start = data;
-	else if (addr == (addr_t) &dummy->ending_addr)
+	else if (addr == offsetof(struct per_struct_kernel, ending_addr))
		/* Ending address of the user specified per set. */
		child->thread.per_user.end = data;
}
@@ -332,14 +330,15 @@ static inline void __poke_user_per(struct task_struct *child,
 */
static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
{
-	struct user *dummy = NULL;
	addr_t offset;
-	if (addr < (addr_t) &dummy->regs.acrs) {
+
+	if (addr < offsetof(struct user, regs.acrs)) {
+		struct pt_regs *regs = task_pt_regs(child);
		/*
		 * psw and gprs are stored on the stack
		 */
-		if (addr == (addr_t) &dummy->regs.psw.mask) {
+		if (addr == offsetof(struct user, regs.psw.mask)) {
			unsigned long mask = PSW_MASK_USER;

			mask |= is_ri_task(child) ? PSW_MASK_RI : 0;
@@ -353,64 +352,69 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
				/* Invalid addressing mode bits */
				return -EINVAL;
			}
-		*(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr) = data;
-	} else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) {
+		if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+		    addr == offsetof(struct user, regs.gprs[2])) {
+			struct pt_regs *regs = task_pt_regs(child);
+
+			regs->int_code = 0x20000 | (data & 0xffff);
+		}
+		*(addr_t *)((addr_t) &regs->psw + addr) = data;
+	} else if (addr < offsetof(struct user, regs.orig_gpr2)) {
		/*
		 * access registers are stored in the thread structure
		 */
-		offset = addr - (addr_t) &dummy->regs.acrs;
+		offset = addr - offsetof(struct user, regs.acrs);
		/*
		 * Very special case: old & broken 64 bit gdb writing
		 * to acrs[15] with a 64 bit value. Ignore the lower
		 * half of the value and write the upper 32 bit to
		 * acrs[15]. Sick...
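		 * Worked example: poking the value 0x0000000100000000 at
		 * acrs[15] stores just 0x1, since only data >> 32 is kept.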
*/ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) child->thread.acrs[15] = (unsigned int) (data >> 32); else *(addr_t *)((addr_t) &child->thread.acrs + offset) = data; - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ task_pt_regs(child)->orig_gpr2 = data; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent writes of padding hole between * orig_gpr2 and fp_regs on s390. */ return 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ - if ((unsigned int) data != 0 || - test_fp_ctl(data >> (BITS_PER_LONG - 32))) + if ((unsigned int)data != 0) return -EINVAL; child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + if (cpu_has_vx()) *(addr_t *)((addr_t) child->thread.fpu.vxrs + 2*offset) = data; else *(addr_t *)((addr_t) child->thread.fpu.fprs + offset) = data; - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy->regs.per_info; + addr -= offsetof(struct user, regs.per_info); __poke_user_per(child, addr, data); } @@ -427,8 +431,8 @@ static int poke_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. Programmers from hell indeed... */ mask = __ADDR_MASK; - if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && - addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) mask = 3; if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) return -EIO; @@ -477,9 +481,7 @@ long arch_ptrace(struct task_struct *child, long request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned long __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned long __user *)data); case PTRACE_ENABLE_TE: if (!MACHINE_HAS_TE) return -EIO; @@ -536,37 +538,35 @@ long arch_ptrace(struct task_struct *child, long request, static inline __u32 __peek_user_per_compat(struct task_struct *child, addr_t addr) { - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) /* Control bits of the active per set. */ return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy32->cr10) + else if (addr == offsetof(struct compat_per_struct_kernel, cr10)) /* Start address of the active per set. */ return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy32->cr11) + else if (addr == offsetof(struct compat_per_struct_kernel, cr11)) /* End address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? 
PSW32_ADDR_INSN : child->thread.per_user.end;
-	else if (addr == (addr_t) &dummy32->bits)
+	else if (addr == offsetof(struct compat_per_struct_kernel, bits))
		/* Single-step bit. */
		return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
			0x80000000 : 0;
-	else if (addr == (addr_t) &dummy32->starting_addr)
+	else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
		/* Start address of the user specified per set. */
		return (__u32) child->thread.per_user.start;
-	else if (addr == (addr_t) &dummy32->ending_addr)
+	else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
		/* End address of the user specified per set. */
		return (__u32) child->thread.per_user.end;
-	else if (addr == (addr_t) &dummy32->perc_atmid)
+	else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid))
		/* PER code, ATMID and AI of the last PER trap */
		return (__u32) child->thread.per_event.cause << 16;
-	else if (addr == (addr_t) &dummy32->address)
+	else if (addr == offsetof(struct compat_per_struct_kernel, address))
		/* Address of the last PER trap */
		return (__u32) child->thread.per_event.address;
-	else if (addr == (addr_t) &dummy32->access_id)
+	else if (addr == offsetof(struct compat_per_struct_kernel, access_id))
		/* Access id of the last PER trap */
		return (__u32) child->thread.per_event.paid << 24;
	return 0;
@@ -577,21 +577,20 @@
 */
static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
{
-	struct compat_user *dummy32 = NULL;
	addr_t offset;
	__u32 tmp;

-	if (addr < (addr_t) &dummy32->regs.acrs) {
+	if (addr < offsetof(struct compat_user, regs.acrs)) {
		struct pt_regs *regs = task_pt_regs(child);
		/*
		 * psw and gprs are stored on the stack
		 */
-		if (addr == (addr_t) &dummy32->regs.psw.mask) {
+		if (addr == offsetof(struct compat_user, regs.psw.mask)) {
			/* Fake a 31 bit psw mask. */
			tmp = (__u32)(regs->psw.mask >> 32);
			tmp &= PSW32_MASK_USER | PSW32_MASK_RI;
			tmp |= PSW32_USER_BITS;
-		} else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+		} else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
			/* Fake a 31 bit psw address. */
			tmp = (__u32) regs->psw.addr |
				(__u32)(regs->psw.mask & PSW_MASK_BA);
@@ -599,50 +598,50 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
			/* gpr 0-15 */
			tmp = *(__u32 *)((addr_t) &regs->psw + addr*2 + 4);
		}
-	} else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+	} else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
		/*
		 * access registers are stored in the thread structure
		 */
-		offset = addr - (addr_t) &dummy32->regs.acrs;
+		offset = addr - offsetof(struct compat_user, regs.acrs);
		tmp = *(__u32*)((addr_t) &child->thread.acrs + offset);
-	} else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+	} else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
		/*
		 * orig_gpr2 is stored on the kernel stack
		 */
		tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4);
-	} else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+	} else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
		/*
		 * prevent reads of padding hole between
		 * orig_gpr2 and fp_regs on s390.
		 */
		tmp = 0;
-	} else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+	} else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
		/*
		 * floating point control reg.
is in the thread structure */ tmp = child->thread.fpu.fpc; - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); + if (cpu_has_vx()) tmp = *(__u32 *) ((addr_t) child->thread.fpu.vxrs + 2*offset); else tmp = *(__u32 *) ((addr_t) child->thread.fpu.fprs + offset); - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy32->regs.per_info; + addr -= offsetof(struct compat_user, regs.per_info); tmp = __peek_user_per_compat(child, addr); } else @@ -669,16 +668,14 @@ static int peek_user_compat(struct task_struct *child, static inline void __poke_user_per_compat(struct task_struct *child, addr_t addr, __u32 data) { - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) /* PER event mask of the user specified per set. */ child->thread.per_user.control = data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == (addr_t) &dummy32->starting_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) /* Starting address of the user specified per set. */ child->thread.per_user.start = data; - else if (addr == (addr_t) &dummy32->ending_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) /* Ending address of the user specified per set. */ child->thread.per_user.end = data; } @@ -689,16 +686,15 @@ static inline void __poke_user_per_compat(struct task_struct *child, static int __poke_user_compat(struct task_struct *child, addr_t addr, addr_t data) { - struct compat_user *dummy32 = NULL; __u32 tmp = (__u32) data; addr_t offset; - if (addr < (addr_t) &dummy32->regs.acrs) { + if (addr < offsetof(struct compat_user, regs.acrs)) { struct pt_regs *regs = task_pt_regs(child); /* * psw, gprs, acrs and orig_gpr2 are stored on the stack */ - if (addr == (addr_t) &dummy32->regs.psw.mask) { + if (addr == offsetof(struct compat_user, regs.psw.mask)) { __u32 mask = PSW32_MASK_USER; mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; @@ -712,62 +708,66 @@ static int __poke_user_compat(struct task_struct *child, regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | (regs->psw.mask & PSW_MASK_BA) | (__u64)(tmp & mask) << 32; - } else if (addr == (addr_t) &dummy32->regs.psw.addr) { + } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { /* Build a 64 bit psw address from 31 bit address. */ regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; /* Transfer 31 bit amode bit to psw mask. 
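As an aside on the 31-bit PSW handling in __peek_user_compat()/__poke_user_compat() above: the addressing-mode bit and the 31-bit instruction address share one 32-bit word, and the two directions simply split and recombine it. A minimal stand-alone sketch in plain C; the two mask values mirror PSW32_ADDR_AMODE and PSW32_ADDR_INSN from the s390 uapi headers, everything else is invented for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ADDR_AMODE 0x80000000UL /* stand-in for PSW32_ADDR_AMODE */
#define ADDR_INSN  0x7fffffffUL /* stand-in for PSW32_ADDR_INSN */

int main(void)
{
	uint32_t user_view = 0x80123456; /* amode bit set, address 0x123456 */
	/* poke direction: split the user word into 64-bit PSW state */
	uint64_t psw_addr = user_view & ADDR_INSN;  /* instruction address */
	uint64_t ba_bit   = user_view & ADDR_AMODE; /* kept in the PSW mask */
	/* peek direction: fake the 31-bit view back from the 64-bit state */
	uint32_t faked = (uint32_t)psw_addr | (uint32_t)ba_bit;

	assert(faked == user_view); /* the round trip is lossless */
	printf("addr=%#lx amode=%d\n", (unsigned long)psw_addr, ba_bit != 0);
	return 0;
}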
*/ regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | (__u64)(tmp & PSW32_ADDR_AMODE); } else { + if (test_pt_regs_flag(regs, PIF_SYSCALL) && + addr == offsetof(struct compat_user, regs.gprs[2])) { + struct pt_regs *regs = task_pt_regs(child); + + regs->int_code = 0x20000 | (data & 0xffff); + } /* gpr 0-15 */ *(__u32*)((addr_t) &regs->psw + addr*2 + 4) = tmp; - } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy32->regs.acrs; + offset = addr - offsetof(struct compat_user, regs.acrs); *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; - } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; - } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { /* * prevent writes of padding hole between * orig_gpr2 and fp_regs on s390. */ return 0; - } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ - if (test_fp_ctl(tmp)) - return -EINVAL; child->thread.fpu.fpc = data; - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) *(__u32 *)((addr_t) child->thread.fpu.vxrs + 2*offset) = tmp; else *(__u32 *)((addr_t) child->thread.fpu.fprs + offset) = tmp; - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy32->regs.per_info; + addr -= offsetof(struct compat_user, regs.per_info); __poke_user_per_compat(child, addr, data); } @@ -827,92 +827,26 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned int __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned int __user *)data); } return compat_ptrace_request(child, request, addr, data); } #endif -asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) -{ - unsigned long mask = -1UL; - - /* - * The sysc_tracesys code in entry.S stored the system - * call number to gprs[2]. - */ - if (test_thread_flag(TIF_SYSCALL_TRACE) && - (tracehook_report_syscall_entry(regs) || - regs->gprs[2] >= NR_syscalls)) { - /* - * Tracing decided this syscall should not happen or the - * debugger stored an invalid system call number. Skip - * the system call and the system call restart handling. - */ - clear_pt_regs_flag(regs, PIF_SYSCALL); - return -1; - } - - /* Do the secure computing check after ptrace. */ - if (secure_computing()) { - /* seccomp failures shouldn't expose any additional code.
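The conversion running through all of these ptrace hunks swaps the old `(addr_t) &((struct user *) NULL)->member` trick for `offsetof()`. Both yield the member's byte offset within the structure, but only the latter avoids formally undefined arithmetic on a null pointer. A stand-alone comparison with a toy struct (hypothetical layout, for illustration only):

#include <stddef.h>
#include <stdio.h>

struct toy_user {
	long gprs[16];
	unsigned int acrs[16];
	long orig_gpr2;
};

int main(void)
{
	/* old idiom: fabricate the offset from a NULL pointer */
	size_t old_way = (size_t)&((struct toy_user *)NULL)->orig_gpr2;
	/* new idiom: let the compiler compute it directly */
	size_t new_way = offsetof(struct toy_user, orig_gpr2);

	printf("%zu == %zu\n", old_way, new_way);
	return 0;
}

The generated constant is identical either way; the offsetof() form is simply the well-defined spelling of the same thing.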
*/ - return -1; - } - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->gprs[2]); - - if (is_compat_task()) - mask = 0xffffffff; - - audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask, - regs->gprs[3] &mask, regs->gprs[4] &mask, - regs->gprs[5] &mask); - - return regs->gprs[2]; -} - -asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) -{ - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->gprs[2]); - - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); -} - /* * user_regset definitions. */ static int s390_regs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { + unsigned pos; if (target == current) save_access_regs(target->thread.acrs); - if (kbuf) { - unsigned long *k = kbuf; - while (count > 0) { - *k++ = __peek_user(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - unsigned long __user *u = ubuf; - while (count > 0) { - if (__put_user(__peek_user(target, pos), u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + for (pos = 0; pos < sizeof(s390_regs); pos += sizeof(long)) + membuf_store(&to, __peek_user(target, pos)); return 0; } @@ -953,8 +887,8 @@ static int s390_regs_set(struct task_struct *target, } static int s390_fpregs_get(struct task_struct *target, - const struct user_regset *regset, unsigned int pos, - unsigned int count, void *kbuf, void __user *ubuf) + const struct user_regset *regset, + struct membuf to) { _s390_fp_regs fp_regs; @@ -964,8 +898,7 @@ static int s390_fpregs_get(struct task_struct *target, fp_regs.fpc = target->thread.fpu.fpc; fpregs_store(&fp_regs, &target->thread.fpu); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fp_regs, 0, -1); + return membuf_write(&to, &fp_regs, sizeof(fp_regs)); } static int s390_fpregs_set(struct task_struct *target, @@ -979,19 +912,18 @@ static int s390_fpregs_set(struct task_struct *target, if (target == current) save_fpu_regs(); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp(fprs, target->thread.fpu.vxrs); else memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs)); - /* If setting FPC, must validate it first. 
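The regset conversion visible in s390_regs_get() above replaces the kbuf/ubuf pointer pair with a single struct membuf cursor, so the per-word copy loop no longer needs two code paths. A toy analogue of that interface, simplified and not the kernel's exact definition (the real helpers live in linux/regset.h):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct membuf {
	void *p;     /* write cursor */
	size_t left; /* space remaining */
};

static int membuf_write(struct membuf *to, const void *src, size_t n)
{
	if (n > to->left)
		n = to->left; /* silently stop at the end of the buffer */
	memcpy(to->p, src, n);
	to->p = (char *)to->p + n;
	to->left -= n;
	return 0;
}

#define membuf_store(to, v) membuf_write(to, &(v), sizeof(v))

int main(void)
{
	long regs[4] = { 1, 2, 3, 4 }; /* pretend register block */
	char out[sizeof(regs)];
	struct membuf to = { out, sizeof(out) };
	size_t i;

	for (i = 0; i < 4; i++) /* cf. the __peek_user() loop above */
		membuf_store(&to, regs[i]);
	printf("%zu bytes left\n", to.left);
	return 0;
}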
*/ if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, 0, offsetof(s390_fp_regs, fprs)); if (rc) return rc; - if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) + if (ufpc[1] != 0) return -EINVAL; target->thread.fpu.fpc = ufpc[0]; } @@ -1002,7 +934,7 @@ static int s390_fpregs_set(struct task_struct *target, if (rc) return rc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx(target->thread.fpu.vxrs, fprs); else memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs)); @@ -1012,20 +944,9 @@ static int s390_fpregs_set(struct task_struct *target, static int s390_last_break_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (count > 0) { - if (kbuf) { - unsigned long *k = kbuf; - *k = target->thread.last_break; - } else { - unsigned long __user *u = ubuf; - if (__put_user(target->thread.last_break, u)) - return -EFAULT; - } - } - return 0; + return membuf_store(&to, target->thread.last_break); } static int s390_last_break_set(struct task_struct *target, @@ -1038,16 +959,15 @@ static int s390_last_break_set(struct task_struct *target, static int s390_tdb_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct pt_regs *regs = task_pt_regs(target); - unsigned char *data; + size_t size; if (!(regs->int_code & 0x200)) return -ENODATA; - data = target->thread.trap_tdb; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, 256); + size = sizeof(target->thread.trap_tdb.data); + return membuf_write(&to, target->thread.trap_tdb.data, size); } static int s390_tdb_set(struct task_struct *target, @@ -1060,19 +980,18 @@ static int s390_tdb_set(struct task_struct *target, static int s390_vxrs_low_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { __u64 vxrs[__NUM_VXRS_LOW]; int i; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); + vxrs[i] = target->thread.fpu.vxrs[i].low; + return membuf_write(&to, vxrs, sizeof(vxrs)); } static int s390_vxrs_low_set(struct task_struct *target, @@ -1083,36 +1002,32 @@ static int s390_vxrs_low_set(struct task_struct *target, __u64 vxrs[__NUM_VXRS_LOW]; int i, rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.fpu.vxrs[i].low; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); if (rc == 0) for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i]; + target->thread.fpu.vxrs[i].low = vxrs[i]; return rc; } static int s390_vxrs_high_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - __vector128 vxrs[__NUM_VXRS_HIGH]; - - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); - memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs)); - - 
return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); + return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW, + __NUM_VXRS_HIGH * sizeof(__vector128)); } static int s390_vxrs_high_set(struct task_struct *target, @@ -1122,7 +1037,7 @@ static int s390_vxrs_high_set(struct task_struct *target, { int rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) save_fpu_regs(); @@ -1134,12 +1049,9 @@ static int s390_vxrs_high_set(struct task_struct *target, static int s390_system_call_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - unsigned int *data = &target->thread.system_call; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(unsigned int)); + return membuf_store(&to, target->thread.system_call); } static int s390_system_call_set(struct task_struct *target, @@ -1154,8 +1066,7 @@ static int s390_system_call_set(struct task_struct *target, static int s390_gs_cb_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct gs_cb *data = target->thread.gs_cb; @@ -1165,8 +1076,7 @@ static int s390_gs_cb_get(struct task_struct *target, return -ENODATA; if (target == current) save_gs_cb(data); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct gs_cb)); + return membuf_write(&to, data, sizeof(struct gs_cb)); } static int s390_gs_cb_set(struct task_struct *target, @@ -1201,7 +1111,7 @@ static int s390_gs_cb_set(struct task_struct *target, target->thread.gs_cb = data; *target->thread.gs_cb = gs_cb; if (target == current) { - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); restore_gs_cb(target->thread.gs_cb); } preempt_enable(); @@ -1210,8 +1120,7 @@ static int s390_gs_cb_set(struct task_struct *target, static int s390_gs_bc_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct gs_cb *data = target->thread.gs_bc_cb; @@ -1219,8 +1128,7 @@ static int s390_gs_bc_get(struct task_struct *target, return -ENODEV; if (!data) return -ENODATA; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct gs_cb)); + return membuf_write(&to, data, sizeof(struct gs_cb)); } static int s390_gs_bc_set(struct task_struct *target, @@ -1256,7 +1164,6 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb) cb->pc == 1 && cb->qc == 0 && cb->reserved2 == 0 && - cb->key == PAGE_DEFAULT_KEY && cb->reserved3 == 0 && cb->reserved4 == 0 && cb->reserved5 == 0 && @@ -1271,8 +1178,7 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb) static int s390_runtime_instr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct runtime_instr_cb *data = target->thread.ri_cb; @@ -1281,8 +1187,7 @@ static int s390_runtime_instr_get(struct task_struct *target, if (!data) return -ENODATA; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct runtime_instr_cb)); + return membuf_write(&to, data, sizeof(struct runtime_instr_cb)); } static int s390_runtime_instr_set(struct task_struct *target, @@ -1320,7 +1225,11 @@ static int s390_runtime_instr_set(struct task_struct *target, kfree(data); return -EINVAL; } - + /* + * 
Override access key in any case, since user space should + * not be able to set it, nor should it care about it. + */ + ri_cb.key = PAGE_DEFAULT_KEY >> 4; preempt_disable(); if (!target->thread.ri_cb) target->thread.ri_cb = data; @@ -1338,7 +1247,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(s390_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = s390_regs_get, + .regset_get = s390_regs_get, .set = s390_regs_set, }, { @@ -1346,7 +1255,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(s390_fp_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = s390_fpregs_get, + .regset_get = s390_fpregs_get, .set = s390_fpregs_set, }, { @@ -1354,7 +1263,7 @@ static const struct user_regset s390_regsets[] = { .n = 1, .size = sizeof(unsigned int), .align = sizeof(unsigned int), - .get = s390_system_call_get, + .regset_get = s390_system_call_get, .set = s390_system_call_set, }, { @@ -1362,7 +1271,7 @@ static const struct user_regset s390_regsets[] = { .n = 1, .size = sizeof(long), .align = sizeof(long), - .get = s390_last_break_get, + .regset_get = s390_last_break_get, .set = s390_last_break_set, }, { @@ -1370,7 +1279,7 @@ static const struct user_regset s390_regsets[] = { .n = 1, .size = 256, .align = 1, - .get = s390_tdb_get, + .regset_get = s390_tdb_get, .set = s390_tdb_set, }, { @@ -1378,7 +1287,7 @@ static const struct user_regset s390_regsets[] = { .n = __NUM_VXRS_LOW, .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_vxrs_low_get, + .regset_get = s390_vxrs_low_get, .set = s390_vxrs_low_set, }, { @@ -1386,7 +1295,7 @@ static const struct user_regset s390_regsets[] = { .n = __NUM_VXRS_HIGH, .size = sizeof(__vector128), .align = sizeof(__vector128), - .get = s390_vxrs_high_get, + .regset_get = s390_vxrs_high_get, .set = s390_vxrs_high_set, }, { @@ -1394,7 +1303,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_cb_get, + .regset_get = s390_gs_cb_get, .set = s390_gs_cb_set, }, { @@ -1402,7 +1311,7 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_bc_get, + .regset_get = s390_gs_bc_get, .set = s390_gs_bc_set, }, { @@ -1410,13 +1319,13 @@ static const struct user_regset s390_regsets[] = { .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_runtime_instr_get, + .regset_get = s390_runtime_instr_get, .set = s390_runtime_instr_set, }, }; static const struct user_regset_view user_s390_view = { - .name = UTS_MACHINE, + .name = "s390x", .e_machine = EM_S390, .regsets = s390_regsets, .n = ARRAY_SIZE(s390_regsets) @@ -1425,28 +1334,15 @@ static const struct user_regset_view user_s390_view = { #ifdef CONFIG_COMPAT static int s390_compat_regs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { + unsigned n; + if (target == current) save_access_regs(target->thread.acrs); - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count > 0) { - *k++ = __peek_user_compat(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count > 0) { - if (__put_user(__peek_user_compat(target, pos), u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + for (n = 0; n < 
sizeof(s390_compat_regs); n += sizeof(compat_ulong_t)) + membuf_store(&to, __peek_user_compat(target, n)); return 0; } @@ -1488,29 +1384,14 @@ static int s390_compat_regs_set(struct task_struct *target, static int s390_compat_regs_high_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { compat_ulong_t *gprs_high; + int i; - gprs_high = (compat_ulong_t *) - &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count > 0) { - *k++ = *gprs_high; - gprs_high += 2; - count -= sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count > 0) { - if (__put_user(*gprs_high, u++)) - return -EFAULT; - gprs_high += 2; - count -= sizeof(*u); - } - } + gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs; + for (i = 0; i < NUM_GPRS; i++, gprs_high += 2) + membuf_store(&to, *gprs_high); return 0; } @@ -1549,23 +1430,11 @@ static int s390_compat_regs_high_set(struct task_struct *target, static int s390_compat_last_break_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - compat_ulong_t last_break; + compat_ulong_t last_break = target->thread.last_break; - if (count > 0) { - last_break = target->thread.last_break; - if (kbuf) { - unsigned long *k = kbuf; - *k = last_break; - } else { - unsigned long __user *u = ubuf; - if (__put_user(last_break, u)) - return -EFAULT; - } - } - return 0; + return membuf_store(&to, (unsigned long)last_break); } static int s390_compat_last_break_set(struct task_struct *target, @@ -1582,7 +1451,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), - .get = s390_compat_regs_get, + .regset_get = s390_compat_regs_get, .set = s390_compat_regs_set, }, { @@ -1590,7 +1459,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), - .get = s390_fpregs_get, + .regset_get = s390_fpregs_get, .set = s390_fpregs_set, }, { @@ -1598,7 +1467,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = 1, .size = sizeof(compat_uint_t), .align = sizeof(compat_uint_t), - .get = s390_system_call_get, + .regset_get = s390_system_call_get, .set = s390_system_call_set, }, { @@ -1606,7 +1475,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = 1, .size = sizeof(long), .align = sizeof(long), - .get = s390_compat_last_break_get, + .regset_get = s390_compat_last_break_get, .set = s390_compat_last_break_set, }, { @@ -1614,7 +1483,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = 1, .size = 256, .align = 1, - .get = s390_tdb_get, + .regset_get = s390_tdb_get, .set = s390_tdb_set, }, { @@ -1622,7 +1491,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = __NUM_VXRS_LOW, .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_vxrs_low_get, + .regset_get = s390_vxrs_low_get, .set = s390_vxrs_low_set, }, { @@ -1630,7 +1499,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = __NUM_VXRS_HIGH, .size = sizeof(__vector128), .align = sizeof(__vector128), - .get = s390_vxrs_high_get, + .regset_get = s390_vxrs_high_get, .set = s390_vxrs_high_set, }, { @@ -1638,7 +1507,7 @@ static const struct user_regset 
s390_compat_regsets[] = { .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), - .get = s390_compat_regs_high_get, + .regset_get = s390_compat_regs_high_get, .set = s390_compat_regs_high_set, }, { @@ -1646,7 +1515,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_cb_get, + .regset_get = s390_gs_cb_get, .set = s390_gs_cb_set, }, { @@ -1654,7 +1523,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_gs_bc_get, + .regset_get = s390_gs_bc_get, .set = s390_gs_bc_set, }, { @@ -1662,7 +1531,7 @@ static const struct user_regset s390_compat_regsets[] = { .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), - .get = s390_runtime_instr_get, + .regset_get = s390_runtime_instr_get, .set = s390_runtime_instr_set, }, }; diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 4a22163962eb..88087a32ebc6 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -19,7 +19,7 @@ # r2 = Function to be called after store status # r3 = Parameter for function # -ENTRY(store_status) +SYM_CODE_START(store_status) /* Save register one and load save area base */ stg %r1,__LC_SAVE_AREA_RESTART /* General purpose registers */ @@ -61,7 +61,7 @@ ENTRY(store_status) stpx 0(%r1) /* Clock comparator - seven bytes */ lghi %r1,__LC_CLOCK_COMP_SAVE_AREA - larl %r4,.Lclkcmp + larl %r4,clkcmp stckc 0(%r4) mvc 1(7,%r1),1(%r4) /* Program status word */ @@ -73,9 +73,9 @@ ENTRY(store_status) lgr %r9,%r2 lgr %r2,%r3 BR_EX %r9 -ENDPROC(store_status) +SYM_CODE_END(store_status) .section .bss - .align 8 -.Lclkcmp: .quad 0x0000000000000000 + .balign 8 +SYM_DATA_LOCAL(clkcmp, .quad 0x0000000000000000) .previous diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index fe396673e8a6..0ae297c82afd 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -2,8 +2,7 @@ /* * Copyright IBM Corp. 2005 * - * Author(s): Rolf Adelsberger, - * Heiko Carstens <heiko.carstens@de.ibm.com> + * Author(s): Rolf Adelsberger * */ @@ -15,6 +14,7 @@ * moves the new kernel to its destination... * %r2 = pointer to first kimage_entry_t * %r3 = start address - where to jump to after the job is done... + * %r4 = subcode * * %r5 will be used as temp. storage * %r6 holds the destination address @@ -26,53 +26,51 @@ */ .text -ENTRY(relocate_kernel) - basr %r13,0 # base address - .base: - lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 - lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 - lg %r5,0(%r2) # read another word for indirection page - aghi %r2,8 # increment pointer - tml %r5,0x1 # is it a destination page? - je .indir_check # NO, goto "indir_check" - lgr %r6,%r5 # r6 = r5 - nill %r6,0xf000 # mask it out and... - j .base # ...next iteration - .indir_check: - tml %r5,0x2 # is it a indirection page? - je .done_test # NO, goto "done_test" - nill %r5,0xf000 # YES, mask out, - lgr %r2,%r5 # move it into the right register, - j .base # and read next... - .done_test: - tml %r5,0x4 # is it the done indicator? - je .source_test # NO! Well, then it should be the source indicator... - j .done # ok, lets finish it here... - .source_test: - tml %r5,0x8 # it should be a source indicator... - je .base # NO, ignore it... 
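relocate_kernel above walks the kimage entry list: each 8-byte entry is a page address with flag bits in its low bits (0x1 destination page, 0x2 indirection page, 0x4 done, 0x8 source page, matching the generic IND_* kexec flags). The same state machine restated in C, as a rough stand-alone model; addresses here are ordinary heap memory, not physical pages:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8
#define PAGE_SZ         4096UL
#define PAGE_MASK_LO    (PAGE_SZ - 1)

static void walk(const unsigned long *entry)
{
	unsigned char *dst = NULL;

	for (;;) {
		unsigned long e = *entry++; /* lg %r5,0(%r2); aghi %r2,8 */

		if (e & IND_DESTINATION)        /* tml %r5,0x1 */
			dst = (unsigned char *)(e & ~PAGE_MASK_LO);
		else if (e & IND_INDIRECTION)   /* tml %r5,0x2 */
			entry = (const unsigned long *)(e & ~PAGE_MASK_LO);
		else if (e & IND_DONE)          /* tml %r5,0x4 */
			return;
		else if (e & IND_SOURCE) {      /* tml %r5,0x8: copy a page */
			memcpy(dst, (void *)(e & ~PAGE_MASK_LO), PAGE_SZ);
			dst += PAGE_SZ;
		}
	}
}

int main(void)
{
	unsigned char *src = aligned_alloc(PAGE_SZ, PAGE_SZ);
	unsigned char *dst = aligned_alloc(PAGE_SZ, PAGE_SZ);
	unsigned long list[3];

	memset(src, 0xab, PAGE_SZ);
	list[0] = (unsigned long)dst | IND_DESTINATION;
	list[1] = (unsigned long)src | IND_SOURCE;
	list[2] = IND_DONE;
	walk(list);
	printf("page copied: %s\n", dst[0] == 0xab ? "yes" : "no");
	free(src);
	free(dst);
	return 0;
}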
- lgr %r8,%r5 # r8 = r5 - nill %r8,0xf000 # masking - 0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 - jo 0b - j .base - .done: - sgr %r0,%r0 # clear register r0 - cghi %r3,0 - je .diag - la %r4,load_psw-.base(%r13) # load psw-address into the register - o %r3,4(%r4) # or load address into psw - st %r3,4(%r4) - mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 - .diag: - diag %r0,%r0,0x308 -ENDPROC(relocate_kernel) +SYM_CODE_START(relocate_kernel) + basr %r13,0 # base address +.base: + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 + lg %r5,0(%r2) # read another word for indirection page + aghi %r2,8 # increment pointer + tml %r5,0x1 # is it a destination page? + je .indir_check # NO, goto "indir_check" + lgr %r6,%r5 # r6 = r5 + nill %r6,0xf000 # mask it out and... + j .base # ...next iteration +.indir_check: + tml %r5,0x2 # is it a indirection page? + je .done_test # NO, goto "done_test" + nill %r5,0xf000 # YES, mask out, + lgr %r2,%r5 # move it into the right register, + j .base # and read next... +.done_test: + tml %r5,0x4 # is it the done indicator? + je .source_test # NO! Well, then it should be the source indicator... + j .done # ok, lets finish it here... +.source_test: + tml %r5,0x8 # it should be a source indicator... + je .base # NO, ignore it... + lgr %r8,%r5 # r8 = r5 + nill %r8,0xf000 # masking +0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 + jo 0b + j .base +.done: + lgr %r0,%r4 # subcode + cghi %r3,0 + je .diag + la %r4,load_psw-.base(%r13) # load psw-address into the register + o %r3,4(%r4) # or load address into psw + st %r3,4(%r4) + mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 +.diag: + diag %r0,%r0,0x308 +SYM_CODE_END(relocate_kernel) - .align 8 - load_psw: - .long 0x00080000,0x80000000 - relocate_kernel_end: - .align 8 - .globl relocate_kernel_len - relocate_kernel_len: - .quad relocate_kernel_end - relocate_kernel + .balign 8 +SYM_DATA_START_LOCAL(load_psw) + .long 0x00080000,0x80000000 +SYM_DATA_END_LABEL(load_psw, SYM_L_LOCAL, relocate_kernel_end) + .balign 8 +SYM_DATA(relocate_kernel_len, .quad relocate_kernel_end - relocate_kernel) diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c new file mode 100644 index 000000000000..af10e6bdd34e --- /dev/null +++ b/arch/s390/kernel/rethook.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/rethook.h> +#include <linux/kprobes.h> +#include "rethook.h" + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + rh->ret_addr = regs->gprs[14]; + rh->frame = regs->gprs[15]; + + /* Replace the return addr with trampoline addr */ + regs->gprs[14] = (unsigned long)&arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); + +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* Replace fake return address with real one. 
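The new rethook code above hooks function returns by swapping the saved return address. On s390, gpr 14 carries the return address and gpr 15 the stack pointer, which is why exactly those two registers are recorded. The bookkeeping, modeled with toy types; nothing below is the kernel API:

#include <stdio.h>

struct toy_regs { unsigned long gprs[16]; }; /* stand-in for pt_regs */
struct toy_node { unsigned long ret_addr, frame; }; /* cf. rethook_node */

static void trampoline(void) { /* exit handler would run here */ }

static void prepare(struct toy_node *rh, struct toy_regs *regs)
{
	rh->ret_addr = regs->gprs[14]; /* remember the real caller */
	rh->frame    = regs->gprs[15]; /* identify the stack frame */
	regs->gprs[14] = (unsigned long)&trampoline; /* divert the return */
}

static void fixup_return(struct toy_regs *regs, unsigned long correct)
{
	regs->gprs[14] = correct; /* hand the real address back */
}

int main(void)
{
	struct toy_regs regs = { .gprs = { [14] = 0x1000, [15] = 0x2000 } };
	struct toy_node rh;

	prepare(&rh, &regs);
	printf("diverted: %d\n",
	       regs.gprs[14] == (unsigned long)&trampoline);
	fixup_return(&regs, rh.ret_addr);
	printf("restored: %#lx\n", regs.gprs[14]);
	return 0;
}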
*/ + regs->gprs[14] = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +/* + * Called from arch_rethook_trampoline + */ +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->gprs[15]); +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* assembler function that handles the rethook must not be probed itself */ +NOKPROBE_SYMBOL(arch_rethook_trampoline); diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h new file mode 100644 index 000000000000..32f069eed3f3 --- /dev/null +++ b/arch/s390/kernel/rethook.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __S390_RETHOOK_H +#define __S390_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); + +#endif diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index 125c7f6e8715..1788a5454b6f 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -57,7 +57,7 @@ static void init_runtime_instr_cb(struct runtime_instr_cb *cb) cb->k = 1; cb->ps = 1; cb->pc = 1; - cb->key = PAGE_DEFAULT_KEY; + cb->key = PAGE_DEFAULT_KEY >> 4; cb->v = 1; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 9cbf490fd162..d1f3b56e7afc 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -37,7 +37,7 @@ #include <linux/root_dev.h> #include <linux/console.h> #include <linux/kernel_stat.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/device.h> #include <linux/notifier.h> #include <linux/pfn.h> @@ -49,14 +49,17 @@ #include <linux/memory.h> #include <linux/compat.h> #include <linux/start_kernel.h> +#include <linux/hugetlb.h> +#include <linux/kmemleak.h> +#include <asm/archrandom.h> #include <asm/boot_data.h> #include <asm/ipl.h> #include <asm/facility.h> #include <asm/smp.h> #include <asm/mmu_context.h> #include <asm/cpcmd.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/nmi.h> #include <asm/irq.h> #include <asm/page.h> @@ -71,8 +74,10 @@ #include <asm/numa.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> -#include <asm/mem_detect.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> #include <asm/uv.h> +#include <asm/asm-offsets.h> #include "entry.h" /* @@ -87,30 +92,71 @@ EXPORT_SYMBOL(console_devno); unsigned int console_irq = -1; EXPORT_SYMBOL(console_irq); -unsigned long elf_hwcap __read_mostly = 0; -char elf_platform[ELF_PLATFORM_SIZE]; +/* + * Some code and data needs to stay below 2 GB, even when the kernel would be + * relocated above 2 GB, because it has to use 31 bit addresses. + * Such code and data is part of the .amode31 section. + */ +char __amode31_ref *__samode31 = _samode31; +char __amode31_ref *__eamode31 = _eamode31; +char __amode31_ref *__stext_amode31 = _stext_amode31; +char __amode31_ref *__etext_amode31 = _etext_amode31; +struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table; +struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table; -unsigned long int_hwcap = 0; +/* + * Control registers CR2, CR5 and CR15 are initialized with addresses + * of tables that must be placed below 2G which is handled by the AMODE31 + * sections. + * Because the AMODE31 sections are relocated below 2G at startup, + * the content of control registers CR2, CR5 and CR15 must be updated + * with new addresses after the relocation. 
The initial initialization of + * control registers occurs in head64.S and then gets updated again after AMODE31 + * relocation. We must access the relevant AMODE31 tables indirectly via + * pointers placed in the .amode31.refs linker section. Those pointers get + * updated automatically during AMODE31 relocation and always contain a valid + * address within AMODE31 sections. + */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST -int __bootdata_preserved(prot_virt_guest); -#endif +static __amode31_data u32 __ctl_duct_amode31[16] __aligned(64); + +static __amode31_data u64 __ctl_aste_amode31[8] __aligned(64) = { + [1] = 0xffffffffffffffff +}; + +static __amode31_data u32 __ctl_duald_amode31[32] __aligned(128) = { + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0 +}; + +static __amode31_data u32 __ctl_linkage_stack_amode31[8] __aligned(64) = { + 0, 0, 0x89000000, 0, + 0, 0, 0x8a000000, 0 +}; + +static u64 __amode31_ref *__ctl_aste = __ctl_aste_amode31; +static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31; +static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; +static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; + +unsigned long __bootdata_preserved(max_mappable); +unsigned long __bootdata(ident_map_size); +struct physmem_info __bootdata(physmem_info); -int __bootdata(noexec_disabled); -int __bootdata(memory_end_set); -unsigned long __bootdata(memory_end); -unsigned long __bootdata(vmalloc_size); -unsigned long __bootdata(max_physmem_end); -struct mem_detect_info __bootdata(mem_detect); - -struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table); -struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table); -unsigned long __bootdata_preserved(__swsusp_reset_dma); -unsigned long __bootdata_preserved(__stext_dma); -unsigned long __bootdata_preserved(__etext_dma); -unsigned long __bootdata_preserved(__sdma); -unsigned long __bootdata_preserved(__edma); unsigned long __bootdata_preserved(__kaslr_offset); +int __bootdata_preserved(__kaslr_enabled); +unsigned int __bootdata_preserved(zlib_dfltcc_support); +EXPORT_SYMBOL(zlib_dfltcc_support); +u64 __bootdata_preserved(stfle_fac_list[16]); +EXPORT_SYMBOL(stfle_fac_list); +u64 __bootdata_preserved(alt_stfle_fac_list[16]); +struct oldmem_data __bootdata_preserved(oldmem_data); unsigned long VMALLOC_START; EXPORT_SYMBOL(VMALLOC_START); @@ -120,6 +166,7 @@ EXPORT_SYMBOL(VMALLOC_END); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); +unsigned long vmemmap_size; unsigned long MODULES_VADDR; unsigned long MODULES_END; @@ -128,6 +175,14 @@ unsigned long MODULES_END; struct lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); +DEFINE_STATIC_KEY_FALSE(cpu_has_bear); + +/* + * The Write Back bit position in the physaddr is given by the SLPC PCI. 
+ * Leaving the mask zero always uses write through which is safe + */ +unsigned long mio_wb_bit_mask __ro_after_init; + /* * This is set up by the setup-routine at boot-time * for S390 need to find out, what we have to setup @@ -161,7 +216,7 @@ static void __init set_preferred_console(void) else if (CONSOLE_IS_3270) add_preferred_console("tty3270", 0, NULL); else if (CONSOLE_IS_VT220) - add_preferred_console("ttyS", 1, NULL); + add_preferred_console("ttysclp", 0, NULL); else if (CONSOLE_IS_HVC) add_preferred_console("hvc", 0, NULL); } @@ -241,18 +296,16 @@ static void __init conmode_default(void) SET_CONSOLE_SCLP; #endif } - if (IS_ENABLED(CONFIG_VT) && IS_ENABLED(CONFIG_DUMMY_CONSOLE)) - conswitchp = &dummy_con; } #ifdef CONFIG_CRASH_DUMP static void __init setup_zfcpdump(void) { - if (ipl_info.type != IPL_TYPE_FCP_DUMP) + if (!is_ipl_type_dump()) return; - if (OLDMEM_BASE) + if (oldmem_data.start) return; - strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev"); + strlcat(boot_command_line, " cio_ignore=all,!ipldev,!condev", COMMAND_LINE_SIZE); console_loglevel = 2; } #else @@ -303,17 +356,17 @@ void machine_power_off(void) void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL_GPL(pm_power_off); -void *restart_stack __section(.data); +void *restart_stack; unsigned long stack_alloc(void) { #ifdef CONFIG_VMAP_STACK - return (unsigned long) - __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + void *ret; + + ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, + NUMA_NO_NODE, __builtin_return_address(0)); + kmemleak_not_leak(ret); + return (unsigned long)ret; #else return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); #endif @@ -328,51 +381,21 @@ void stack_free(unsigned long stack) #endif } -int __init arch_early_irq_init(void) -{ - unsigned long stack; - - stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - if (!stack) - panic("Couldn't allocate async stack"); - S390_lowcore.async_stack = stack + STACK_INIT_OFFSET; - return 0; -} - -static int __init async_stack_realloc(void) -{ - unsigned long old, new; - - old = S390_lowcore.async_stack - STACK_INIT_OFFSET; - new = stack_alloc(); - if (!new) - panic("Couldn't allocate async stack"); - S390_lowcore.async_stack = new + STACK_INIT_OFFSET; - free_pages(old, THREAD_SIZE_ORDER); - return 0; -} -early_initcall(async_stack_realloc); - -void __init arch_call_rest_init(void) +static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = stack_alloc(); - if (!stack) - panic("Couldn't allocate kernel stack"); - current->stack = (void *) stack; -#ifdef CONFIG_VMAP_STACK - current->stack_vm_area = (void *) stack; -#endif - set_task_stack_end_magic(current); - stack += STACK_INIT_OFFSET; - S390_lowcore.kernel_stack = stack; - CALL_ON_STACK_NORETURN(rest_init, stack); + stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); + if (!stack) { + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, THREAD_SIZE, THREAD_SIZE); + } + return stack; } -static void __init setup_lowcore_dat_off(void) +static void __init setup_lowcore(void) { - struct lowcore *lc; + struct lowcore *lc, *abs_lc; /* * Setup lowcore for boot cpu @@ -383,52 +406,40 @@ static void __init setup_lowcore_dat_off(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); - lc->restart_psw.mask = PSW_KERNEL_BITS; - lc->restart_psw.addr = (unsigned long) 
restart_int_handler; - lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = PSW_KERNEL_BITS | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->program_new_psw.mask = PSW_KERNEL_BITS; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->io_new_psw.mask = PSW_KERNEL_BITS; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; - lc->nodat_stack = ((unsigned long) &init_thread_union) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; lc->machine_flags = S390_lowcore.machine_flags; lc->preempt_count = S390_lowcore.preempt_count; - lc->stfl_fac_list = S390_lowcore.stfl_fac_list; - memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, - sizeof(lc->stfle_fac_list)); - memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, - sizeof(lc->alt_stfle_fac_list)); - nmi_alloc_boot_cpu(lc); - vdso_alloc_boot_cpu(lc); - lc->sync_enter_timer = S390_lowcore.sync_enter_timer; - lc->async_enter_timer = S390_lowcore.async_enter_timer; + nmi_alloc_mcesa_early(&lc->mcesad); + lc->sys_enter_timer = S390_lowcore.sys_enter_timer; lc->exit_timer = S390_lowcore.exit_timer; lc->user_timer = S390_lowcore.user_timer; lc->system_timer = S390_lowcore.system_timer; lc->steal_timer = S390_lowcore.steal_timer; lc->last_update_timer = S390_lowcore.last_update_timer; lc->last_update_clock = S390_lowcore.last_update_clock; - /* * Allocate the global restart stack which is the same for - * all CPUs in cast *one* of them does a PSW restart. + * all CPUs in case *one* of them does a PSW restart. */ - restart_stack = memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!restart_stack) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - restart_stack += STACK_INIT_OFFSET; - + restart_stack = (void *)(stack_alloc_early() + STACK_INIT_OFFSET); + lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->kernel_stack = S390_lowcore.kernel_stack; /* * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant * restart data to the absolute zero lowcore. 
This is necessary if @@ -437,32 +448,32 @@ static void __init setup_lowcore_dat_off(void) lc->restart_stack = (unsigned long) restart_stack; lc->restart_fn = (unsigned long) do_restart; lc->restart_data = 0; - lc->restart_source = -1UL; - - /* Setup absolute zero lowcore */ - mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack); - mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn); - mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data); - mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source); - mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw); - + lc->restart_source = -1U; lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; arch_spin_lock_setup(0); - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ - - set_prefix((u32)(unsigned long) lc); + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = S390_lowcore.user_asce; + + system_ctlreg_init_save_area(lc); + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc); + + set_prefix(__pa(lc)); lowcore_ptr[0] = lc; -} - -static void __init setup_lowcore_dat_on(void) -{ - __ctl_clear_bit(0, 28); - S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_set_bit(0, 28); + if (abs_lowcore_map(0, lowcore_ptr[0], false)) + panic("Couldn't setup absolute lowcore"); } static struct resource code_resource = { @@ -489,8 +500,9 @@ static struct resource __initdata *standard_resources[] = { static void __init setup_resources(void) { struct resource *res, *std_res, *sub_res; - struct memblock_region *reg; + phys_addr_t start, end; int j; + u64 i; code_resource.start = (unsigned long) _text; code_resource.end = (unsigned long) _etext - 1; @@ -499,7 +511,7 @@ static void __init setup_resources(void) bss_resource.start = (unsigned long) __bss_start; bss_resource.end = (unsigned long) __bss_stop - 1; - for_each_memblock(memory, reg) { + for_each_mem_range(i, &start, &end) { res = memblock_alloc(sizeof(*res), 8); if (!res) panic("%s: Failed to allocate %zu bytes align=0x%x\n", @@ -507,8 +519,13 @@ static void __init setup_resources(void) res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; - res->start = reg->base; - res->end = reg->base + reg->size - 1; + res->start = start; + /* + * In memblock, end points to the first byte after the + * range while in resources, end points to the last byte in + * the range. + */ + res->end = end - 1; request_resource(&iomem_resource, res); for (j = 0; j < ARRAY_SIZE(standard_resources); j++) { @@ -539,7 +556,8 @@ static void __init setup_resources(void) * part of the System RAM resource. 
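The comment added in setup_resources() above is worth keeping in mind wherever memblock and struct resource meet: memblock ranges are half-open, resources are closed intervals, hence the `res->end = end - 1`. The off-by-one in one self-checking snippet (values invented):

#include <assert.h>

int main(void)
{
	/* memblock style: [start, end), end is one past the last byte */
	unsigned long start = 0x100000, end = 0x200000;
	/* resource style: [res->start, res->end], res->end is the last byte */
	unsigned long res_start = start, res_end = end - 1;

	assert(res_end - res_start + 1 == end - start); /* same byte count */
	return 0;
}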
*/ if (crashk_res.end) { - memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); + memblock_add_node(crashk_res.start, resource_size(&crashk_res), + 0, MEMBLOCK_NONE); memblock_reserve(crashk_res.start, resource_size(&crashk_res)); insert_resource(&iomem_resource, &crashk_res); } @@ -548,56 +566,17 @@ static void __init setup_resources(void) static void __init setup_memory_end(void) { - unsigned long vmax, tmp; - - /* Choose kernel address space layout: 3 or 4 levels. */ - if (IS_ENABLED(CONFIG_KASAN)) { - vmax = IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) - ? _REGION1_SIZE - : _REGION2_SIZE; - } else { - tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; - tmp = tmp * (sizeof(struct page) + PAGE_SIZE); - if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) - vmax = _REGION2_SIZE; /* 3-level kernel page table */ - else - vmax = _REGION1_SIZE; /* 4-level kernel page table */ - } - - /* module area is at the end of the kernel address space. */ - MODULES_END = vmax; - MODULES_VADDR = MODULES_END - MODULES_LEN; - VMALLOC_END = MODULES_VADDR; - VMALLOC_START = VMALLOC_END - vmalloc_size; - - /* Split remaining virtual space between 1:1 mapping & vmemmap array */ - tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); - /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ - tmp = SECTION_ALIGN_UP(tmp); - tmp = VMALLOC_START - tmp * sizeof(struct page); - tmp &= ~((vmax >> 11) - 1); /* align to page table level */ - tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS); - vmemmap = (struct page *) tmp; - - /* Take care that memory_end is set and <= vmemmap */ - memory_end = min(memory_end ?: max_physmem_end, (unsigned long)vmemmap); -#ifdef CONFIG_KASAN - /* fit in kasan shadow memory region between 1:1 and vmemmap */ - memory_end = min(memory_end, KASAN_SHADOW_START); - vmemmap = max(vmemmap, (struct page *)KASAN_SHADOW_END); -#endif - max_pfn = max_low_pfn = PFN_DOWN(memory_end); - memblock_remove(memory_end, ULONG_MAX); - - pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); + max_pfn = max_low_pfn = PFN_DOWN(ident_map_size); + pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20); } #ifdef CONFIG_CRASH_DUMP /* - * When kdump is enabled, we have to ensure that no memory from - * the area [0 - crashkernel memory size] and - * [crashk_res.start - crashk_res.end] is set offline. + * When kdump is enabled, we have to ensure that no memory from the area + * [0 - crashkernel memory size] is set offline - it will be exchanged with + * the crashkernel memory region when kdump is triggered. The crashkernel + * memory region can never get offlined (pages are unmovable). 
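The simplified kdump_mem_notifier above reduces to a single predicate: page frames below the crashkernel reservation size may never go offline, because exactly that low range is exchanged with the crashkernel region when kdump fires. A sketch of the check, with PFN_DOWN and the 4K page shift stubbed in the usual way:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

/* refuse to offline any page frame below the crashkernel size */
static int may_offline(unsigned long start_pfn, unsigned long crash_size)
{
	return start_pfn >= PFN_DOWN(crash_size);
}

int main(void)
{
	unsigned long crash_size = 256UL << 20; /* assume a 256 MB reservation */

	printf("%d %d\n",
	       may_offline(PFN_DOWN(128UL << 20), crash_size),  /* 0: refused */
	       may_offline(PFN_DOWN(512UL << 20), crash_size)); /* 1: allowed */
	return 0;
}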
*/ static int kdump_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) @@ -608,11 +587,7 @@ static int kdump_mem_notifier(struct notifier_block *nb, return NOTIFY_OK; if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res))) return NOTIFY_BAD; - if (arg->start_pfn > PFN_DOWN(crashk_res.end)) - return NOTIFY_OK; - if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start)) - return NOTIFY_OK; - return NOTIFY_BAD; + return NOTIFY_OK; } static struct notifier_block kdump_mem_nb = { @@ -622,36 +597,15 @@ static struct notifier_block kdump_mem_nb = { #endif /* - * Make sure that the area behind memory_end is protected - */ -static void reserve_memory_end(void) -{ - if (memory_end_set) - memblock_reserve(memory_end, ULONG_MAX); -} - -/* - * Make sure that oldmem, where the dump is stored, is protected + * Reserve page tables created by decompressor */ -static void reserve_oldmem(void) +static void __init reserve_pgtables(void) { -#ifdef CONFIG_CRASH_DUMP - if (OLDMEM_BASE) - /* Forget all memory above the running kdump system */ - memblock_reserve(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX); -#endif -} + unsigned long start, end; + struct reserved_range *range; -/* - * Make sure that oldmem, where the dump is stored, is protected - */ -static void remove_oldmem(void) -{ -#ifdef CONFIG_CRASH_DUMP - if (OLDMEM_BASE) - /* Forget all memory above the running kdump system */ - memblock_remove(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX); -#endif + for_each_physmem_reserved_type_range(RR_VMEM, range, &start, &end) + memblock_reserve(start, end - start); } /* @@ -664,8 +618,8 @@ static void __init reserve_crashkernel(void) phys_addr_t low, high; int rc; - rc = parse_crashkernel(boot_command_line, memory_end, &crash_size, - &crash_base); + rc = parse_crashkernel(boot_command_line, ident_map_size, + &crash_size, &crash_base, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); @@ -678,9 +632,9 @@ static void __init reserve_crashkernel(void) return; } - low = crash_base ?: OLDMEM_BASE; + low = crash_base ?: oldmem_data.start; high = low + crash_size; - if (low >= OLDMEM_BASE && high <= OLDMEM_BASE + OLDMEM_SIZE) { + if (low >= oldmem_data.start && high <= oldmem_data.start + oldmem_data.size) { /* The crashkernel fits into OLDMEM, reuse OLDMEM */ crash_base = low; } else { @@ -694,8 +648,9 @@ static void __init reserve_crashkernel(void) return; } low = crash_base ?: low; - crash_base = memblock_find_in_range(low, high, crash_size, - KEXEC_CRASH_MEM_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); } if (!crash_base) { @@ -704,10 +659,12 @@ static void __init reserve_crashkernel(void) return; } - if (register_memory_notifier(&kdump_mem_nb)) + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_phys_free(crash_base, crash_size); return; + } - if (!OLDMEM_BASE && MACHINE_IS_VM) + if (!oldmem_data.start && MACHINE_IS_VM) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; @@ -725,13 +682,13 @@ static void __init reserve_crashkernel(void) */ static void __init reserve_initrd(void) { -#ifdef CONFIG_BLK_DEV_INITRD - if (!INITRD_START || !INITRD_SIZE) + unsigned long addr, size; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !get_physmem_reserved(RR_INITRD, &addr, &size)) return; - initrd_start = INITRD_START; - initrd_end = initrd_start + INITRD_SIZE; - memblock_reserve(INITRD_START, 
INITRD_SIZE); -#endif + initrd_start = (unsigned long)__va(addr); + initrd_end = initrd_start + size; + memblock_reserve(addr, size); } /* @@ -743,75 +700,37 @@ static void __init reserve_certificate_list(void) memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); } -static void __init reserve_mem_detect_info(void) +static void __init reserve_physmem_info(void) { - unsigned long start, size; + unsigned long addr, size; - get_mem_detect_reserved(&start, &size); - if (size) - memblock_reserve(start, size); + if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + memblock_reserve(addr, size); } -static void __init free_mem_detect_info(void) +static void __init free_physmem_info(void) { - unsigned long start, size; + unsigned long addr, size; - get_mem_detect_reserved(&start, &size); - if (size) - memblock_free(start, size); + if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + memblock_phys_free(addr, size); } -static void __init memblock_physmem_add(phys_addr_t start, phys_addr_t size) -{ - memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n", - start, start + size - 1); - memblock_add_range(&memblock.memory, start, size, 0, 0); - memblock_add_range(&memblock.physmem, start, size, 0, 0); -} - -static const char * __init get_mem_info_source(void) -{ - switch (mem_detect.info_source) { - case MEM_DETECT_SCLP_STOR_INFO: - return "sclp storage info"; - case MEM_DETECT_DIAG260: - return "diag260"; - case MEM_DETECT_SCLP_READ_INFO: - return "sclp read info"; - case MEM_DETECT_BIN_SEARCH: - return "binary search"; - } - return "none"; -} - -static void __init memblock_add_mem_detect_info(void) +static void __init memblock_add_physmem_info(void) { unsigned long start, end; int i; - memblock_dbg("physmem info source: %s (%hhd)\n", - get_mem_info_source(), mem_detect.info_source); + pr_debug("physmem info source: %s (%hhd)\n", + get_physmem_info_source(), physmem_info.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); - for_each_mem_detect_block(i, &start, &end) + for_each_physmem_usable_range(i, &start, &end) + memblock_add(start, end - start); + for_each_physmem_online_range(i, &start, &end) memblock_physmem_add(start, end - start); memblock_set_bottom_up(false); - memblock_dump_all(); -} - -/* - * Check for initrd being in usable memory - */ -static void __init check_initrd(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - if (INITRD_START && INITRD_SIZE && - !memblock_is_region_memory(INITRD_START, INITRD_SIZE)) { - pr_err("The initial RAM disk does not fit into the memory\n"); - memblock_free(INITRD_START, INITRD_SIZE); - initrd_start = initrd_end = 0; - } -#endif + memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); } /* @@ -819,176 +738,68 @@ static void __init check_initrd(void) */ static void __init reserve_kernel(void) { - unsigned long start_pfn = PFN_UP(__pa(_end)); - - memblock_reserve(0, HEAD_END); - memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - - (unsigned long)_stext); - memblock_reserve(__sdma, __edma - __sdma); + memblock_reserve(0, STARTUP_NORMAL_OFFSET); + memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); + memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); + memblock_reserve(physmem_info.reserved[RR_AMODE31].start, __eamode31 - __samode31); + memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); + memblock_reserve(__pa(_stext), _end - _stext); } static void __init setup_memory(void) { - struct memblock_region *reg; + phys_addr_t start, end; + u64 i; /* * Init storage key for present 
memory */ - for_each_memblock(memory, reg) { - storage_key_init_range(reg->base, reg->base + reg->size); - } - psw_set_key(PAGE_DEFAULT_KEY); + for_each_mem_range(i, &start, &end) + storage_key_init_range(start, end); - /* Only cosmetics */ - memblock_enforce_memory_limit(memblock_end_of_DRAM()); + psw_set_key(PAGE_DEFAULT_KEY); } -/* - * Setup hardware capabilities. - */ -static int __init setup_hwcaps(void) +static void __init relocate_amode31_section(void) { - static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 }; - struct cpuid cpu_id; - int i; + unsigned long amode31_size = __eamode31 - __samode31; + long amode31_offset, *ptr; - /* - * The store facility list bits numbers as found in the principles - * of operation are numbered with bit 1UL<<31 as number 0 to - * bit 1UL<<0 as number 31. - * Bit 0: instructions named N3, "backported" to esa-mode - * Bit 2: z/Architecture mode is active - * Bit 7: the store-facility-list-extended facility is installed - * Bit 17: the message-security assist is installed - * Bit 19: the long-displacement facility is installed - * Bit 21: the extended-immediate facility is installed - * Bit 22: extended-translation facility 3 is installed - * Bit 30: extended-translation facility 3 enhancement facility - * These get translated to: - * HWCAP_S390_ESAN3 bit 0, HWCAP_S390_ZARCH bit 1, - * HWCAP_S390_STFLE bit 2, HWCAP_S390_MSA bit 3, - * HWCAP_S390_LDISP bit 4, HWCAP_S390_EIMM bit 5 and - * HWCAP_S390_ETF3EH bit 8 (22 && 30). - */ - for (i = 0; i < 6; i++) - if (test_facility(stfl_bits[i])) - elf_hwcap |= 1UL << i; - - if (test_facility(22) && test_facility(30)) - elf_hwcap |= HWCAP_S390_ETF3EH; - - /* - * Check for additional facilities with store-facility-list-extended. - * stfle stores doublewords (8 byte) with bit 1ULL<<63 as bit 0 - * and 1ULL<<0 as bit 63. Bits 0-31 contain the same information - * as stored by stfl, bits 32-xxx contain additional facilities. - * How many facility words are stored depends on the number of - * doublewords passed to the instruction. The additional facilities - * are: - * Bit 42: decimal floating point facility is installed - * Bit 44: perform floating point operation facility is installed - * translated to: - * HWCAP_S390_DFP bit 6 (42 && 44). - */ - if ((elf_hwcap & (1UL << 2)) && test_facility(42) && test_facility(44)) - elf_hwcap |= HWCAP_S390_DFP; + amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31; + pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); - /* - * Huge page support HWCAP_S390_HPAGE is bit 7. - */ - if (MACHINE_HAS_EDAT1) - elf_hwcap |= HWCAP_S390_HPAGE; + /* Move original AMODE31 section to the new one */ + memmove((void *)physmem_info.reserved[RR_AMODE31].start, __samode31, amode31_size); + /* Zero out the old AMODE31 section to catch invalid accesses within it */ + memset(__samode31, 0, amode31_size); - /* - * 64-bit register support for 31-bit processes - * HWCAP_S390_HIGH_GPRS is bit 9. - */ - elf_hwcap |= HWCAP_S390_HIGH_GPRS; - - /* - * Transactional execution support HWCAP_S390_TE is bit 10. - */ - if (MACHINE_HAS_TE) - elf_hwcap |= HWCAP_S390_TE; - - /* - * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension - * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX - * instead of facility bit 129. 
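The setup_hwcaps() code removed above translated installed facility bits (0, 2, 7, 17, 19 and 21, per its own comment) one-to-one into the low ELF hwcap bits. Its core loop, restated as a small stand-alone program with test_facility() stubbed out; the stub's answers are arbitrary:

#include <stdio.h>

static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 };

/* stub: pretend facilities 0, 2 and 19 are installed */
static int test_facility(int nr)
{
	return nr == 0 || nr == 2 || nr == 19;
}

int main(void)
{
	unsigned long hwcap = 0;
	int i;

	/* facility bit stfl_bits[i] installed -> hwcap bit i set */
	for (i = 0; i < 6; i++)
		if (test_facility(stfl_bits[i]))
			hwcap |= 1UL << i;
	printf("elf_hwcap = %#lx\n", hwcap); /* 0x13: ESAN3, ZARCH, LDISP */
	return 0;
}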
- */ - if (MACHINE_HAS_VX) { - elf_hwcap |= HWCAP_S390_VXRS; - if (test_facility(134)) - elf_hwcap |= HWCAP_S390_VXRS_EXT; - if (test_facility(135)) - elf_hwcap |= HWCAP_S390_VXRS_BCD; - if (test_facility(148)) - elf_hwcap |= HWCAP_S390_VXRS_EXT2; - if (test_facility(152)) - elf_hwcap |= HWCAP_S390_VXRS_PDE; - } - if (test_facility(150)) - elf_hwcap |= HWCAP_S390_SORT; - if (test_facility(151)) - elf_hwcap |= HWCAP_S390_DFLT; + /* Update all AMODE31 region references */ + for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++) + *ptr += amode31_offset; +} - /* - * Guarded storage support HWCAP_S390_GS is bit 12. - */ - if (MACHINE_HAS_GS) - elf_hwcap |= HWCAP_S390_GS; - - get_cpu_id(&cpu_id); - add_device_randomness(&cpu_id, sizeof(cpu_id)); - switch (cpu_id.machine) { - case 0x2064: - case 0x2066: - default: /* Use "z900" as default for 64 bit kernels. */ - strcpy(elf_platform, "z900"); - break; - case 0x2084: - case 0x2086: - strcpy(elf_platform, "z990"); - break; - case 0x2094: - case 0x2096: - strcpy(elf_platform, "z9-109"); - break; - case 0x2097: - case 0x2098: - strcpy(elf_platform, "z10"); - break; - case 0x2817: - case 0x2818: - strcpy(elf_platform, "z196"); - break; - case 0x2827: - case 0x2828: - strcpy(elf_platform, "zEC12"); - break; - case 0x2964: - case 0x2965: - strcpy(elf_platform, "z13"); - break; - case 0x3906: - case 0x3907: - strcpy(elf_platform, "z14"); - break; - case 0x8561: - case 0x8562: - strcpy(elf_platform, "z15"); - break; - } +/* This must be called after AMODE31 relocation */ +static void __init setup_cr(void) +{ + union ctlreg2 cr2; + union ctlreg5 cr5; + union ctlreg15 cr15; - /* - * Virtualization support HWCAP_INT_SIE is bit 0. - */ - if (sclp.has_sief2) - int_hwcap |= HWCAP_INT_SIE; + __ctl_duct[1] = (unsigned long)__ctl_aste; + __ctl_duct[2] = (unsigned long)__ctl_aste; + __ctl_duct[4] = (unsigned long)__ctl_duald; - return 0; + /* Update control registers CR2, CR5 and CR15 */ + local_ctl_store(2, &cr2.reg); + local_ctl_store(5, &cr5.reg); + local_ctl_store(15, &cr15.reg); + cr2.ducto = (unsigned long)__ctl_duct >> 6; + cr5.pasteo = (unsigned long)__ctl_duct >> 6; + cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3; + system_ctl_load(2, &cr2.reg); + system_ctl_load(5, &cr5.reg); + system_ctl_load(15, &cr15.reg); } -arch_initcall(setup_hwcaps); /* * Add system information as device randomness @@ -997,30 +808,15 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE, - PAGE_SIZE); + vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!vmms) panic("Failed to allocate memory for sysinfo structure\n"); - if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); - memblock_free((unsigned long) vmms, PAGE_SIZE); -} - -/* - * Find the correct size for the task_struct. This depends on - * the size of the struct fpu at the end of the thread_struct - * which is embedded in the task_struct. 
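/*
 * A minimal sketch of the relocation pattern used by
 * relocate_amode31_section() above: copy the section to its reserved
 * range, poison the old copy so stale accesses fault visibly, then walk
 * the table of recorded pointer locations and rebase each one by the move
 * offset. The _start/_end_amode31_refs names come from the diff; this
 * generic helper is illustrative.
 */
static void sketch_relocate_section(void *new_base, void *old_base,
				    unsigned long size,
				    long *refs_start, long *refs_end)
{
	long offset = (long)new_base - (long)old_base;
	long *ptr;

	memmove(new_base, old_base, size);	/* move the section */
	memset(old_base, 0, size);		/* catch stale references */
	for (ptr = refs_start; ptr != refs_end; ptr++)
		*ptr += offset;			/* rebase recorded pointers */
}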
- */ -static void __init setup_task_size(void) -{ - int task_size = sizeof(struct task_struct); + memblock_free(vmms, PAGE_SIZE); - if (!MACHINE_HAS_VX) { - task_size -= sizeof(__vector128) * __NUM_VXRS; - task_size += sizeof(freg_t) * __NUM_FPRS; - } - arch_task_struct_size = task_size; + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + static_branch_enable(&s390_arch_random_available); } /* @@ -1031,8 +827,7 @@ static void __init setup_control_program_code(void) { union diag318_info diag318_info = { .cpnc = CPNC_LINUX, - .cpvc_linux = 0, - .cpvc_distro = {0}, + .cpvc = 0, }; if (!sclp.has_diag318) @@ -1052,11 +847,11 @@ static void __init log_component_list(void) if (!early_ipl_comp_list_addr) return; - if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR) + if (ipl_block.hdr.flags & IPL_PL_FLAG_SIPL) pr_info("Linux is running with Secure-IPL enabled\n"); else pr_info("Linux is running with Secure-IPL disabled\n"); - ptr = (void *) early_ipl_comp_list_addr; + ptr = __va(early_ipl_comp_list_addr); end = (void *) ptr + early_ipl_comp_list_size; pr_info("The IPL report contains the following components:\n"); while (ptr < end) { @@ -1102,14 +897,12 @@ void __init setup_arch(char **cmdline_p) ROOT_DEV = Root_RAM0; - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; + setup_initial_init_mm(_text, _etext, _edata, _end); if (IS_ENABLED(CONFIG_EXPOLINE_AUTO)) nospec_auto_detect(); + jump_label_init(); parse_early_param(); #ifdef CONFIG_CRASH_DUMP /* Deactivate elfcorehdr= kernel parameter */ @@ -1118,49 +911,44 @@ void __init setup_arch(char **cmdline_p) os_info_init(); setup_ipl(); - setup_task_size(); setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ - reserve_memory_end(); - reserve_oldmem(); + reserve_pgtables(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); - reserve_mem_detect_info(); + reserve_physmem_info(); + memblock_set_current_limit(ident_map_size); memblock_allow_resize(); /* Get information about *all* installed memory */ - memblock_add_mem_detect_info(); - - free_mem_detect_info(); - remove_oldmem(); - - /* - * Make sure all chunks are MAX_ORDER aligned so we don't need the - * extra checks that HOLES_IN_ZONE would require. - * - * Is this still required? - */ - memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT)); + memblock_add_physmem_info(); + free_physmem_info(); setup_memory_end(); + memblock_dump_all(); setup_memory(); - dma_contiguous_reserve(memory_end); + + relocate_amode31_section(); + setup_cr(); + setup_uv(); + dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); + if (MACHINE_HAS_EDAT2) + hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - check_initrd(); reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP /* - * Be aware that smp_save_dump_cpus() triggers a system reset. + * Be aware that smp_save_dump_secondary_cpus() triggers a system reset. * Therefore CPU and device initialization should be done afterwards. */ - smp_save_dump_cpus(); + smp_save_dump_secondary_cpus(); #endif setup_resources(); - setup_lowcore_dat_off(); + setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); cpu_init(); @@ -1168,8 +956,11 @@ void __init setup_arch(char **cmdline_p) smp_detect_cpus(); topology_init_early(); + if (test_facility(193)) + static_branch_enable(&cpu_has_bear); + /* - * Create kernel page tables and switch to virtual addressing. + * Create kernel page tables. 
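/*
 * A minimal sketch of the control-register idiom behind the setup_cr()
 * call above: read the live value, rewrite one field, and reinstall the
 * register system-wide. The ctlreg2 union, its ducto field and the
 * 64-byte-origin shift are taken from the diff; the standalone helper is
 * illustrative only.
 */
static void __init sketch_point_cr2_at_duct(unsigned long duct)
{
	union ctlreg2 cr2;

	local_ctl_store(2, &cr2.reg);	/* read current CR2 */
	cr2.ducto = duct >> 6;		/* DUCT origin is 64-byte aligned */
	system_ctl_load(2, &cr2.reg);	/* reinstall for the whole system */
}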
*/ paging_init(); @@ -1177,7 +968,9 @@ void __init setup_arch(char **cmdline_p) * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ - setup_lowcore_dat_on(); +#ifdef CONFIG_CRASH_DUMP + smp_save_dump_ipl_cpu(); +#endif /* Setup default console */ conmode_default(); @@ -1187,7 +980,7 @@ void __init setup_arch(char **cmdline_p) if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_init_branches(); - /* Setup zfcpdump support */ + /* Setup zfcp/nvme dump support */ setup_zfcpdump(); /* Add system specific data to the random pool */ diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index e6fca5498e1f..43e9661cd715 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -12,10 +12,12 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/rseq.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> #include <linux/signal.h> +#include <linux/entry-common.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/ptrace.h> @@ -24,13 +26,13 @@ #include <linux/tty.h> #include <linux/personality.h> #include <linux/binfmts.h> -#include <linux/tracehook.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> #include "entry.h" /* @@ -139,7 +141,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) { _sigregs user_sregs; - /* Alwys make any pending restarted system call return -EINTR */ + /* Always make any pending restarted system call return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) @@ -148,10 +150,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI)) return -EINVAL; - /* Test the floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | (user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); @@ -181,9 +179,9 @@ static int save_sigregs_ext(struct pt_regs *regs, int i; /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.fpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, @@ -201,7 +199,7 @@ static int restore_sigregs_ext(struct pt_regs *regs, int i; /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, @@ -209,7 +207,7 @@ static int restore_sigregs_ext(struct pt_regs *regs, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.fpu.vxrs[i].low = vxrs[i]; } return 0; } @@ -299,7 +297,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, * included in the signal frame on a 31-bit system. 
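/*
 * A minimal sketch of the vector-register spill shown in the signal.c
 * hunks above. The leftmost halves of V0-V15 overlap the floating-point
 * registers and are saved with them, so the signal frame only needs the
 * remaining low doublewords (vxrs_low) plus V16-V31 in full (vxrs_high).
 * The .low accessor follows the diff; the helper itself is illustrative.
 */
static int sketch_save_vxrs_low(_sigregs_ext __user *sregs_ext)
{
	__u64 vxrs[__NUM_VXRS_LOW];
	int i;

	for (i = 0; i < __NUM_VXRS_LOW; i++)
		vxrs[i] = current->thread.fpu.vxrs[i].low;
	if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(vxrs)))
		return -EFAULT;
	return 0;
}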
*/ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) frame_size += sizeof(frame->sregs_ext); frame = get_sigframe(ka, regs, frame_size); if (frame == (void __user *) -1UL) @@ -332,15 +330,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { + if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; - } else { - /* Signal frame without vector registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -381,7 +374,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, * included in the signal frame on a 31-bit system. */ uc_flags = 0; - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { frame_size += sizeof(_sigregs_ext); uc_flags |= UC_VXRS; } @@ -395,14 +388,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { + if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; - } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -459,7 +448,8 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset, * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -void do_signal(struct pt_regs *regs) + +void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; sigset_t *oldset = sigmask_to_save(); @@ -487,7 +477,7 @@ void do_signal(struct pt_regs *regs) regs->gprs[2] = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: regs->gprs[2] = regs->orig_gpr2; regs->psw.addr = @@ -498,6 +488,7 @@ void do_signal(struct pt_regs *regs) } /* No longer in a system call */ clear_pt_regs_flag(regs, PIF_SYSCALL); + rseq_signal_deliver(&ksig, regs); if (is_compat_task()) handle_signal32(&ksig, oldset, regs); @@ -513,16 +504,22 @@ void do_signal(struct pt_regs *regs) switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ - regs->int_code = __NR_restart_syscall; - /* fallthrough */ + regs->gprs[2] = regs->orig_gpr2; + current->restart_block.arch_data = regs->psw.addr; + if (is_compat_task()) + regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); + else + regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_thread_flag(TIF_PER_TRAP); + break; case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - /* Restart system call with magic TIF bit. 
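/*
 * A minimal sketch of the restorer selection introduced in setup_frame()
 * and setup_rt_frame() above: when the handler does not supply
 * SA_RESTORER, the return trampoline now comes from the vDSO instead of
 * an svc stub written onto the signal stack. VDSO64_SYMBOL() usage
 * follows the diff; the helper is illustrative.
 */
static unsigned long sketch_pick_restorer(struct ksignal *ksig)
{
	if (ksig->ka.sa.sa_flags & SA_RESTORER)
		return (unsigned long)ksig->ka.sa.sa_restorer;
	/* fall back to the vDSO-provided trampoline */
	return VDSO64_SYMBOL(current, rt_sigreturn);
}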
*/ regs->gprs[2] = regs->orig_gpr2; - set_pt_regs_flag(regs, PIF_SYSCALL); + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); if (test_thread_flag(TIF_SINGLE_STEP)) - clear_pt_regs_flag(regs, PIF_PER_TRAP); + clear_thread_flag(TIF_PER_TRAP); break; } } @@ -532,10 +529,3 @@ void do_signal(struct pt_regs *regs) */ restore_saved_sigmask(); } - -void do_notify_resume(struct pt_regs *regs) -{ - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); -} diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2794cad9312e..c39d9f0d4b1c 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -5,7 +5,6 @@ * Copyright IBM Corp. 1999, 2012 * Author(s): Denis Joseph Barrow, * Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, * * based on other smp stuff by * (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net> @@ -30,6 +29,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irqflags.h> +#include <linux/irq_work.h> #include <linux/cpu.h> #include <linux/slab.h> #include <linux/sched/hotplug.h> @@ -37,6 +37,8 @@ #include <linux/crash_dump.h> #include <linux/kprobes.h> #include <asm/asm-offsets.h> +#include <asm/ctlreg.h> +#include <asm/pfault.h> #include <asm/diag.h> #include <asm/switch_to.h> #include <asm/facility.h> @@ -45,9 +47,8 @@ #include <asm/irq.h> #include <asm/tlbflush.h> #include <asm/vtimer.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/sclp.h> -#include <asm/vdso.h> #include <asm/debug.h> #include <asm/os_info.h> #include <asm/sigp.h> @@ -55,12 +56,16 @@ #include <asm/nmi.h> #include <asm/stacktrace.h> #include <asm/topology.h> +#include <asm/vdso.h> +#include <asm/maccess.h> #include "entry.h" enum { ec_schedule = 0, ec_call_function_single, ec_stop_cpu, + ec_mcck_pending, + ec_irq_work, }; enum { @@ -71,7 +76,6 @@ enum { static DEFINE_PER_CPU(struct cpu *, cpu_device); struct pcpu { - struct lowcore *lowcore; /* lowcore page(s) for the cpu */ unsigned long ec_mask; /* bit mask for ec_xxx functions */ unsigned long ec_clk; /* sigp timestamp for ec_xxx */ signed char state; /* physical cpu state */ @@ -93,6 +97,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; #endif static unsigned int smp_max_threads __initdata = -1U; +cpumask_t cpu_setup_mask; static int __init early_nosmt(char *s) { @@ -110,7 +115,7 @@ early_param("smt", early_smt); /* * The smp_cpu_state_mutex must be held when changing the state or polarization - * member of a pcpu data structure within the pcpu_devices arreay. + * member of a pcpu data structure within the pcpu_devices array. 
*/ DEFINE_MUTEX(smp_cpu_state_mutex); @@ -145,7 +150,7 @@ static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm) static inline int pcpu_stopped(struct pcpu *pcpu) { - u32 uninitialized_var(status); + u32 status; if (__pcpu_sigp(pcpu->address, SIGP_SENSE, 0, &status) != SIGP_CC_STATUS_STORED) @@ -188,102 +193,100 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) { - unsigned long async_stack, nodat_stack; + unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc; - if (pcpu != &pcpu_devices[0]) { - pcpu->lowcore = (struct lowcore *) - __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); - nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - if (!pcpu->lowcore || !nodat_stack) - goto out; - } else { - nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET; - } + lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); + nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); async_stack = stack_alloc(); - if (!async_stack) + mcck_stack = stack_alloc(); + if (!lc || !nodat_stack || !async_stack || !mcck_stack) goto out; - lc = pcpu->lowcore; memcpy(lc, &S390_lowcore, 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + STACK_INIT_OFFSET; lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET; + lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET; lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ - if (nmi_alloc_per_cpu(lc)) - goto out_async; - if (vdso_alloc_per_cpu(lc)) + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; + if (nmi_alloc_mcesa(&lc->mcesad)) + goto out; + if (abs_lowcore_map(cpu, lc, true)) goto out_mcesa; lowcore_ptr[cpu] = lc; - pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc); + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc)); return 0; out_mcesa: - nmi_free_per_cpu(lc); -out_async: - stack_free(async_stack); + nmi_free_mcesa(&lc->mcesad); out: - if (pcpu != &pcpu_devices[0]) { - free_pages(nodat_stack, THREAD_SIZE_ORDER); - free_pages((unsigned long) pcpu->lowcore, LC_ORDER); - } + stack_free(mcck_stack); + stack_free(async_stack); + free_pages(nodat_stack, THREAD_SIZE_ORDER); + free_pages((unsigned long) lc, LC_ORDER); return -ENOMEM; } static void pcpu_free_lowcore(struct pcpu *pcpu) { - unsigned long async_stack, nodat_stack, lowcore; - - nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET; - async_stack = pcpu->lowcore->async_stack - STACK_INIT_OFFSET; - lowcore = (unsigned long) pcpu->lowcore; + unsigned long async_stack, nodat_stack, mcck_stack; + struct lowcore *lc; + int cpu; + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET; + async_stack = lc->async_stack - STACK_INIT_OFFSET; + mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); - lowcore_ptr[pcpu - pcpu_devices] = NULL; - vdso_free_per_cpu(pcpu->lowcore); - nmi_free_per_cpu(pcpu->lowcore); + lowcore_ptr[cpu] = NULL; + abs_lowcore_unmap(cpu); + nmi_free_mcesa(&lc->mcesad); stack_free(async_stack); - if (pcpu == &pcpu_devices[0]) - return; + stack_free(mcck_stack); free_pages(nodat_stack, THREAD_SIZE_ORDER); - free_pages(lowcore, LC_ORDER); + free_pages((unsigned long) lc, LC_ORDER); } static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { - struct 
lowcore *lc = pcpu->lowcore; + struct lowcore *lc, *abs_lc; + lc = lowcore_ptr[cpu]; cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; + lc->restart_flags = RESTART_FLAG_CTLREGS; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; lc->kernel_asce = S390_lowcore.kernel_asce; - lc->user_asce = S390_lowcore.kernel_asce; + lc->user_asce = s390_invalid_asce; lc->machine_flags = S390_lowcore.machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; - __ctl_store(lc->cregs_save_area, 0, 15); + abs_lc = get_abs_lowcore(); + memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); + put_abs_lowcore(abs_lc); lc->cregs_save_area[1] = lc->kernel_asce; - lc->cregs_save_area[7] = lc->vdso_asce; + lc->cregs_save_area[7] = lc->user_asce; save_access_regs((unsigned int *) lc->access_regs_save_area); - memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, - sizeof(lc->stfle_fac_list)); - memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, - sizeof(lc->alt_stfle_fac_list)); arch_spin_lock_setup(cpu); } static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) { - struct lowcore *lc = pcpu->lowcore; + struct lowcore *lc; + int cpu; - lc->kernel_stack = (unsigned long) task_stack_page(tsk) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->current_task = (unsigned long) tsk; + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET; + lc->current_task = (unsigned long)tsk; lc->lpp = LPP_MAGIC; lc->current_pid = tsk->pid; lc->user_timer = tsk->thread.user_timer; @@ -296,41 +299,59 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) { - struct lowcore *lc = pcpu->lowcore; + struct lowcore *lc; + int cpu; - lc->restart_stack = lc->nodat_stack; + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + lc->restart_stack = lc->kernel_stack; lc->restart_fn = (unsigned long) func; lc->restart_data = (unsigned long) data; - lc->restart_source = -1UL; + lc->restart_source = -1U; pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); } +typedef void (pcpu_delegate_fn)(void *); + /* * Call function via PSW restart on pcpu and stop the current cpu. */ -static void __pcpu_delegate(void (*func)(void*), void *data) +static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) { func(data); /* should not return */ } -static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu, - void (*func)(void *), - void *data, unsigned long stack) +static void pcpu_delegate(struct pcpu *pcpu, + pcpu_delegate_fn *func, + void *data, unsigned long stack) { - struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; - unsigned long source_cpu = stap(); + struct lowcore *lc, *abs_lc; + unsigned int source_cpu; - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - if (pcpu->address == source_cpu) - CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data); + lc = lowcore_ptr[pcpu - pcpu_devices]; + source_cpu = stap(); + + if (pcpu->address == source_cpu) { + call_on_stack(2, stack, void, __pcpu_delegate, + pcpu_delegate_fn *, func, void *, data); + } /* Stop target cpu (if func returns this stops the current cpu). 
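/*
 * A minimal sketch of the "run a function via PSW restart" mechanism used
 * by pcpu_start_fn() above: the target CPU's lowcore restart fields are
 * filled in, and a SIGP RESTART order makes that CPU load its restart PSW
 * and call func(data) on the given stack. All names come from the diff;
 * the helper is illustrative.
 */
static void sketch_start_fn(struct pcpu *pcpu, struct lowcore *lc,
			    void (*func)(void *), void *data)
{
	lc->restart_stack = lc->kernel_stack;	/* stack func() runs on */
	lc->restart_fn = (unsigned long)func;
	lc->restart_data = (unsigned long)data;
	lc->restart_source = -1U;		/* no requester to answer */
	pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);	/* kick the target CPU */
}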
*/ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + pcpu_sigp_retry(pcpu, SIGP_CPU_RESET, 0); /* Restart func on the target cpu and stop the current cpu. */ - mem_assign_absolute(lc->restart_stack, stack); - mem_assign_absolute(lc->restart_fn, (unsigned long) func); - mem_assign_absolute(lc->restart_data, (unsigned long) data); - mem_assign_absolute(lc->restart_source, source_cpu); - __bpon(); + if (lc) { + lc->restart_stack = stack; + lc->restart_fn = (unsigned long)func; + lc->restart_data = (unsigned long)data; + lc->restart_source = source_cpu; + } else { + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = stack; + abs_lc->restart_fn = (unsigned long)func; + abs_lc->restart_data = (unsigned long)data; + abs_lc->restart_source = source_cpu; + put_abs_lowcore(abs_lc); + } asm volatile( "0: sigp 0,%0,%2 # sigp restart to target cpu\n" " brc 2,0b # busy, try again\n" @@ -382,7 +403,7 @@ void smp_call_online_cpu(void (*func)(void *), void *data) */ void smp_call_ipl_cpu(void (*func)(void *), void *data) { - struct lowcore *lc = pcpu_devices->lowcore; + struct lowcore *lc = lowcore_ptr[0]; if (pcpu_devices[0].address == stap()) lc = &S390_lowcore; @@ -401,7 +422,12 @@ int smp_find_processor_id(u16 address) return -1; } -bool arch_vcpu_is_preempted(int cpu) +void schedule_mcck_handler(void) +{ + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); +} + +bool notrace arch_vcpu_is_preempted(int cpu) { if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) return false; @@ -411,17 +437,15 @@ bool arch_vcpu_is_preempted(int cpu) } EXPORT_SYMBOL(arch_vcpu_is_preempted); -void smp_yield_cpu(int cpu) +void notrace smp_yield_cpu(int cpu) { - if (MACHINE_HAS_DIAG9C) { - diag_stat_inc_norecursion(DIAG_STAT_X09C); - asm volatile("diag %0,0,0x9c" - : : "d" (pcpu_devices[cpu].address)); - } else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) { - diag_stat_inc_norecursion(DIAG_STAT_X044); - asm volatile("diag 0,0,0x44"); - } + if (!MACHINE_HAS_DIAG9C) + return; + diag_stat_inc_norecursion(DIAG_STAT_X09C); + asm volatile("diag %0,0,0x9c" + : : "d" (pcpu_devices[cpu].address)); } +EXPORT_SYMBOL_GPL(smp_yield_cpu); /* * Send cpus emergency shutdown signal. This gives the cpus the @@ -429,10 +453,12 @@ void smp_yield_cpu(int cpu) */ void notrace smp_emergency_stop(void) { - cpumask_t cpumask; + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + static cpumask_t cpumask; u64 end; int cpu; + arch_spin_lock(&lock); cpumask_copy(&cpumask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpumask); @@ -453,6 +479,7 @@ void notrace smp_emergency_stop(void) break; cpu_relax(); } + arch_spin_unlock(&lock); } NOKPROBE_SYMBOL(smp_emergency_stop); @@ -464,7 +491,7 @@ void smp_send_stop(void) int cpu; /* Disable all interrupts/machine checks */ - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + __load_psw_mask(PSW_KERNEL_BITS); trace_hardirqs_off(); debug_set_critical(); @@ -498,6 +525,10 @@ static void smp_handle_ext_call(void) scheduler_ipi(); if (test_bit(ec_call_function_single, &bits)) generic_smp_call_function_single_interrupt(); + if (test_bit(ec_mcck_pending, &bits)) + s390_handle_mcck(); + if (test_bit(ec_irq_work, &bits)) + irq_work_run(); } static void do_ext_call_interrupt(struct ext_code ext_code, @@ -525,71 +556,37 @@ void arch_send_call_function_single_ipi(int cpu) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... 
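/*
 * A minimal sketch of the dispatch pattern in smp_handle_ext_call() above:
 * all pending ec_* bits are claimed atomically, then each requested
 * service runs at most once per interrupt. The xchg()-based fetch-and-
 * clear is an assumption made for illustration; the bit names come from
 * the enum at the top of this file.
 */
static void sketch_handle_ext_call(struct pcpu *pcpu)
{
	unsigned long bits = xchg(&pcpu->ec_mask, 0);	/* claim all bits */

	if (test_bit(ec_schedule, &bits))
		scheduler_ipi();
	if (test_bit(ec_mcck_pending, &bits))
		s390_handle_mcck();
	if (test_bit(ec_irq_work, &bits))
		irq_work_run();
}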
*/ -void smp_send_reschedule(int cpu) +void arch_smp_send_reschedule(int cpu) { pcpu_ec_call(pcpu_devices + cpu, ec_schedule); } -/* - * parameter area for the set/clear control bit callbacks - */ -struct ec_creg_mask_parms { - unsigned long orval; - unsigned long andval; - int cr; -}; - -/* - * callback for setting/clearing control bits - */ -static void smp_ctl_bit_callback(void *info) -{ - struct ec_creg_mask_parms *pp = info; - unsigned long cregs[16]; - - __ctl_store(cregs, 0, 15); - cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval; - __ctl_load(cregs, 0, 15); -} - -/* - * Set a bit in a control register of all cpus - */ -void smp_ctl_set_bit(int cr, int bit) -{ - struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr }; - - on_each_cpu(smp_ctl_bit_callback, &parms, 1); -} -EXPORT_SYMBOL(smp_ctl_set_bit); - -/* - * Clear a bit in a control register of all cpus - */ -void smp_ctl_clear_bit(int cr, int bit) +#ifdef CONFIG_IRQ_WORK +void arch_irq_work_raise(void) { - struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr }; - - on_each_cpu(smp_ctl_bit_callback, &parms, 1); + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); } -EXPORT_SYMBOL(smp_ctl_clear_bit); +#endif #ifdef CONFIG_CRASH_DUMP int smp_store_status(int cpu) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct lowcore *lc; + struct pcpu *pcpu; unsigned long pa; - pa = __pa(&pcpu->lowcore->floating_pt_save_area); + pcpu = pcpu_devices + cpu; + lc = lowcore_ptr[cpu]; + pa = __pa(&lc->floating_pt_save_area); if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; - if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) + if (!cpu_has_vx() && !MACHINE_HAS_GS) return 0; - pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK); + pa = lc->mcesad & MCESA_ORIGIN_MASK; if (MACHINE_HAS_GS) - pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK; + pa |= lc->mcesad & MCESA_LC_MASK; if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; @@ -599,14 +596,14 @@ int smp_store_status(int cpu) /* * Collect CPU state of the previous, crashed system. * There are four cases: - * 1) standard zfcp dump - * condition: OLDMEM_BASE == NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * 1) standard zfcp/nvme dump + * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. The boot CPU state is located in * the absolute lowcore of the memory stored in the HSA. The zcore code * will copy the boot CPU state from the HSA. - * 2) stand-alone kdump for SCSI (zfcp dump with swapped memory) - * condition: OLDMEM_BASE != NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * 2) stand-alone kdump for SCSI/NVMe (zfcp/nvme dump with swapped memory) + * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. 
The firmware or the boot-loader * stored the registers of the boot CPU in the absolute lowcore in the @@ -622,42 +619,39 @@ int smp_store_status(int cpu) * This case does not exist for s390 anymore, setup_arch explicitly * deactivates the elfcorehdr= kernel parameter */ -static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +static bool dump_available(void) { - __vector128 *vxrs = (__vector128 *) page; - - if (is_boot_cpu) - vxrs = boot_cpu_vector_save_area; - else - __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page); - save_area_add_vxrs(sa, vxrs); + return oldmem_data.start || is_ipl_type_dump(); } -static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +void __init smp_save_dump_ipl_cpu(void) { - void *regs = (void *) page; + struct save_area *sa; + void *regs; - if (is_boot_cpu) - copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512); - else - __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page); + if (!dump_available()) + return; + sa = save_area_alloc(true); + regs = memblock_alloc(512, 8); + if (!sa || !regs) + panic("could not allocate memory for boot CPU save area\n"); + copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); + memblock_free(regs, 512); + if (cpu_has_vx()) + save_area_add_vxrs(sa, boot_cpu_vector_save_area); } -void __init smp_save_dump_cpus(void) +void __init smp_save_dump_secondary_cpus(void) { int addr, boot_cpu_addr, max_cpu_addr; struct save_area *sa; - unsigned long page; - bool is_boot_cpu; + void *page; - if (!(OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP)) - /* No previous system present, normal boot. */ + if (!dump_available()) return; /* Allocate a page as dumping area for the store status sigps */ - page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31); + page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!page) panic("ERROR: Failed to allocate %lx bytes below %lx\n", PAGE_SIZE, 1UL << 31); @@ -667,29 +661,23 @@ void __init smp_save_dump_cpus(void) boot_cpu_addr = stap(); max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; for (addr = 0; addr <= max_cpu_addr; addr++) { + if (addr == boot_cpu_addr) + continue; if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == SIGP_CC_NOT_OPERATIONAL) continue; - is_boot_cpu = (addr == boot_cpu_addr); - /* Allocate save area */ - sa = save_area_alloc(is_boot_cpu); + sa = save_area_alloc(false); if (!sa) panic("could not allocate memory for save area\n"); - if (MACHINE_HAS_VX) - /* Get the vector registers */ - smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); - /* - * For a zfcp dump OLDMEM_BASE == NULL and the registers - * of the boot CPU are stored in the HSA. 
To retrieve - * these registers an SCLP request is required which is - * done by drivers/s390/char/zcore.c:init_cpu_info() - */ - if (!is_boot_cpu || OLDMEM_BASE) - /* Get the CPU registers */ - smp_save_cpu_regs(sa, addr, is_boot_cpu, page); + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); + save_area_add_regs(sa, page); + if (cpu_has_vx()) { + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page)); + save_area_add_vxrs(sa, page); + } } memblock_free(page, PAGE_SIZE); - diag_dma_ops.diag308_reset(); + diag_amode31_ops.diag308_reset(); pcpu_set_smt(0); } #endif /* CONFIG_CRASH_DUMP */ @@ -704,6 +692,11 @@ int smp_cpu_get_polarization(int cpu) return pcpu_devices[cpu].polarization; } +int smp_cpu_get_cpu_address(int cpu) +{ + return pcpu_devices[cpu].address; +} + static void __ref smp_get_core_info(struct sclp_core_info *info, int early) { static int use_sigp_detection; @@ -763,11 +756,13 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) { struct sclp_core_entry *core; - cpumask_t avail; + static cpumask_t avail; bool configured; u16 core_id; int nr, i; + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); nr = 0; cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask); /* @@ -788,6 +783,8 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) configured = i < info->configured; nr += smp_add_core(&info->core[i], &avail, configured, early); } + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); return nr; } @@ -835,83 +832,70 @@ void __init smp_detect_cpus(void) pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); /* Add CPUs present at boot */ - get_online_cpus(); __smp_rescan_cpus(info, true); - put_online_cpus(); - memblock_free_early((unsigned long)info, sizeof(*info)); + memblock_free(info, sizeof(*info)); } -static void smp_init_secondary(void) +/* + * Activate a secondary processor. + */ +static void smp_start_secondary(void *cpuvoid) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); S390_lowcore.last_update_clock = get_tod_clock(); + S390_lowcore.restart_stack = (unsigned long)restart_stack; + S390_lowcore.restart_fn = (unsigned long)do_restart; + S390_lowcore.restart_data = 0; + S390_lowcore.restart_source = -1U; + S390_lowcore.restart_flags = 0; restore_access_regs(S390_lowcore.access_regs_save_area); - set_cpu_flag(CIF_ASCE_PRIMARY); - set_cpu_flag(CIF_ASCE_SECONDARY); cpu_init(); - preempt_disable(); + rcutree_report_cpu_starting(cpu); init_cpu_timer(); vtime_init(); + vdso_getcpu_init(); pfault_init(); - notify_cpu_starting(smp_processor_id()); + cpumask_set_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); + notify_cpu_starting(cpu); if (topology_cpu_dedicated(cpu)) set_cpu_flag(CIF_DEDICATED_CPU); else clear_cpu_flag(CIF_DEDICATED_CPU); - set_cpu_online(smp_processor_id(), true); + set_cpu_online(cpu, true); inc_irq_stat(CPU_RST); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/* - * Activate a secondary processor. 
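/*
 * A minimal sketch of the per-CPU dump collection loop above: each
 * operational CPU stores its status into a scratch page via SIGP, the page
 * is appended to a save area, and with vector support a second SIGP order
 * fetches the VX state as well. Names follow the diff; error handling is
 * reduced to the panic shown there.
 */
static void __init sketch_save_one_cpu(int addr, void *page)
{
	struct save_area *sa = save_area_alloc(false);

	if (!sa)
		panic("could not allocate memory for save area\n");
	__pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page));
	save_area_add_regs(sa, page);
	if (cpu_has_vx()) {
		__pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page));
		save_area_add_vxrs(sa, page);
	}
}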
- */ -static void __no_sanitize_address smp_start_secondary(void *cpuvoid) -{ - S390_lowcore.restart_stack = (unsigned long) restart_stack; - S390_lowcore.restart_fn = (unsigned long) do_restart; - S390_lowcore.restart_data = 0; - S390_lowcore.restart_source = -1UL; - __ctl_load(S390_lowcore.cregs_save_area, 0, 15); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack); -} - /* Upping and downing of CPUs */ int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - struct pcpu *pcpu; - int base, i, rc; + struct pcpu *pcpu = pcpu_devices + cpu; + int rc; - pcpu = pcpu_devices + cpu; if (pcpu->state != CPU_STATE_CONFIGURED) return -EIO; - base = smp_get_base_cpu(cpu); - for (i = 0; i <= smp_cpu_mtid; i++) { - if (base + i < nr_cpu_ids) - if (cpu_online(base + i)) - break; - } - /* - * If this is the first CPU of the core to get online - * do an initial CPU reset. - */ - if (i > smp_cpu_mtid && - pcpu_sigp_retry(pcpu_devices + base, SIGP_INITIAL_CPU_RESET, 0) != + if (pcpu_sigp_retry(pcpu, SIGP_INITIAL_CPU_RESET, 0) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; rc = pcpu_alloc_lowcore(pcpu, cpu); if (rc) return rc; + /* + * Make sure global control register contents do not change + * until new CPU has initialized control registers. + */ + system_ctlreg_lock(); pcpu_prepare_secondary(pcpu, cpu); pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ while (!cpu_online(cpu)) cpu_relax(); + system_ctlreg_unlock(); return 0; } @@ -926,19 +910,23 @@ early_param("possible_cpus", _setup_possible_cpus); int __cpu_disable(void) { - unsigned long cregs[16]; + struct ctlreg cregs[16]; + int cpu; /* Handle possible pending IPIs */ smp_handle_ext_call(); - set_cpu_online(smp_processor_id(), false); + cpu = smp_processor_id(); + set_cpu_online(cpu, false); + cpumask_clear_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); /* Disable pseudo page faults on this cpu. */ pfault_fini(); /* Disable interrupt sources via control register. 
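/*
 * A minimal sketch of the bring-up window that __cpu_up() above guards
 * with system_ctlreg_lock(): between snapshotting the control registers
 * into the new CPU's lowcore and that CPU loading them, no other CPU may
 * broadcast control-register changes, or the newcomer would start with
 * stale values. Condensed from the diff; illustrative only.
 */
static int sketch_cpu_up_locked(struct pcpu *pcpu, int cpu,
				struct task_struct *tidle)
{
	system_ctlreg_lock();			/* freeze global ctlregs */
	pcpu_prepare_secondary(pcpu, cpu);	/* snapshot cregs to lowcore */
	pcpu_attach_task(pcpu, tidle);
	pcpu_start_fn(pcpu, smp_start_secondary, NULL);
	while (!cpu_online(cpu))		/* wait for cregs to be loaded */
		cpu_relax();
	system_ctlreg_unlock();
	return 0;
}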
*/ - __ctl_store(cregs, 0, 15); - cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */ - cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */ - cregs[14] &= ~0x1f000000UL; /* disable most machine checks */ - __ctl_load(cregs, 0, 15); + __local_ctl_store(0, 15, cregs); + cregs[0].val &= ~0x0000ee70UL; /* disable all external interrupts */ + cregs[6].val &= ~0xff000000UL; /* disable all I/O interrupts */ + cregs[14].val &= ~0x1f000000UL; /* disable most machine checks */ + __local_ctl_load(0, 15, cregs); clear_cpu_flag(CIF_NOHZ_DELAY); return 0; } @@ -959,7 +947,6 @@ void __cpu_die(unsigned int cpu) void __noreturn cpu_die(void) { idle_task_exit(); - __bpon(); pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); for (;;) ; } @@ -979,12 +966,12 @@ void __init smp_fill_possible_mask(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - /* request the 0x1201 emergency signal external interrupt */ if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1201"); - /* request the 0x1202 external call external interrupt */ + system_ctl_set_bit(0, 14); if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); + system_ctl_set_bit(0, 13); } void __init smp_prepare_boot_cpu(void) @@ -993,15 +980,10 @@ void __init smp_prepare_boot_cpu(void) WARN_ON(!cpu_present(0) || !cpu_online(0)); pcpu->state = CPU_STATE_CONFIGURED; - pcpu->lowcore = (struct lowcore *)(unsigned long) store_prefix(); S390_lowcore.percpu_offset = __per_cpu_offset[0]; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); } -void __init smp_cpus_done(unsigned int max_cpus) -{ -} - void __init smp_setup_processor_id(void) { pcpu_devices[0].address = stap(); @@ -1044,14 +1026,12 @@ static ssize_t cpu_configure_store(struct device *dev, return -EINVAL; if (val != 0 && val != 1) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smp_cpu_state_mutex); rc = -EBUSY; - /* disallow configuration changes of online cpus and cpu 0 */ + /* disallow configuration changes of online cpus */ cpu = dev->id; cpu = smp_get_base_cpu(cpu); - if (cpu == 0) - goto out; for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_online(cpu + i)) goto out; @@ -1093,7 +1073,7 @@ static ssize_t cpu_configure_store(struct device *dev, } out: mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return rc ? 
rc : count; } static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); @@ -1131,6 +1111,7 @@ static int smp_cpu_online(unsigned int cpu) return sysfs_create_group(&s->kobj, &cpu_online_attr_group); } + static int smp_cpu_pre_down(unsigned int cpu) { struct device *s = &per_cpu(cpu_device, cpu)->dev; @@ -1150,7 +1131,7 @@ static int smp_add_present_cpu(int cpu) return -ENOMEM; per_cpu(cpu_device, cpu) = c; s = &c->dev; - c->hotpluggable = 1; + c->hotpluggable = !!cpu; rc = register_cpu(c, cpu); if (rc) goto out; @@ -1179,11 +1160,7 @@ int __ref smp_rescan_cpus(void) if (!info) return -ENOMEM; smp_get_core_info(info, 0); - get_online_cpus(); - mutex_lock(&smp_cpu_state_mutex); nr = __smp_rescan_cpus(info, false); - mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); kfree(info); if (nr) topology_schedule_update(); @@ -1208,11 +1185,17 @@ static DEVICE_ATTR_WO(rescan); static int __init s390_smp_init(void) { + struct device *dev_root; int cpu, rc = 0; - rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan); - if (rc) - return rc; + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_rescan); + put_device(dev_root); + if (rc) + return rc; + } + for_each_present_cpu(cpu) { rc = smp_add_present_cpu(cpu); if (rc) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index fc5419ac64c8..94f440e38303 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -3,13 +3,15 @@ * Stack trace management functions * * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/stacktrace.h> +#include <linux/uaccess.h> +#include <linux/compat.h> #include <asm/stacktrace.h> #include <asm/unwind.h> #include <asm/kprobes.h> +#include <asm/ptrace.h> void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) @@ -19,17 +21,11 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, unwind_for_each_frame(&state, task, regs, 0) { addr = unwind_get_return_address(&state); - if (!addr || !consume_entry(cookie, addr, false)) + if (!addr || !consume_entry(cookie, addr)) break; } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { @@ -47,16 +43,16 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; -#ifdef CONFIG_KPROBES +#ifdef CONFIG_RETHOOK /* - * Mark stacktraces with kretprobed functions on them + * Mark stacktraces with krethook functions on them * as unreliable. 
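/*
 * A minimal sketch of the bus_get_dev_root() pattern adopted by
 * s390_smp_init() above: look up the subsystem root device, use it, and
 * drop the reference with put_device() instead of dereferencing
 * cpu_subsys.dev_root directly. All names are taken from the diff.
 */
static int sketch_add_rescan_attr(void)
{
	struct device *dev_root = bus_get_dev_root(&cpu_subsys);
	int rc = 0;

	if (dev_root) {
		rc = device_create_file(dev_root, &dev_attr_rescan);
		put_device(dev_root);	/* release bus_get_dev_root() ref */
	}
	return rc;
}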
*/ - if (state.ip == (unsigned long)kretprobe_trampoline) + if (state.ip == (unsigned long)arch_rethook_trampoline) return -EINVAL; #endif - if (!consume_entry(cookie, addr, false)) + if (!consume_entry(cookie, addr)) return -EINVAL; } @@ -65,3 +61,43 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, return -EINVAL; return 0; } + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (is_compat_task()) + return; + if (!consume_entry(cookie, instruction_pointer(regs))) + return; + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (1) { + if (__get_user(sp, &sf->back_chain)) + break; + if (__get_user(ip, &sf->gprs[8])) + break; + if (ip & 0x1) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. + */ + if (first && !(regs->gprs[14] & 0x1)) + ip = regs->gprs[14]; + else + break; + } + if (!consume_entry(cookie, ip)) + break; + /* Sanity check: ABI requires SP to be aligned 8 bytes. */ + if (!sp || sp & 0x7) + break; + sf = (void __user *)sp; + first = false; + } + pagefault_enable(); +} diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 888cc2f166db..30bb20461db4 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -317,7 +317,9 @@ static void fill_diag(struct sthyi_sctns *sctns) if (pages <= 0) return; - diag204_buf = vmalloc(array_size(pages, PAGE_SIZE)); + diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); if (!diag204_buf) return; @@ -395,19 +397,18 @@ out: static int sthyi(u64 vaddr, u64 *rc) { - register u64 code asm("0") = 0; - register u64 addr asm("2") = vaddr; - register u64 rcode asm("3"); + union register_pair r1 = { .even = 0, }; /* subcode */ + union register_pair r2 = { .even = vaddr, }; int cc; asm volatile( - ".insn rre,0xB2560000,%[code],%[addr]\n" + ".insn rre,0xB2560000,%[r1],%[r2]\n" "ipm %[cc]\n" "srl %[cc],28\n" - : [cc] "=d" (cc), "=d" (rcode) - : [code] "d" (code), [addr] "a" (addr) + : [cc] "=&d" (cc), [r2] "+&d" (r2.pair) + : [r1] "d" (r1.pair) : "memory", "cc"); - *rc = rcode; + *rc = r2.odd; return cc; } @@ -460,9 +461,9 @@ static int sthyi_update_cache(u64 *rc) * * Fills the destination with system information returned by the STHYI * instruction. The data is generated by emulation or execution of STHYI, - * if available. The return value is the condition code that would be - * returned, the rc parameter is the return code which is passed in - * register R2 + 1. + * if available. The return value is either a negative error value or + * the condition code that would be returned, the rc parameter is the + * return code which is passed in register R2 + 1. */ int sthyi_fill(void *dst, u64 *rc) { diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c deleted file mode 100644 index 75b7b307946e..000000000000 --- a/arch/s390/kernel/suspend.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Suspend support specific for s390. - * - * Copyright IBM Corp. 
2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - */ - -#include <linux/pfn.h> -#include <linux/suspend.h> -#include <linux/mm.h> -#include <linux/pci.h> -#include <asm/ctl_reg.h> -#include <asm/ipl.h> -#include <asm/cio.h> -#include <asm/sections.h> -#include "entry.h" - -/* - * The restore of the saved pages in an hibernation image will set - * the change and referenced bits in the storage key for each page. - * Overindication of the referenced bits after an hibernation cycle - * does not cause any harm but the overindication of the change bits - * would cause trouble. - * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each - * page to the most significant byte of the associated page frame - * number in the hibernation image. - */ - -/* - * Key storage is allocated as a linked list of pages. - * The size of the keys array is (PAGE_SIZE - sizeof(long)) - */ -struct page_key_data { - struct page_key_data *next; - unsigned char data[]; -}; - -#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *)) - -static struct page_key_data *page_key_data; -static struct page_key_data *page_key_rp, *page_key_wp; -static unsigned long page_key_rx, page_key_wx; -unsigned long suspend_zero_pages; - -/* - * For each page in the hibernation image one additional byte is - * stored in the most significant byte of the page frame number. - * On suspend no additional memory is required but on resume the - * keys need to be memorized until the page data has been restored. - * Only then can the storage keys be set to their old state. - */ -unsigned long page_key_additional_pages(unsigned long pages) -{ - return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); -} - -/* - * Free page_key_data list of arrays. - */ -void page_key_free(void) -{ - struct page_key_data *pkd; - - while (page_key_data) { - pkd = page_key_data; - page_key_data = pkd->next; - free_page((unsigned long) pkd); - } -} - -/* - * Allocate page_key_data list of arrays with enough room to store - * one byte for each page in the hibernation image. - */ -int page_key_alloc(unsigned long pages) -{ - struct page_key_data *pk; - unsigned long size; - - size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); - while (size--) { - pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL); - if (!pk) { - page_key_free(); - return -ENOMEM; - } - pk->next = page_key_data; - page_key_data = pk; - } - page_key_rp = page_key_wp = page_key_data; - page_key_rx = page_key_wx = 0; - return 0; -} - -/* - * Save the storage key into the upper 8 bits of the page frame number. - */ -void page_key_read(unsigned long *pfn) -{ - struct page *page; - unsigned long addr; - unsigned char key; - - page = pfn_to_page(*pfn); - addr = (unsigned long) page_address(page); - key = (unsigned char) page_get_storage_key(addr) & 0x7f; - if (arch_test_page_nodat(page)) - key |= 0x80; - *(unsigned char *) pfn = key; -} - -/* - * Extract the storage key from the upper 8 bits of the page frame number - * and store it in the page_key_data list of arrays. - */ -void page_key_memorize(unsigned long *pfn) -{ - page_key_wp->data[page_key_wx] = *(unsigned char *) pfn; - *(unsigned char *) pfn = 0; - if (++page_key_wx < PAGE_KEY_DATA_SIZE) - return; - page_key_wp = page_key_wp->next; - page_key_wx = 0; -} - -/* - * Get the next key from the page_key_data list of arrays and set the - * storage key of the page referred by @address. 
If @address refers to - * a "safe" page the swsusp_arch_resume code will transfer the storage - * key from the buffer page to the original page. - */ -void page_key_write(void *address) -{ - struct page *page; - unsigned char key; - - key = page_key_rp->data[page_key_rx]; - page_set_storage_key((unsigned long) address, key & 0x7f, 0); - page = virt_to_page(address); - if (key & 0x80) - arch_set_page_nodat(page, 0); - else - arch_set_page_dat(page, 0); - if (++page_key_rx >= PAGE_KEY_DATA_SIZE) - return; - page_key_rp = page_key_rp->next; - page_key_rx = 0; -} - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); - unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end)); - unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1; - unsigned long stext_pfn = PFN_DOWN(__pa(_stext)); - - /* Always save lowcore pages (LC protection might be enabled). */ - if (pfn <= LC_PAGES) - return 0; - if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) - return 1; - /* Skip memory holes and read-only pages (DCSS, ...). */ - if (pfn >= stext_pfn && pfn <= end_rodata_pfn) - return 0; - if (tprot(PFN_PHYS(pfn))) - return 1; - return 0; -} - -/* - * PM notifier callback for suspend - */ -static int suspend_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER); - if (!suspend_zero_pages) - return NOTIFY_BAD; - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - free_pages(suspend_zero_pages, LC_ORDER); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init suspend_pm_init(void) -{ - pm_notifier(suspend_pm_cb, 0); - return 0; -} -arch_initcall(suspend_pm_init); - -void save_processor_state(void) -{ - /* swsusp_arch_suspend() actually saves all cpu register contents. - * Machine checks must be disabled since swsusp_arch_suspend() stores - * register contents to their lowcore save areas. That's the same - * place where register contents on machine checks would be saved. - * To avoid register corruption disable machine checks. - * We must also disable machine checks in the new psw mask for - * program checks, since swsusp_arch_suspend() may generate program - * checks. Disabling machine checks for all other new psw masks is - * just paranoia. - */ - local_mcck_disable(); - /* Disable lowcore protection */ - __ctl_clear_bit(0,28); - S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK; -} - -void restore_processor_state(void) -{ - S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK; - /* Enable lowcore protection */ - __ctl_set_bit(0,28); - local_mcck_enable(); -} - -/* Called at the end of swsusp_arch_resume */ -void s390_early_resume(void) -{ - lgr_info_log(); - channel_subsystem_reinit(); - zpci_rescan(); -} diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S deleted file mode 100644 index a7baf0b5f818..000000000000 --- a/arch/s390/kernel/swsusp.S +++ /dev/null @@ -1,276 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 64-bit swsusp implementation - * - * Copyright IBM Corp. 
2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - * Michael Holzheu <holzheu@linux.vnet.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/page.h> -#include <asm/ptrace.h> -#include <asm/thread_info.h> -#include <asm/asm-offsets.h> -#include <asm/nospec-insn.h> -#include <asm/sigp.h> - -/* - * Save register context in absolute 0 lowcore and call swsusp_save() to - * create in-memory kernel image. The context is saved in the designated - * "store status" memory locations (see POP). - * We return from this function twice. The first time during the suspend to - * disk process. The second time via the swsusp_arch_resume() function - * (see below) in the resume process. - * This function runs with disabled interrupts. - */ - GEN_BR_THUNK %r14 - - .section .text -ENTRY(swsusp_arch_suspend) - lg %r1,__LC_NODAT_STACK - stmg %r6,%r15,__SF_GPRS(%r1) - aghi %r1,-STACK_FRAME_OVERHEAD - stg %r15,__SF_BACKCHAIN(%r1) - lgr %r15,%r1 - - /* Store FPU registers */ - brasl %r14,save_fpu_regs - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Store prefix register on stack */ - stpx __SF_EMPTY(%r15) - - /* Save prefix register contents for lowcore copy */ - llgf %r10,__SF_EMPTY(%r15) - - /* Get pointer to save area */ - lghi %r1,0x1000 - - /* Save CPU address */ - stap __LC_EXT_CPU_ADDR(%r0) - - /* Store registers */ - mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */ - stam %a0,%a15,0x340(%r1) /* store access registers */ - stctg %c0,%c15,0x380(%r1) /* store control registers */ - stmg %r0,%r15,0x280(%r1) /* store general registers */ - - stpt 0x328(%r1) /* store timer */ - stck __SF_EMPTY(%r15) /* store clock */ - stckc 0x330(%r1) /* store clock comparator */ - - /* Update cputime accounting before going to sleep */ - lg %r0,__LC_LAST_UPDATE_TIMER - slg %r0,0x328(%r1) - alg %r0,__LC_SYSTEM_TIMER - stg %r0,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1) - lg %r0,__LC_LAST_UPDATE_CLOCK - slg %r0,__SF_EMPTY(%r15) - alg %r0,__LC_STEAL_TIMER - stg %r0,__LC_STEAL_TIMER - mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Save absolute zero pages */ - larl %r2,suspend_zero_pages - lg %r2,0(%r2) - lghi %r4,0 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Copy lowcore to absolute zero lowcore */ - lghi %r2,0 - lgr %r4,%r10 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Save image */ - brasl %r14,swsusp_save - - /* Restore prefix register and return */ - lghi %r1,0x1000 - spx 0x318(%r1) - lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_suspend) - -/* - * Restore saved memory image to correct place and restore register context. - * Then we return to the function that called swsusp_arch_suspend(). - * swsusp_arch_resume() runs with disabled interrupts. 
- */ -ENTRY(swsusp_arch_resume) - stmg %r6,%r15,__SF_GPRS(%r15) - lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD - stg %r1,__SF_BACKCHAIN(%r15) - - /* Make all free pages stable */ - lghi %r2,1 - brasl %r14,arch_set_page_states - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Restore saved image */ - larl %r1,restore_pblist - lg %r1,0(%r1) - ltgr %r1,%r1 - jz 2f -0: - lg %r2,8(%r1) - lg %r4,0(%r1) - iske %r0,%r4 - lghi %r3,PAGE_SIZE - lghi %r5,PAGE_SIZE -1: - mvcle %r2,%r4,0 - jo 1b - lg %r2,8(%r1) - sske %r0,%r2 - lg %r1,16(%r1) - ltgr %r1,%r1 - jnz 0b -2: - ptlb /* flush tlb */ - - /* Reset System */ - larl %r1,.Lnew_pgm_check_psw - epsw %r2,%r3 - stm %r2,%r3,0(%r1) - mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) - larl %r1,__swsusp_reset_dma - lg %r1,0(%r1) - BASR_EX %r14,%r1 - larl %r1,smp_cpu_mt_shift - icm %r1,15,0(%r1) - jz smt_done - llgfr %r1,%r1 -smt_loop: - sigp %r1,%r0,SIGP_SET_MULTI_THREADING - brc 8,smt_done /* accepted */ - brc 2,smt_loop /* busy, try again */ -smt_done: - larl %r1,.Lnew_pgm_check_psw - lpswe 0(%r1) -pgm_check_entry: - - /* Switch to original suspend CPU */ - larl %r1,.Lresume_cpu /* Resume CPU address: r2 */ - stap 0(%r1) - llgh %r2,0(%r1) - llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */ - cgr %r1,%r2 - je restore_registers /* r1 = r2 -> nothing to do */ - larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */ - mvc __LC_RST_NEW_PSW(16,%r0),0(%r4) -3: - sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */ - brc 8,4f /* accepted */ - brc 2,3b /* busy, try again */ - - /* Suspend CPU not available -> panic */ - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD - larl %r2,.Lpanic_string - brasl %r14,sclp_early_printk_force - larl %r3,.Ldisabled_wait_31 - lpsw 0(%r3) -4: - /* Switch to suspend CPU */ - sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */ - brc 2,4b /* busy, try again */ -5: - sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */ - brc 2,5b /* busy, try again */ -6: j 6b - -restart_suspend: - larl %r1,.Lresume_cpu - llgh %r2,0(%r1) -7: - sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */ - brc 8,7b /* accepted, status 0, still running */ - brc 2,7b /* busy, try again */ - tmll %r9,0x40 /* Test if resume CPU is stopped */ - jz 7b - -restore_registers: - /* Restore registers */ - lghi %r13,0x1000 /* %r1 = pointer to save area */ - - /* Ignore time spent in suspended state. 
*/ - llgf %r1,0x318(%r13) - stck __LC_LAST_UPDATE_CLOCK(%r1) - spt 0x328(%r13) /* reprogram timer */ - //sckc 0x330(%r13) /* set clock comparator */ - - lctlg %c0,%c15,0x380(%r13) /* load control registers */ - lam %a0,%a15,0x340(%r13) /* load access registers */ - - /* Load old stack */ - lg %r15,0x2f8(%r13) - - /* Save prefix register */ - mvc __SF_EMPTY(4,%r15),0x318(%r13) - - /* Restore absolute zero pages */ - lghi %r2,0 - larl %r4,suspend_zero_pages - lg %r4,0(%r4) - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Restore prefix register */ - spx __SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Make all free pages unstable */ - lghi %r2,0 - brasl %r14,arch_set_page_states - - /* Call arch specific early resume code */ - brasl %r14,s390_early_resume - - /* Return 0 */ - lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_resume) - - .section .data..nosave,"aw",@progbits - .align 8 -.Ldisabled_wait_31: - .long 0x000a0000,0x00000000 -.Lpanic_string: - .asciz "Resume not possible because suspend CPU is no longer available\n" - .align 8 -.Lrestart_suspend_psw: - .quad 0x0000000180000000,restart_suspend -.Lnew_pgm_check_psw: - .quad 0,pgm_check_entry -.Lresume_cpu: - .byte 0,0 diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/syscall.c index 202fa73ac167..dc2355c623d6 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/syscall.c @@ -29,6 +29,13 @@ #include <linux/unistd.h> #include <linux/ipc.h> #include <linux/uaccess.h> +#include <linux/string.h> +#include <linux/thread_info.h> +#include <linux/entry-common.h> + +#include <asm/ptrace.h> +#include <asm/vtime.h> + #include "entry.h" /* @@ -100,3 +107,64 @@ SYSCALL_DEFINE0(ni_syscall) { return -ENOSYS; } + +static void do_syscall(struct pt_regs *regs) +{ + unsigned long nr; + + nr = regs->int_code & 0xffff; + if (!nr) { + nr = regs->gprs[1] & 0xffff; + regs->int_code &= ~0xffffUL; + regs->int_code |= nr; + } + + regs->gprs[2] = nr; + + if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { + regs->psw.addr = current->restart_block.arch_data; + current->restart_block.arch_data = 1; + } + nr = syscall_enter_from_user_mode_work(regs, nr); + + /* + * In the s390 ptrace ABI, both the syscall number and the return value + * use gpr2. However, userspace puts the syscall number either in the + * svc instruction itself, or uses gpr1. To make at least skipping syscalls + * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here + * and if set, the syscall will be skipped. 
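
The number decoding at the top of do_syscall() can be checked in isolation: "svc N" carries the syscall number in the instruction itself (and hence in int_code), while "svc 0" passes a number too large for the immediate field in gpr1. A minimal sketch; the sample int_code values are made up, the real ones come from the lowcore SVC interruption code:

    #include <stdio.h>

    /* Mirrors the decoding in do_syscall(). */
    static unsigned long decode_svc_nr(unsigned long int_code, unsigned long gpr1)
    {
            unsigned long nr = int_code & 0xffff;

            if (!nr)                /* "svc 0": number was passed in gpr1 */
                    nr = gpr1 & 0xffff;
            return nr;
    }

    int main(void)
    {
            printf("svc 4 -> nr %lu\n", decode_svc_nr(0x00020004UL, 0));   /* direct */
            printf("svc 0 -> nr %lu\n", decode_svc_nr(0x00020000UL, 435)); /* via gpr1 */
            return 0;
    }
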
+ */ + + if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) + goto out; + regs->gprs[2] = -ENOSYS; + if (likely(nr >= NR_syscalls)) + goto out; + do { + regs->gprs[2] = current->thread.sys_call_table[nr](regs); + } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); +out: + syscall_exit_to_user_mode_work(regs); +} + +void noinstr __do_syscall(struct pt_regs *regs, int per_trap) +{ + add_random_kstack_offset(); + enter_from_user_mode(regs); + regs->psw = S390_lowcore.svc_old_psw; + regs->int_code = S390_lowcore.svc_int_code; + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + + if (per_trap) + set_thread_flag(TIF_PER_TRAP); + + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); + do_syscall(regs); + exit_to_user_mode(); +} diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index b98f25029b8e..fb85e797946d 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -21,8 +21,7 @@ uapi: $(uapi-hdrs-y) # Create output directory if not already present -_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ - $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') +$(shell mkdir -p $(uapi) $(kapi)) filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 3054e9c035a3..095bb86339a7 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -26,7 +26,7 @@ 16 32 lchown - sys_lchown16 19 common lseek sys_lseek compat_sys_lseek 20 common getpid sys_getpid sys_getpid -21 common mount sys_mount compat_sys_mount +21 common mount sys_mount sys_mount 22 common umount sys_oldumount sys_oldumount 23 32 setuid - sys_setuid16 24 32 getuid - sys_getuid16 @@ -100,7 +100,7 @@ 106 common stat sys_newstat compat_sys_newstat 107 common lstat sys_newlstat compat_sys_newlstat 108 common fstat sys_newfstat compat_sys_newfstat -110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +110 common lookup_dcookie - - 111 common vhangup sys_vhangup sys_vhangup 112 common idle - - 114 common wait4 sys_wait4 compat_sys_wait4 @@ -122,7 +122,7 @@ 131 common quotactl sys_quotactl sys_quotactl 132 common getpgid sys_getpgid sys_getpgid 133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush sys_bdflush +134 common bdflush sys_ni_syscall sys_ni_syscall 135 common sysfs sys_sysfs sys_sysfs 136 common personality sys_s390_personality sys_s390_personality 137 common afs_syscall - - @@ -134,11 +134,11 @@ 142 64 select sys_select - 143 common flock sys_flock sys_flock 144 common msync sys_msync sys_msync -145 common readv sys_readv compat_sys_readv -146 common writev sys_writev compat_sys_writev +145 common readv sys_readv sys_readv +146 common writev sys_writev sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl - - 150 common mlock sys_mlock sys_mlock 151 common munlock sys_munlock sys_munlock 152 common mlockall sys_mlockall sys_mlockall @@ -274,9 +274,9 @@ 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common 
get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink 273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 @@ -293,7 +293,7 @@ 284 common inotify_init sys_inotify_init sys_inotify_init 285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 common migrate_pages sys_migrate_pages sys_migrate_pages 288 common openat sys_openat compat_sys_openat 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat @@ -316,8 +316,8 @@ 306 common splice sys_splice sys_splice 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range 308 common tee sys_tee sys_tee -309 common vmsplice sys_vmsplice compat_sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages +309 common vmsplice sys_vmsplice sys_vmsplice +310 common move_pages sys_move_pages sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 313 common utimes sys_utimes sys_utimes_time32 @@ -347,8 +347,8 @@ 337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 338 common syncfs sys_syncfs sys_syncfs 339 common setns sys_setns sys_setns -340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv -341 common process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev 342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr 343 common kcmp sys_kcmp sys_kcmp 344 common finit_module sys_finit_module sys_finit_module @@ -372,8 +372,8 @@ 362 common connect sys_connect sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname sys_getsockname 368 common getpeername sys_getpeername sys_getpeername 369 common sendto sys_sendto sys_sendto @@ -438,3 +438,29 @@ 433 common fspick sys_fspick sys_fspick 434 common pidfd_open sys_pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 sys_clone3 +436 common close_range sys_close_range sys_close_range +437 common openat2 sys_openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr sys_mount_setattr +443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd +444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret sys_memfd_secret 
+448 common process_mrelease sys_process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue sys_futex_requeue +457 common statmount sys_statmount sys_statmount +458 common listmount sys_listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 2ac3c9b56a13..f6f8f498c9be 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -14,6 +14,7 @@ #include <linux/delay.h> #include <linux/export.h> #include <linux/slab.h> +#include <asm/asm-extable.h> #include <asm/ebcdic.h> #include <asm/debug.h> #include <asm/sysinfo.h> @@ -25,19 +26,22 @@ int topology_max_mnest; static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) { - register int r0 asm("0") = (fc << 28) | sel1; - register int r1 asm("1") = sel2; + int r0 = (fc << 28) | sel1; int rc = 0; asm volatile( - " stsi 0(%3)\n" + " lr 0,%[r0]\n" + " lr 1,%[r1]\n" + " stsi 0(%[sysinfo])\n" "0: jz 2f\n" - "1: lhi %1,%4\n" - "2:\n" + "1: lhi %[rc],%[retval]\n" + "2: lr %[r0],0\n" EX_TABLE(0b, 1b) - : "+d" (r0), "+d" (rc) - : "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP) - : "cc", "memory"); + : [r0] "+d" (r0), [rc] "+d" (rc) + : [r1] "d" (sel2), + [sysinfo] "a" (sysinfo), + [retval] "K" (-EOPNOTSUPP) + : "cc", "0", "1", "memory"); *lvl = ((unsigned int) r0) >> 28; return rc; } @@ -77,10 +81,12 @@ static bool convert_ext_name(unsigned char encoding, char *name, size_t len) static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) { + bool has_var_cap; int i; if (stsi(info, 1, 1, 1)) return; + has_var_cap = !!info->model_var_cap[0]; EBCASC(info->manufacturer, sizeof(info->manufacturer)); EBCASC(info->type, sizeof(info->type)); EBCASC(info->model, sizeof(info->model)); @@ -89,6 +95,8 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) EBCASC(info->model_capacity, sizeof(info->model_capacity)); EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap)); EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap)); + if (has_var_cap) + EBCASC(info->model_var_cap, sizeof(info->model_var_cap)); seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer); seq_printf(m, "Type: %-4.4s\n", info->type); if (info->lic) @@ -116,12 +124,18 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n", info->model_temp_cap, info->model_temp_cap_rating); + if (has_var_cap && info->model_var_cap_rating) + seq_printf(m, "Model Var. Capacity: %-16.16s %08u\n", + info->model_var_cap, + info->model_var_cap_rating); if (info->ncr) seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr); if (info->npr) seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr); if (info->ntr) seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr); + if (has_var_cap && info->nvr) + seq_printf(m, "Nominal Var. 
Rating: %08u\n", info->nvr); if (info->cai) { seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai); seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr); diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S new file mode 100644 index 000000000000..14c6d25c035f --- /dev/null +++ b/arch/s390/kernel/text_amode31.S @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code that needs to run below 2 GB. + * + * Copyright IBM Corp. 2019 + */ + +#include <linux/linkage.h> +#include <asm/asm-extable.h> +#include <asm/errno.h> +#include <asm/sigp.h> + + .section .amode31.text,"ax" +/* + * Simplified version of expoline thunk. The normal thunks can not be used here, + * because they might be more than 2 GB away, and not reachable by the relative + * branch. No comdat, exrl, etc. optimizations used here, because it only + * affects a few functions that are not performance-relevant. + */ + .macro BR_EX_AMODE31_r14 + larl %r1,0f + ex 0,0(%r1) + j . +0: br %r14 + .endm + +/* + * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode) + */ +SYM_FUNC_START(_diag14_amode31) + lgr %r1,%r2 + lgr %r2,%r3 + lgr %r3,%r4 + lhi %r5,-EIO + sam31 + diag %r1,%r2,0x14 +.Ldiag14_ex: + ipm %r5 + srl %r5,28 +.Ldiag14_fault: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault) +SYM_FUNC_END(_diag14_amode31) + +/* + * int _diag210_amode31(struct diag210 *addr) + */ +SYM_FUNC_START(_diag210_amode31) + lgr %r1,%r2 + lhi %r2,-1 + sam31 + diag %r1,%r0,0x210 +.Ldiag210_ex: + ipm %r2 + srl %r2,28 +.Ldiag210_fault: + sam64 + lgfr %r2,%r2 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault) +SYM_FUNC_END(_diag210_amode31) + +/* + * int diag8c(struct diag8c *addr, struct ccw_dev_id *devno, size_t len) +*/ +SYM_FUNC_START(_diag8c_amode31) + llgf %r3,0(%r3) + sam31 + diag %r2,%r4,0x8c +.Ldiag8c_ex: + sam64 + lgfr %r2,%r3 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex) +SYM_FUNC_END(_diag8c_amode31) +/* + * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode) + */ +SYM_FUNC_START(_diag26c_amode31) + lghi %r5,-EOPNOTSUPP + sam31 + diag %r2,%r4,0x26c +.Ldiag26c_ex: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex) +SYM_FUNC_END(_diag26c_amode31) + +/* + * void _diag0c_amode31(struct hypfs_diag0c_entry *entry) + */ +SYM_FUNC_START(_diag0c_amode31) + sam31 + diag %r2,%r2,0x0c + sam64 + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag0c_amode31) + +/* + * void _diag308_reset_amode31(void) + * + * Calls diag 308 subcode 1 and continues execution + */ +SYM_FUNC_START(_diag308_reset_amode31) + larl %r4,ctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,ctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,fpctl # Floating point control register + stfpc 0(%r4) + larl %r4,prefix # Save prefix register + stpx 0(%r4) + larl %r4,prefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,continue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0 + larl %r3,restart_diag308_psw + og %r4,0(%r3) # Save PSW + lghi %r3,0 + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +.Lrestart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl 
%r4,ctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,fpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,prefix # Restore prefix register + spx 0(%r4) + larl %r4,continue_psw # Restore PSW flags + larl %r2,.Lcontinue + stg %r2,8(%r4) + lpswe 0(%r4) +.Lcontinue: + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag308_reset_amode31) + + .section .amode31.data,"aw",@progbits + .balign 8 +SYM_DATA_LOCAL(restart_diag308_psw, .long 0x00080000,0x80000000) +SYM_DATA_LOCAL(continue_psw, .quad 0,0) +SYM_DATA_LOCAL(ctlreg0, .quad 0) +SYM_DATA_LOCAL(ctlregs, .fill 16,8,0) +SYM_DATA_LOCAL(fpctl, .long 0) +SYM_DATA_LOCAL(prefix, .long 0) +SYM_DATA_LOCAL(prefix_zero, .long 0) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index f9d070d016e3..14abad953c02 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -41,6 +41,9 @@ #include <linux/gfp.h> #include <linux/kprobes.h> #include <linux/uaccess.h> +#include <vdso/vsyscall.h> +#include <vdso/clocksource.h> +#include <vdso/helpers.h> #include <asm/facility.h> #include <asm/delay.h> #include <asm/div64.h> @@ -52,11 +55,7 @@ #include <asm/cio.h> #include "entry.h" -unsigned char tod_clock_base[16] __aligned(8) = { - /* Force to data section. */ - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff -}; +union tod_clock tod_clock_base __section(".data"); EXPORT_SYMBOL_GPL(tod_clock_base); u64 clock_comparator_max = -1ULL; @@ -69,10 +68,10 @@ EXPORT_SYMBOL(s390_epoch_delta_notifier); unsigned char ptff_function_mask[16]; -static unsigned long long lpar_offset; -static unsigned long long initial_leap_seconds; -static unsigned long long tod_steering_end; -static long long tod_steering_delta; +static unsigned long lpar_offset; +static unsigned long initial_leap_seconds; +static unsigned long tod_steering_end; +static long tod_steering_delta; /* * Get time offsets with PTFF @@ -81,10 +80,12 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; + int cs; /* Initialize TOD steering parameters */ - tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; - vdso_data->ts_end = tod_steering_end; + tod_steering_end = tod_clock_base.tod; + for (cs = 0; cs < CS_BASES; cs++) + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -97,10 +98,15 @@ void __init time_early_init(void) /* get initial leap seconds */ if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) - initial_leap_seconds = (unsigned long long) + initial_leap_seconds = (unsigned long) ((long) qui.old_leap * 4096000000L); } +unsigned long long noinstr sched_clock_noinstr(void) +{ + return tod_to_ns(__get_tod_clock_monotonic()); +} + /* * Scheduler clock - returns current time in nanosec units. 
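
The new sched_clock_noinstr() leans on the TOD clock format: bit 51 of a TOD value is one microsecond, so one TOD tick is 1/4096 microsecond and nanoseconds = tod * 1000/4096 = tod * 125/512. A sketch of the conversion, using the same split multiply as the kernel's tod_to_ns() helper so the intermediate product stays within 64 bits:

    #include <stdio.h>

    static unsigned long long tod_to_ns(unsigned long long todval)
    {
            /* ns = tod * 125 / 512, computed as high and low parts */
            return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
    }

    int main(void)
    {
            /* 4096 TOD ticks == 1 microsecond == 1000 ns */
            printf("%llu\n", tod_to_ns(4096));      /* prints 1000 */
            return 0;
    }
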
*/ @@ -110,18 +116,13 @@ unsigned long long notrace sched_clock(void) } NOKPROBE_SYMBOL(sched_clock); -static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt) +static void ext_to_timespec64(union tod_clock *clk, struct timespec64 *xt) { - unsigned long long high, low, rem, sec, nsec; + unsigned long rem, sec, nsec; - /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */ - high = (*(unsigned long long *) clk) >> 4; - low = (*(unsigned long long *)&clk[7]) << 4; - /* Calculate seconds and nano-seconds */ - sec = high; + sec = clk->us; rem = do_div(sec, 1000000); - nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32; - + nsec = ((clk->sus + (rem << 12)) * 125) >> 9; xt->tv_sec = sec; xt->tv_nsec = nsec; } @@ -172,10 +173,10 @@ void init_cpu_timer(void) clockevents_register_device(cd); /* Enable clock comparator timer interrupt. */ - __ctl_set_bit(0,11); + local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SUBMASK_BIT); /* Always allow the timing alert external interrupt. */ - __ctl_set_bit(0, 4); + local_ctl_set_bit(0, CR0_ETR_SUBMASK_BIT); } static void clock_comparator_interrupt(struct ext_code ext_code, @@ -201,30 +202,26 @@ static void stp_reset(void); void read_persistent_clock64(struct timespec64 *ts) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; - __u64 delta; + union tod_clock clk; + u64 delta; delta = initial_leap_seconds + TOD_UNIX_EPOCH; - get_tod_clock_ext(clk); - *(__u64 *) &clk[1] -= delta; - if (*(__u64 *) &clk[1] > delta) - clk[0]--; - ext_to_timespec64(clk, ts); + store_tod_clock_ext(&clk); + clk.eitod -= delta; + ext_to_timespec64(&clk, ts); } void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; struct timespec64 boot_time; - __u64 delta; + union tod_clock clk; + u64 delta; delta = initial_leap_seconds + TOD_UNIX_EPOCH; - memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE); - *(__u64 *)&clk[1] -= delta; - if (*(__u64 *)&clk[1] > delta) - clk[0]--; - ext_to_timespec64(clk, &boot_time); + clk = tod_clock_base; + clk.eitod -= delta; + ext_to_timespec64(&clk, &boot_time); read_persistent_clock64(wall_time); *boot_offset = timespec64_sub(*wall_time, boot_time); @@ -232,12 +229,12 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, static u64 read_tod_clock(struct clocksource *cs) { - unsigned long long now, adj; + unsigned long now, adj; preempt_disable(); /* protect from changes to steering parameters */ now = get_tod_clock(); adj = tod_steering_end - now; - if (unlikely((s64) adj >= 0)) + if (unlikely((s64) adj > 0)) /* * manually steer by 1 cycle every 2^16 cycles. This * corresponds to shifting the tod delta by 15. 1s is @@ -253,10 +250,11 @@ static struct clocksource clocksource_tod = { .name = "tod", .rating = 400, .read = read_tod_clock, - .mask = -1ULL, + .mask = CLOCKSOURCE_MASK(64), .mult = 1000, .shift = 12, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vdso_clock_mode = VDSO_CLOCKMODE_TOD, }; struct clocksource * __init clocksource_default_clock(void) @@ -264,55 +262,6 @@ struct clocksource * __init clocksource_default_clock(void) return &clocksource_tod; } -void update_vsyscall(struct timekeeper *tk) -{ - u64 nsecps; - - if (tk->tkr_mono.clock != &clocksource_tod) - return; - - /* Make userspace gettimeofday spin until we're done. 
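
The rewritten ext_to_timespec64() reads the extended TOD value through union tod_clock: us is whole microseconds since the TOD epoch and sus is the sub-microsecond remainder in 1/4096-microsecond ticks. A user-space re-implementation with simplified field types (the union's real layout lives in asm/timex.h):

    #include <stdint.h>
    #include <stdio.h>

    struct ts { int64_t tv_sec; long tv_nsec; };

    static void ext_to_ts(uint64_t us, uint32_t sus, struct ts *xt)
    {
            uint64_t sec = us / 1000000;
            uint64_t rem = us % 1000000;    /* stands in for do_div() */

            xt->tv_sec = sec;
            /* rem us == rem << 12 TOD ticks; ticks * 125 / 512 == ns */
            xt->tv_nsec = (((uint64_t)sus + (rem << 12)) * 125) >> 9;
    }

    int main(void)
    {
            struct ts t;

            ext_to_ts(1500000, 2048, &t);   /* 1.5 s plus half a microsecond */
            printf("%lld.%09ld\n", (long long)t.tv_sec, t.tv_nsec); /* 1.500000500 */
            return 0;
    }
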
*/ - ++vdso_data->tb_update_count; - smp_wmb(); - vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last; - vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->wtom_clock_sec = - tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec + - + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); - nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift; - while (vdso_data->wtom_clock_nsec >= nsecps) { - vdso_data->wtom_clock_nsec -= nsecps; - vdso_data->wtom_clock_sec++; - } - - vdso_data->xtime_coarse_sec = tk->xtime_sec; - vdso_data->xtime_coarse_nsec = - (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); - vdso_data->wtom_coarse_sec = - vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_coarse_nsec = - vdso_data->xtime_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - while (vdso_data->wtom_coarse_nsec >= NSEC_PER_SEC) { - vdso_data->wtom_coarse_nsec -= NSEC_PER_SEC; - vdso_data->wtom_coarse_sec++; - } - - vdso_data->tk_mult = tk->tkr_mono.mult; - vdso_data->tk_shift = tk->tkr_mono.shift; - smp_wmb(); - ++vdso_data->tb_update_count; -} - -extern struct timezone sys_tz; - -void update_vsyscall_tz(void) -{ - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; -} - /* * Initialize the TOD clock and the CPU timer of * the boot cpu. @@ -341,11 +290,12 @@ void __init time_init(void) } static DEFINE_PER_CPU(atomic_t, clock_sync_word); -static DEFINE_MUTEX(clock_sync_mutex); +static DEFINE_MUTEX(stp_mutex); static unsigned long clock_sync_flags; -#define CLOCK_SYNC_HAS_STP 0 -#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_HAS_STP 0 +#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_STPINFO_VALID 2 /* * The get_clock function for the physical clock. It will get the current @@ -419,18 +369,15 @@ static inline int check_sync_clock(void) * Apply clock delta to the global data structures. * This is called once on the CPU that performed the clock sync. */ -static void clock_sync_global(unsigned long long delta) +static void clock_sync_global(long delta) { unsigned long now, adj; struct ptff_qto qto; + int cs; /* Fixup the monotonic sched clock. */ - *(unsigned long long *) &tod_clock_base[1] += delta; - if (*(unsigned long long *) &tod_clock_base[1] < delta) - /* Epoch overflow */ - tod_clock_base[0]++; + tod_clock_base.eitod += delta; /* Adjust TOD steering parameters. */ - vdso_data->tb_update_count++; now = get_tod_clock(); adj = tod_steering_end - now; if (unlikely((s64) adj >= 0)) @@ -439,12 +386,14 @@ static void clock_sync_global(unsigned long long delta) -(adj >> 15) : (adj >> 15); tod_steering_delta += delta; if ((abs(tod_steering_delta) >> 48) != 0) - panic("TOD clock sync offset %lli is too large to drift\n", + panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1; - vdso_data->ts_end = tod_steering_end; - vdso_data->tb_update_count++; + for (cs = 0; cs < CS_BASES; cs++) { + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; + } + /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) lpar_offset = qto.tod_epoch_difference; @@ -456,7 +405,7 @@ static void clock_sync_global(unsigned long long delta) * Apply clock delta to the per-CPU data structures of this CPU. 
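
The steering set up in clock_sync_global() and applied in read_tod_clock() amortizes a sync offset instead of stepping the clock: while the current TOD value lies inside the steering window, the remaining window shifted right by 15 is added or subtracted, i.e. one cycle per 2^16 cycles. A standalone sketch with hypothetical numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* Same arithmetic as read_tod_clock(): the correction shrinks from
     * |steering_delta| at the start of the window down to zero. */
    static uint64_t read_steered(uint64_t now, uint64_t steering_end,
                                 int64_t steering_delta)
    {
            int64_t adj = steering_end - now;

            if (adj > 0)
                    now += (steering_delta < 0) ? (adj >> 15) : -(adj >> 15);
            return now;
    }

    int main(void)
    {
            int64_t delta = -32;                    /* clock was 32 cycles behind */
            uint64_t end = (uint64_t)32 << 15;      /* window = |delta| << 15 */

            for (uint64_t now = 0; now <= end; now += end / 4)
                    printf("now=%8llu steered=%8llu\n",
                           (unsigned long long)now,
                           (unsigned long long)read_steered(now, end, delta));
            return 0;
    }
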
* This is called for each online CPU after the call to clock_sync_global. */ -static void clock_sync_local(unsigned long long delta) +static void clock_sync_local(long delta) { /* Add the delta to the clock comparator. */ if (S390_lowcore.clock_comparator != clock_comparator_max) { @@ -480,7 +429,7 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long long clock_delta; + long clock_delta; }; /* @@ -491,7 +440,6 @@ static struct stp_sstpi stp_info; static void *stp_page; static void stp_work_fn(struct work_struct *work); -static DEFINE_MUTEX(stp_work_mutex); static DECLARE_WORK(stp_work, stp_work_fn); static struct timer_list stp_timer; @@ -582,10 +530,26 @@ void stp_queue_work(void) queue_work(time_sync_wq, &stp_work); } +static int __store_stpinfo(void) +{ + int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + + if (rc) + clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + else + set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + return rc; +} + +static int stpinfo_valid(void) +{ + return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); +} + static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - unsigned long long clock_delta; + long clock_delta, flags; static int first; int rc; @@ -595,19 +559,18 @@ static int stp_sync_clock(void *data) while (atomic_read(&sync->cpus) != 0) cpu_relax(); rc = 0; - if (stp_info.todoff[0] || stp_info.todoff[1] || - stp_info.todoff[2] || stp_info.todoff[3] || - stp_info.tmd != 2) { + if (stp_info.todoff || stp_info.tmd != 2) { + flags = vdso_update_begin(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); if (rc == 0) { sync->clock_delta = clock_delta; clock_sync_global(clock_delta); - rc = chsc_sstpi(stp_page, &stp_info, - sizeof(struct stp_sstpi)); + rc = __store_stpinfo(); if (rc == 0 && stp_info.tmd != 2) rc = -EAGAIN; } + vdso_update_end(flags); } sync->in_sync = rc ? -EAGAIN : 1; xchg(&first, 0); @@ -627,6 +590,81 @@ static int stp_sync_clock(void *data) return 0; } +static int stp_clear_leap(void) +{ + struct __kernel_timex txc; + int ret; + + memset(&txc, 0, sizeof(txc)); + + ret = do_adjtimex(&txc); + if (ret < 0) + return ret; + + txc.modes = ADJ_STATUS; + txc.status &= ~(STA_INS|STA_DEL); + return do_adjtimex(&txc); +} + +static void stp_check_leap(void) +{ + struct stp_stzi stzi; + struct stp_lsoib *lsoib = &stzi.lsoib; + struct __kernel_timex txc; + int64_t timediff; + int leapdiff, ret; + + if (!stp_info.lu || !check_sync_clock()) { + /* + * Either a scheduled leap second was removed by the operator, + * or STP is out of sync. In both cases, clear the leap second + * kernel flags. 
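
stp_clear_leap() and stp_check_leap() drive the kernel's leap-second machinery through do_adjtimex() with ADJ_STATUS and the STA_INS/STA_DEL bits. The same flags are visible from user space via adjtimex(2); a read-only query:

    #include <stdio.h>
    #include <string.h>
    #include <sys/timex.h>

    int main(void)
    {
            struct timex txc;

            memset(&txc, 0, sizeof(txc));   /* modes == 0: query only */
            if (adjtimex(&txc) < 0) {
                    perror("adjtimex");
                    return 1;
            }
            if (txc.status & STA_INS)
                    puts("leap second insertion armed");
            else if (txc.status & STA_DEL)
                    puts("leap second deletion armed");
            else
                    puts("no leap second scheduled");
            return 0;
    }
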
+ */ + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + return; + } + + if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) { + pr_err("stzi failed\n"); + return; + } + + timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC; + leapdiff = lsoib->nlso - lsoib->also; + + if (leapdiff != 1 && leapdiff != -1) { + pr_err("Cannot schedule %d leap seconds\n", leapdiff); + return; + } + + if (timediff < 0) { + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + } else if (timediff < 7200) { + memset(&txc, 0, sizeof(txc)); + ret = do_adjtimex(&txc); + if (ret < 0) + return; + + txc.modes = ADJ_STATUS; + if (leapdiff > 0) + txc.status |= STA_INS; + else + txc.status |= STA_DEL; + ret = do_adjtimex(&txc); + if (ret < 0) + pr_err("failed to set leap second flags\n"); + /* arm Timer to clear leap second flags */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); + } else { + /* The day the leap second is scheduled for hasn't been reached. Retry + * in one hour. + */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); + } +} + /* * STP work. Check for the STP state and take over the clock * synchronization if the STP clock source is usable. @@ -637,7 +675,7 @@ static void stp_work_fn(struct work_struct *work) int rc; /* prevent multiple execution. */ - mutex_lock(&stp_work_mutex); + mutex_lock(&stp_mutex); if (!stp_online) { chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); @@ -645,33 +683,34 @@ static void stp_work_fn(struct work_struct *work) goto out_unlock; } - rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL); + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xf0e0, NULL); if (rc) goto out_unlock; - rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + rc = __store_stpinfo(); if (rc || stp_info.c == 0) goto out_unlock; /* Skip synchronization if the clock is already in sync. */ - if (check_sync_clock()) - goto out_unlock; - - memset(&stp_sync, 0, sizeof(stp_sync)); - cpus_read_lock(); - atomic_set(&stp_sync.cpus, num_online_cpus() - 1); - stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); - cpus_read_unlock(); + if (!check_sync_clock()) { + memset(&stp_sync, 0, sizeof(stp_sync)); + cpus_read_lock(); + atomic_set(&stp_sync.cpus, num_online_cpus() - 1); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); + } if (!check_sync_clock()) /* - * There is a usable clock but the synchonization failed. + * There is a usable clock but the synchronization failed. * Retry after a second. 
*/ - mod_timer(&stp_timer, jiffies + HZ); + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); + else if (stp_info.lu) + stp_check_leap(); out_unlock: - mutex_unlock(&stp_work_mutex); + mutex_unlock(&stp_mutex); } /* @@ -682,115 +721,178 @@ static struct bus_type stp_subsys = { .dev_name = "stp", }; -static ssize_t stp_ctn_id_show(struct device *dev, +static ssize_t ctn_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%016llx\n", - *(unsigned long long *) stp_info.ctnid); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%016lx\n", + *(unsigned long *) stp_info.ctnid); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL); +static DEVICE_ATTR_RO(ctn_id); -static ssize_t stp_ctn_type_show(struct device *dev, +static ssize_t ctn_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.ctn); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.ctn); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL); +static DEVICE_ATTR_RO(ctn_type); -static ssize_t stp_dst_offset_show(struct device *dev, +static ssize_t dst_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x2000)) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x2000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL); +static DEVICE_ATTR_RO(dst_offset); -static ssize_t stp_leap_seconds_show(struct device *dev, +static ssize_t leap_seconds_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x8000)) + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x8000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(leap_seconds); + +static ssize_t leap_seconds_scheduled_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct stp_stzi stzi; + ssize_t ret; + + mutex_lock(&stp_mutex); + if (!stpinfo_valid() || !(stp_info.vbits & 0x8000) || !stp_info.lu) { + mutex_unlock(&stp_mutex); return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + } + + ret = chsc_stzi(stp_page, &stzi, sizeof(stzi)); + mutex_unlock(&stp_mutex); + if (ret < 0) + return ret; + + if (!stzi.lsoib.p) + return sprintf(buf, "0,0\n"); + + return sprintf(buf, "%lu,%d\n", + tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, + stzi.lsoib.nlso - stzi.lsoib.also); } -static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL); +static DEVICE_ATTR_RO(leap_seconds_scheduled); -static ssize_t stp_stratum_show(struct device *dev, +static ssize_t stratum_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + 
mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL); +static DEVICE_ATTR_RO(stratum); -static ssize_t stp_time_offset_show(struct device *dev, +static ssize_t time_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x0800)) - return -ENODATA; - return sprintf(buf, "%i\n", (int) stp_info.tto); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x0800)) + ret = sprintf(buf, "%i\n", (int) stp_info.tto); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL); +static DEVICE_ATTR_RO(time_offset); -static ssize_t stp_time_zone_offset_show(struct device *dev, +static ssize_t time_zone_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online || !(stp_info.vbits & 0x4000)) - return -ENODATA; - return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x4000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(time_zone_offset, 0400, - stp_time_zone_offset_show, NULL); +static DEVICE_ATTR_RO(time_zone_offset); -static ssize_t stp_timing_mode_show(struct device *dev, +static ssize_t timing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.tmd); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tmd); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL); +static DEVICE_ATTR_RO(timing_mode); -static ssize_t stp_timing_state_show(struct device *dev, +static ssize_t timing_state_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (!stp_online) - return -ENODATA; - return sprintf(buf, "%i\n", stp_info.tst); + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tst); + mutex_unlock(&stp_mutex); + return ret; } -static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL); +static DEVICE_ATTR_RO(timing_state); -static ssize_t stp_online_show(struct device *dev, +static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%i\n", stp_online); } -static ssize_t stp_online_store(struct device *dev, +static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -801,14 +903,14 @@ static ssize_t stp_online_store(struct device *dev, return -EINVAL; if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) return -EOPNOTSUPP; - mutex_lock(&clock_sync_mutex); + mutex_lock(&stp_mutex); stp_online = value; if (stp_online) set_bit(CLOCK_SYNC_STP, &clock_sync_flags); else clear_bit(CLOCK_SYNC_STP, &clock_sync_flags); queue_work(time_sync_wq, &stp_work); - mutex_unlock(&clock_sync_mutex); + mutex_unlock(&stp_mutex); return count; } @@ -816,46 +918,27 @@ static ssize_t stp_online_store(struct device *dev, * Can't use DEVICE_ATTR because the attribute should be named * stp/online but dev_attr_online already exists in this file .. 
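
Once stp_init_sysfs() below registers the attribute group, the files appear in sysfs. A small reader, assuming the conventional /sys/devices/system/stp location that subsys_system_register() produces for a subsystem named "stp":

    #include <stdio.h>

    static void show_stp_attr(const char *name)
    {
            char path[128], buf[64];
            FILE *f;

            snprintf(path, sizeof(path), "/sys/devices/system/stp/%s", name);
            f = fopen(path, "r");
            if (!f) {
                    perror(path);
                    return;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("%s: %s", name, buf);    /* buf keeps its newline */
            fclose(f);
    }

    int main(void)
    {
            show_stp_attr("online");
            show_stp_attr("ctn_id");
            show_stp_attr("timing_state");
            return 0;
    }
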
*/ -static struct device_attribute dev_attr_stp_online = { - .attr = { .name = "online", .mode = 0600 }, - .show = stp_online_show, - .store = stp_online_store, -}; - -static struct device_attribute *stp_attributes[] = { - &dev_attr_ctn_id, - &dev_attr_ctn_type, - &dev_attr_dst_offset, - &dev_attr_leap_seconds, - &dev_attr_stp_online, - &dev_attr_stratum, - &dev_attr_time_offset, - &dev_attr_time_zone_offset, - &dev_attr_timing_mode, - &dev_attr_timing_state, +static DEVICE_ATTR_RW(online); + +static struct attribute *stp_dev_attrs[] = { + &dev_attr_ctn_id.attr, + &dev_attr_ctn_type.attr, + &dev_attr_dst_offset.attr, + &dev_attr_leap_seconds.attr, + &dev_attr_online.attr, + &dev_attr_leap_seconds_scheduled.attr, + &dev_attr_stratum.attr, + &dev_attr_time_offset.attr, + &dev_attr_time_zone_offset.attr, + &dev_attr_timing_mode.attr, + &dev_attr_timing_state.attr, NULL }; +ATTRIBUTE_GROUPS(stp_dev); static int __init stp_init_sysfs(void) { - struct device_attribute **attr; - int rc; - - rc = subsys_system_register(&stp_subsys, NULL); - if (rc) - goto out; - for (attr = stp_attributes; *attr; attr++) { - rc = device_create_file(stp_subsys.dev_root, *attr); - if (rc) - goto out_unreg; - } - return 0; -out_unreg: - for (; attr >= stp_attributes; attr--) - device_remove_file(stp_subsys.dev_root, *attr); - bus_unregister(&stp_subsys); -out: - return rc; + return subsys_system_register(&stp_subsys, stp_dev_groups); } device_initcall(stp_init_sysfs); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3627953007ed..89e91b8ce842 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2007, 2011 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #define KMSG_COMPONENT "cpu" @@ -26,7 +25,6 @@ #include <linux/nodemask.h> #include <linux/node.h> #include <asm/sysinfo.h> -#include <asm/numa.h> #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -63,50 +61,56 @@ static struct mask_info drawer_info; struct cpu_topology_s390 cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); -cpumask_t cpus_with_topology; - -static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) +static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int cpu) { - cpumask_t mask; + static cpumask_t mask; - cpumask_copy(&mask, cpumask_of(cpu)); + cpumask_clear(&mask); + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) + goto out; + cpumask_set_cpu(cpu, &mask); switch (topology_mode) { case TOPOLOGY_MODE_HW: while (info) { if (cpumask_test_cpu(cpu, &info->mask)) { - mask = info->mask; + cpumask_copy(&mask, &info->mask); break; } info = info->next; } - if (cpumask_empty(&mask)) - cpumask_copy(&mask, cpumask_of(cpu)); break; case TOPOLOGY_MODE_PACKAGE: cpumask_copy(&mask, cpu_present_mask); break; default: - /* fallthrough */ + fallthrough; case TOPOLOGY_MODE_SINGLE: - cpumask_copy(&mask, cpumask_of(cpu)); break; } - return mask; + cpumask_and(&mask, &mask, &cpu_setup_mask); +out: + cpumask_copy(dst, &mask); } -static cpumask_t cpu_thread_map(unsigned int cpu) +static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) { - cpumask_t mask; - int i; + static cpumask_t mask; + unsigned int max_cpu; - cpumask_copy(&mask, cpumask_of(cpu)); + cpumask_clear(&mask); + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) + goto out; + cpumask_set_cpu(cpu, &mask); if (topology_mode != TOPOLOGY_MODE_HW) - return mask; + goto out; cpu -= cpu % (smp_cpu_mtid + 1); - for (i = 0; i <= 
smp_cpu_mtid; i++) - if (cpu_present(cpu + i)) - cpumask_set_cpu(cpu + i, &mask); - return mask; + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + if (cpumask_test_cpu(cpu, &cpu_setup_mask)) + cpumask_set_cpu(cpu, &mask); + } +out: + cpumask_copy(dst, &mask); } #define TOPOLOGY_CORE_BITS 64 @@ -120,26 +124,26 @@ static void add_cpus_to_mask(struct topology_core *tl_core, unsigned int core; for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) { - unsigned int rcore; - int lcpu, i; + unsigned int max_cpu, rcore; + int cpu; rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin; - lcpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); - if (lcpu < 0) + cpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); + if (cpu < 0) continue; - for (i = 0; i <= smp_cpu_mtid; i++) { - topo = &cpu_topology[lcpu + i]; + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + topo = &cpu_topology[cpu]; topo->drawer_id = drawer->id; topo->book_id = book->id; topo->socket_id = socket->id; topo->core_id = rcore; - topo->thread_id = lcpu + i; + topo->thread_id = cpu; topo->dedicated = tl_core->d; - cpumask_set_cpu(lcpu + i, &drawer->mask); - cpumask_set_cpu(lcpu + i, &book->mask); - cpumask_set_cpu(lcpu + i, &socket->mask); - cpumask_set_cpu(lcpu + i, &cpus_with_topology); - smp_cpu_set_polarization(lcpu + i, tl_core->pp); + cpumask_set_cpu(cpu, &drawer->mask); + cpumask_set_cpu(cpu, &book->mask); + cpumask_set_cpu(cpu, &socket->mask); + smp_cpu_set_polarization(cpu, tl_core->pp); } } } @@ -245,17 +249,18 @@ int topology_set_cpu_management(int fc) return rc; } -static void update_cpu_masks(void) +void update_cpu_masks(void) { - struct cpu_topology_s390 *topo; - int cpu, id; + struct cpu_topology_s390 *topo, *topo_package, *topo_sibling; + int cpu, sibling, pkg_first, smt_first, id; for_each_possible_cpu(cpu) { topo = &cpu_topology[cpu]; - topo->thread_mask = cpu_thread_map(cpu); - topo->core_mask = cpu_group_map(&socket_info, cpu); - topo->book_mask = cpu_group_map(&book_info, cpu); - topo->drawer_mask = cpu_group_map(&drawer_info, cpu); + cpu_thread_map(&topo->thread_mask, cpu); + cpu_group_map(&topo->core_mask, &socket_info, cpu); + cpu_group_map(&topo->book_mask, &book_info, cpu); + cpu_group_map(&topo->drawer_mask, &drawer_info, cpu); + topo->booted_cores = 0; if (topology_mode != TOPOLOGY_MODE_HW) { id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 
0 : cpu; topo->thread_id = cpu; @@ -263,11 +268,23 @@ static void update_cpu_masks(void) topo->socket_id = id; topo->book_id = id; topo->drawer_id = id; - if (cpu_present(cpu)) - cpumask_set_cpu(cpu, &cpus_with_topology); } } - numa_update_cpu_topology(); + for_each_online_cpu(cpu) { + topo = &cpu_topology[cpu]; + pkg_first = cpumask_first(&topo->core_mask); + topo_package = &cpu_topology[pkg_first]; + if (cpu == pkg_first) { + for_each_cpu(sibling, &topo->core_mask) { + topo_sibling = &cpu_topology[sibling]; + smt_first = cpumask_first(&topo_sibling->thread_mask); + if (sibling == smt_first) + topo_package->booted_cores++; + } + } else { + topo->booted_cores = topo_package->booted_cores; + } + } } void store_topology(struct sysinfo_15_1_x *info) @@ -289,7 +306,6 @@ static int __arch_update_cpu_topology(void) int rc = 0; mutex_lock(&smp_cpu_state_mutex); - cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; store_topology(info); @@ -346,9 +362,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0); static void set_topology_timer(void) { if (atomic_add_unless(&topology_poll, -1, 0)) - mod_timer(&topology_timer, jiffies + HZ / 10); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + HZ * 60); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); } void topology_expect_change(void) @@ -391,7 +407,7 @@ static ssize_t dispatching_store(struct device *dev, if (val != 0 && val != 1) return -EINVAL; rc = 0; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smp_cpu_state_mutex); if (cpu_management == val) goto out; @@ -402,7 +418,7 @@ static ssize_t dispatching_store(struct device *dev, topology_expect_change(); out: mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return rc ? 
rc : count; } static DEVICE_ATTR_RW(dispatching); @@ -506,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = { { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, { cpu_book_mask, SD_INIT_NAME(BOOK) }, { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -554,6 +570,7 @@ void __init topology_init_early(void) alloc_masks(info, &book_info, 2); alloc_masks(info, &drawer_info, 3); out: + cpumask_set_cpu(0, &cpu_setup_mask); __arch_update_cpu_topology(); __arch_update_dedicated_flag(NULL); } @@ -584,7 +601,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; @@ -619,27 +636,25 @@ static struct ctl_table topology_ctl_table[] = { .mode = 0644, .proc_handler = topology_ctl_handler, }, - { }, -}; - -static struct ctl_table topology_dir_table[] = { - { - .procname = "s390", - .maxlen = 0, - .mode = 0555, - .child = topology_ctl_table, - }, - { }, }; static int __init topology_init(void) { + struct device *dev_root; + int rc = 0; + timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); if (MACHINE_HAS_TOPOLOGY) set_topology_timer(); else topology_update_polarization_simple(); - register_sysctl_table(topology_dir_table); - return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); + register_sysctl("s390", topology_ctl_table); + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_dispatching); + put_device(dev_root); + } + return rc; } device_initcall(topology_init); diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c index 490b52e85014..11a669f3cc93 100644 --- a/arch/s390/kernel/trace.c +++ b/arch/s390/kernel/trace.c @@ -14,7 +14,7 @@ EXPORT_TRACEPOINT_SYMBOL(s390_diagnose); static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth); -void trace_s390_diagnose_norecursion(int diag_nr) +void notrace trace_s390_diagnose_norecursion(int diag_nr) { unsigned long flags; unsigned int *depth; diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 164c0282b41a..46dac4540ca8 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -13,8 +13,11 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'asm.s'. 
*/ +#include "asm/irqflags.h" +#include "asm/ptrace.h" #include <linux/kprobes.h> #include <linux/kdebug.h> +#include <linux/randomize_kstack.h> #include <linux/extable.h> #include <linux/ptrace.h> #include <linux/sched.h> @@ -23,7 +26,10 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/cpu.h> +#include <linux/entry-common.h> +#include <asm/asm-extable.h> #include <asm/fpu/api.h> +#include <asm/vtime.h> #include "entry.h" static inline void __user *get_trap_ip(struct pt_regs *regs) @@ -31,16 +37,18 @@ static inline void __user *get_trap_ip(struct pt_regs *regs) unsigned long address; if (regs->int_code & 0x200) - address = *(unsigned long *)(current->thread.trap_tdb + 24); + address = current->thread.trap_tdb.data[3]; else address = regs->psw.addr; return (void __user *) (address - (regs->int_code >> 16)); } +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { return 1; } +#endif void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { @@ -48,18 +56,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) force_sig_fault(si_signo, si_code, get_trap_ip(regs)); report_user_fault(regs, si_signo, 0); } else { - const struct exception_table_entry *fixup; - fixup = s390_search_extables(regs->psw.addr); - if (fixup) - regs->psw.addr = extable_fixup(fixup); - else { - enum bug_trap_type btt; - - btt = report_bug(regs->psw.addr, regs); - if (btt == BUG_TRAP_TYPE_WARN) - return; + if (!fixup_exception(regs)) die(regs, str); - } } } @@ -83,17 +81,17 @@ void do_per_trap(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_per_trap); -void default_trap_handler(struct pt_regs *regs) +static void default_trap_handler(struct pt_regs *regs) { if (user_mode(regs)) { report_user_fault(regs, SIGSEGV, 0); - do_exit(SIGSEGV); + force_exit_sig(SIGSEGV); } else die(regs, "Unknown program exception"); } #define DO_ERROR_INFO(name, signr, sicode, str) \ -void name(struct pt_regs *regs) \ +static void name(struct pt_regs *regs) \ { \ do_trap(regs, signr, sicode, str); \ } @@ -145,13 +143,13 @@ static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) do_trap(regs, SIGFPE, si_code, "floating point exception"); } -void translation_exception(struct pt_regs *regs) +static void translation_specification_exception(struct pt_regs *regs) { /* May never happen. */ - panic("Translation exception"); + panic("Translation-Specification Exception"); } -void illegal_op(struct pt_regs *regs) +static void illegal_op(struct pt_regs *regs) { __u8 opcode[6]; __u16 __user *location; @@ -193,11 +191,11 @@ NOKPROBE_SYMBOL(illegal_op); DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, "specification exception"); -void vector_exception(struct pt_regs *regs) +static void vector_exception(struct pt_regs *regs) { int si_code, vic; - if (!MACHINE_HAS_VX) { + if (!cpu_has_vx()) { do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); return; } @@ -227,7 +225,7 @@ void vector_exception(struct pt_regs *regs) do_trap(regs, SIGFPE, si_code, "vector exception"); } -void data_exception(struct pt_regs *regs) +static void data_exception(struct pt_regs *regs) { save_fpu_regs(); if (current->thread.fpu.fpc & FPC_DXC_MASK) @@ -236,7 +234,7 @@ void data_exception(struct pt_regs *regs) do_trap(regs, SIGILL, ILL_ILLOPN, "data exception"); } -void space_switch_exception(struct pt_regs *regs) +static void space_switch_exception(struct pt_regs *regs) { /* Set user psw back to home space mode. 
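
DO_ERROR_INFO above stamps out one trivial, now-static handler per program check. The same token-pasting pattern reduced to a standalone program (the si_code values here are illustrative stand-ins, not the real ILL_* constants):

    #include <signal.h>
    #include <stdio.h>

    struct pt_regs { unsigned long psw_addr; };     /* simplified stand-in */

    static void do_trap(struct pt_regs *regs, int signr, int sicode,
                        const char *str)
    {
            printf("trap at %#lx: sig=%d code=%d (%s)\n",
                   regs->psw_addr, signr, sicode, str);
    }

    /* Each use defines a complete static handler forwarding to do_trap(). */
    #define DO_ERROR_INFO(name, signr, sicode, str)         \
    static void name(struct pt_regs *regs)                  \
    {                                                       \
            do_trap(regs, signr, sicode, str);              \
    }

    DO_ERROR_INFO(specification_exception, SIGILL, 2, "specification exception")
    DO_ERROR_INFO(privileged_op, SIGILL, 1, "privileged operation")

    int main(void)
    {
            struct pt_regs regs = { .psw_addr = 0x1000 };

            specification_exception(&regs);
            privileged_op(&regs);
            return 0;
    }
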
*/ if (user_mode(regs)) @@ -245,6 +243,23 @@ void space_switch_exception(struct pt_regs *regs) do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event"); } +static void monitor_event_exception(struct pt_regs *regs) +{ + if (user_mode(regs)) + return; + + switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) { + case BUG_TRAP_TYPE_NONE: + fixup_exception(regs); + break; + case BUG_TRAP_TYPE_WARN: + break; + case BUG_TRAP_TYPE_BUG: + die(regs, "monitor event"); + break; + } +} + void kernel_stack_overflow(struct pt_regs *regs) { bust_spinlocks(1); @@ -255,8 +270,147 @@ void kernel_stack_overflow(struct pt_regs *regs) } NOKPROBE_SYMBOL(kernel_stack_overflow); +static void __init test_monitor_call(void) +{ + int val = 1; + + if (!IS_ENABLED(CONFIG_BUG)) + return; + asm volatile( + " mc 0,0\n" + "0: xgr %0,%0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (val)); + if (!val) + panic("Monitor call doesn't work!\n"); +} + void __init trap_init(void) { - sort_extable(__start_dma_ex_table, __stop_dma_ex_table); + unsigned long flags; + struct ctlreg cr0; + + local_irq_save(flags); + cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); + psw_bits(S390_lowcore.external_new_psw).mcheck = 1; + psw_bits(S390_lowcore.program_new_psw).mcheck = 1; + psw_bits(S390_lowcore.svc_new_psw).mcheck = 1; + psw_bits(S390_lowcore.io_new_psw).mcheck = 1; + local_ctl_load(0, &cr0); + local_irq_restore(flags); local_mcck_enable(); + test_monitor_call(); } + +static void (*pgm_check_table[128])(struct pt_regs *regs); + +void noinstr __do_pgm_check(struct pt_regs *regs) +{ + unsigned int trapnr; + irqentry_state_t state; + + regs->int_code = S390_lowcore.pgm_int_code; + regs->int_parm_long = S390_lowcore.trans_exc_code; + + state = irqentry_enter(regs); + + if (user_mode(regs)) { + update_timer_sys(); + if (!static_branch_likely(&cpu_has_bear)) { + if (regs->last_break < 4096) + regs->last_break = 1; + } + current->thread.last_break = regs->last_break; + } + + if (S390_lowcore.pgm_code & 0x0200) { + /* transaction abort */ + current->thread.trap_tdb = S390_lowcore.pgm_tdb; + } + + if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) { + if (user_mode(regs)) { + struct per_event *ev = ¤t->thread.per_event; + + set_thread_flag(TIF_PER_TRAP); + ev->address = S390_lowcore.per_address; + ev->cause = S390_lowcore.per_code_combined; + ev->paid = S390_lowcore.per_access_id; + } else { + /* PER event in kernel is kprobes */ + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + do_per_trap(regs); + goto out; + } + } + + if (!irqs_disabled_flags(regs->psw.mask)) + trace_hardirqs_on(); + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + + trapnr = regs->int_code & PGM_INT_CODE_MASK; + if (trapnr) + pgm_check_table[trapnr](regs); +out: + local_irq_disable(); + irqentry_exit(regs, state); +} + +/* + * The program check table contains exactly 128 (0x00-0x7f) entries. Each + * line defines the function to be called corresponding to the program check + * interruption code. 
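
test_monitor_call() above executes "mc 0,0" and survives a trapping instruction through an exception-table fixup. User space can probe an instruction in a similar spirit with a SIGILL handler and sigsetjmp(), as an analogy to the fixup rather than the kernel mechanism:

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>

    static sigjmp_buf probe_env;

    static void ill_handler(int sig)
    {
            (void)sig;
            siglongjmp(probe_env, 1);       /* plays the role of the fixup */
    }

    /* Returns 1 if the probed code executed, 0 if it raised SIGILL. */
    static int probe_insn(void (*try_fn)(void))
    {
            struct sigaction sa = { .sa_handler = ill_handler };
            int ok = 0;

            sigaction(SIGILL, &sa, NULL);
            if (sigsetjmp(probe_env, 1) == 0) {
                    try_fn();
                    ok = 1;
            }
            signal(SIGILL, SIG_DFL);
            return ok;
    }

    static void try_nop(void) { __asm__ volatile("nop"); }

    int main(void)
    {
            printf("nop executed: %d\n", probe_insn(try_nop));
            return 0;
    }
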
+ */ +static void (*pgm_check_table[128])(struct pt_regs *regs) = { + [0x00] = default_trap_handler, + [0x01] = illegal_op, + [0x02] = privileged_op, + [0x03] = execute_exception, + [0x04] = do_protection_exception, + [0x05] = addressing_exception, + [0x06] = specification_exception, + [0x07] = data_exception, + [0x08] = overflow_exception, + [0x09] = divide_exception, + [0x0a] = overflow_exception, + [0x0b] = divide_exception, + [0x0c] = hfp_overflow_exception, + [0x0d] = hfp_underflow_exception, + [0x0e] = hfp_significance_exception, + [0x0f] = hfp_divide_exception, + [0x10] = do_dat_exception, + [0x11] = do_dat_exception, + [0x12] = translation_specification_exception, + [0x13] = special_op_exception, + [0x14] = default_trap_handler, + [0x15] = operand_exception, + [0x16] = default_trap_handler, + [0x17] = default_trap_handler, + [0x18] = transaction_exception, + [0x19] = default_trap_handler, + [0x1a] = default_trap_handler, + [0x1b] = vector_exception, + [0x1c] = space_switch_exception, + [0x1d] = hfp_sqrt_exception, + [0x1e ... 0x37] = default_trap_handler, + [0x38] = do_dat_exception, + [0x39] = do_dat_exception, + [0x3a] = do_dat_exception, + [0x3b] = do_dat_exception, + [0x3c] = default_trap_handler, + [0x3d] = do_secure_storage_access, + [0x3e] = do_non_secure_storage_access, + [0x3f] = do_secure_storage_violation, + [0x40] = monitor_event_exception, + [0x41 ... 0x7f] = default_trap_handler, +}; + +#define COND_TRAP(x) asm( \ + ".weak " __stringify(x) "\n\t" \ + ".set " __stringify(x) "," \ + __stringify(default_trap_handler)) + +COND_TRAP(do_secure_storage_access); +COND_TRAP(do_non_secure_storage_access); +COND_TRAP(do_secure_storage_violation); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index da2d4d4c5b0e..0ece156fdd7c 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -36,10 +36,17 @@ static bool update_stack_info(struct unwind_state *state, unsigned long sp) return true; } -static inline bool is_task_pt_regs(struct unwind_state *state, - struct pt_regs *regs) +static inline bool is_final_pt_regs(struct unwind_state *state, + struct pt_regs *regs) { - return task_pt_regs(state->task) == regs; + /* user mode or kernel thread pt_regs at the bottom of task stack */ + if (task_pt_regs(state->task) == regs) + return true; + + /* user mode pt_regs at the bottom of irq stack */ + return state->stack_info.type == STACK_TYPE_IRQ && + state->stack_info.end - sizeof(struct pt_regs) == (unsigned long)regs && + READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE; } bool unwind_next_frame(struct unwind_state *state) @@ -57,8 +64,8 @@ bool unwind_next_frame(struct unwind_state *state) ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; regs = NULL; - if (!__kernel_text_address(ip)) { - /* skip bogus %r14 */ + /* skip bogus %r14 or if is the same as regs->psw.addr */ + if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) { state->regs = NULL; return unwind_next_frame(state); } @@ -80,7 +87,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!on_stack(info, sp, sizeof(struct pt_regs))) goto out_err; regs = (struct pt_regs *) sp; - if (is_task_pt_regs(state, regs)) + if (is_final_pt_regs(state, regs)) goto out_stop; ip = READ_ONCE_NOCHECK(regs->psw.addr); sp = READ_ONCE_NOCHECK(regs->gprs[15]); @@ -96,13 +103,11 @@ bool unwind_next_frame(struct unwind_state *state) if (sp & 0x7) goto out_err; - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *) sp); - /* Update unwind state 
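
The COND_TRAP() asm at the end of traps.c above makes each optional handler a weak symbol aliased to default_trap_handler, so configurations that never build the strong definition still link. GCC's attribute syntax expresses the same trick directly in C (GNU toolchain and ELF assumed):

    #include <stdio.h>

    void fallback(void)
    {
            puts("fallback handler");
    }

    /* Weak alias: if nothing provides a strong optional_handler at link
     * time, calls resolve to fallback() -- the ".weak"/".set" trick of
     * COND_TRAP(), written with attributes instead of inline asm. */
    void optional_handler(void) __attribute__((weak, alias("fallback")));

    int main(void)
    {
            optional_handler();     /* prints "fallback handler" here */
            return 0;
    }
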
*/ state->sp = sp; - state->ip = ip; state->regs = regs; state->reliable = reliable; + state->ip = unwind_recover_ret_addr(state, ip); return true; out_err: @@ -154,12 +159,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, ip = READ_ONCE_NOCHECK(sf->gprs[8]); } - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL); - /* Update unwind state */ state->sp = sp; - state->ip = ip; state->reliable = true; + state->ip = unwind_recover_ret_addr(state, ip); if (!first_frame) return; diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 5007fac01bb5..b88345ef8bd9 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -32,7 +32,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) return -EINVAL; if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) return -EINVAL; - clear_pt_regs_flag(regs, PIF_PER_TRAP); + clear_thread_flag(TIF_PER_TRAP); auprobe->saved_per = psw_bits(regs->psw).per; auprobe->saved_int_code = regs->int_code; regs->int_code = UPROBE_TRAP_NR; @@ -103,7 +103,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) /* fix per address */ current->thread.per_event.address = utask->vaddr; /* trigger per event */ - set_pt_regs_flag(regs, PIF_PER_TRAP); + set_thread_flag(TIF_PER_TRAP); } return 0; } @@ -126,6 +126,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, case DIE_SSTEP: if (uprobe_post_sstep_notifier(regs)) return NOTIFY_STOP; + break; default: break; } @@ -176,9 +177,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -193,9 +192,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(ptr) __ptr = (ptr); \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)__ptr & mask) \ + if ((u64 __force)__ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (put_user(*(input), __ptr)) \ __rc = EMU_ADDRESSING; \ @@ -212,9 +209,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -259,7 +254,7 @@ static void sim_stor_event(struct pt_regs *regs, void *addr, int len) return; current->thread.per_event.address = regs->psw.addr; current->thread.per_event.cause = PER_EVENT_STORE >> 16; - set_pt_regs_flag(regs, PIF_PER_TRAP); + set_thread_flag(TIF_PER_TRAP); } /* @@ -326,10 +321,6 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) break; case 0xc6: switch (insn->opc1) { - case 0x02: /* pfdrl */ - if (!test_facility(34)) - rc = EMU_ILLEGAL_OP; - break; case 0x04: /* cghrl */ rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64); break; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c new file mode 100644 index 000000000000..fc07bc39e698 --- /dev/null +++ b/arch/s390/kernel/uv.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Ultravisor functions and initialization + * + * Copyright IBM Corp. 
2019, 2020
+ */
+#define KMSG_COMPONENT "prot_virt"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/bitmap.h>
+#include <linux/memblock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/uv.h>
+
+/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+int __bootdata_preserved(prot_virt_guest);
+#endif
+
+/*
+ * uv_info contains both host and guest information, but is currently only
+ * expected to be used from modules by KVM and by PV guest modules.
+ *
+ * The kernel itself will write these values once in uv_query_info()
+ * and then make some of them readable via a sysfs interface.
+ */
+struct uv_info __bootdata_preserved(uv_info);
+EXPORT_SYMBOL(uv_info);
+
+#if IS_ENABLED(CONFIG_KVM)
+int __bootdata_preserved(prot_virt_host);
+EXPORT_SYMBOL(prot_virt_host);
+
+static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len)
+{
+	struct uv_cb_init uvcb = {
+		.header.cmd = UVC_CMD_INIT_UV,
+		.header.len = sizeof(uvcb),
+		.stor_origin = stor_base,
+		.stor_len = stor_len,
+	};
+
+	if (uv_call(0, (uint64_t)&uvcb)) {
+		pr_err("Ultravisor init failed with rc: 0x%x rrc: 0x%x\n",
+		       uvcb.header.rc, uvcb.header.rrc);
+		return -1;
+	}
+	return 0;
+}
+
+void __init setup_uv(void)
+{
+	void *uv_stor_base;
+
+	if (!is_prot_virt_host())
+		return;
+
+	uv_stor_base = memblock_alloc_try_nid(
+		uv_info.uv_base_stor_len, SZ_1M, SZ_2G,
+		MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+	if (!uv_stor_base) {
+		pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n",
+			uv_info.uv_base_stor_len);
+		goto fail;
+	}
+
+	if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) {
+		memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+		goto fail;
+	}
+
+	pr_info("Reserving %luMB as ultravisor base storage\n",
+		uv_info.uv_base_stor_len >> 20);
+	return;
+fail:
+	pr_info("Disabling support for protected virtualization\n");
+	prot_virt_host = 0;
+}
+
+/*
+ * Requests the Ultravisor to pin the page in the shared state. This will
+ * cause an intercept when the guest attempts to unshare the pinned page.
+ */
+int uv_pin_shared(unsigned long paddr)
+{
+	struct uv_cb_cfs uvcb = {
+		.header.cmd = UVC_CMD_PIN_PAGE_SHARED,
+		.header.len = sizeof(uvcb),
+		.paddr = paddr,
+	};
+
+	if (uv_call(0, (u64)&uvcb))
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(uv_pin_shared);
+
+/*
+ * Requests the Ultravisor to destroy a guest page and make it
+ * accessible to the host. The destroy clears the page instead of
+ * exporting.
+ *
+ * @paddr: Absolute host address of page to be destroyed
+ */
+static int uv_destroy_page(unsigned long paddr)
+{
+	struct uv_cb_cfs uvcb = {
+		.header.cmd = UVC_CMD_DESTR_SEC_STOR,
+		.header.len = sizeof(uvcb),
+		.paddr = paddr
+	};
+
+	if (uv_call(0, (u64)&uvcb)) {
+		/*
+		 * Older firmware uses 107/d as an indication of a non-secure
+		 * page. Let us emulate the newer variant (no-op).
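All Ultravisor services in this file follow one calling convention: fill a control block whose header carries the command number and total length, issue the call, then interpret the rc/rrc pair the firmware wrote back. A compact user-space model of that flow, with an invented structure layout, a placeholder command number and a stubbed-out call, might look like this:

#include <stdint.h>
#include <stdio.h>

struct demo_uv_cb_header {
	uint16_t len;
	uint16_t cmd;
	uint16_t rc;	/* return code set by the Ultravisor */
	uint16_t rrc;	/* return reason code set by the Ultravisor */
};

/* Stand-in for uv_call(): pretend the firmware answered rc=0x107/rrc=0xd. */
static int demo_uv_call(struct demo_uv_cb_header *uvcb)
{
	uvcb->rc = 0x107;
	uvcb->rrc = 0xd;
	return 1;	/* non-zero condition code: command did not succeed */
}

static int demo_destroy_page(void)
{
	struct demo_uv_cb_header uvcb = {
		.len = sizeof(uvcb),
		.cmd = 0x0401,	/* placeholder command number for the demo */
	};

	if (demo_uv_call(&uvcb)) {
		/* Older firmware reports 107/d for a non-secure page: no-op. */
		if (uvcb.rc == 0x107 && uvcb.rrc == 0xd)
			return 0;
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("demo_destroy_page() = %d\n", demo_destroy_page());
	return 0;
}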
+ */ + if (uvcb.header.rc == 0x107 && uvcb.header.rrc == 0xd) + return 0; + return -EINVAL; + } + return 0; +} + +/* + * The caller must already hold a reference to the page + */ +int uv_destroy_owned_page(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_destroy_page(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* + * Requests the Ultravisor to encrypt a guest page and make it + * accessible to the host for paging (export). + * + * @paddr: Absolute host address of page to be exported + */ +int uv_convert_from_secure(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, + .header.len = sizeof(uvcb), + .paddr = paddr + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* + * The caller must already hold a reference to the page + */ +int uv_convert_owned_from_secure(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_convert_from_secure(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* + * Calculate the expected ref_count for a page that would otherwise have no + * further pins. This was cribbed from similar functions in other places in + * the kernel, but with some slight modifications. We know that a secure + * page can not be a huge page for example. + */ +static int expected_page_refs(struct page *page) +{ + int res; + + res = page_mapcount(page); + if (PageSwapCache(page)) { + res++; + } else if (page_mapping(page)) { + res++; + if (page_has_private(page)) + res++; + } + return res; +} + +static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) +{ + int expected, cc = 0; + + if (PageWriteback(page)) + return -EAGAIN; + expected = expected_page_refs(page); + if (!page_ref_freeze(page, expected)) + return -EBUSY; + set_bit(PG_arch_1, &page->flags); + /* + * If the UVC does not succeed or fail immediately, we don't want to + * loop for long, or we might get stall notifications. + * On the other hand, this is a complex scenario and we are holding a lot of + * locks, so we can't easily sleep and reschedule. We try only once, + * and if the UVC returned busy or partial completion, we return + * -EAGAIN and we let the callers deal with it. + */ + cc = __uv_call(0, (u64)uvcb); + page_ref_unfreeze(page, expected); + /* + * Return -ENXIO if the page was not mapped, -EINVAL for other errors. + * If busy or partially completed, return -EAGAIN. + */ + if (cc == UVC_CC_OK) + return 0; + else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL) + return -EAGAIN; + return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; +} + +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. 
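The page_ref_freeze()/page_ref_unfreeze() pair in make_page_secure() above is, at its core, a compare-and-swap of the refcount against the computed expected value: the freeze succeeds only if nobody holds an unexpected reference. A minimal C11 sketch of that idea (not the kernel implementation; memory-ordering details elided):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool demo_ref_freeze(atomic_int *refcount, int expected)
{
	int old = expected;

	/* Succeeds only if nobody gained or dropped a reference meanwhile. */
	return atomic_compare_exchange_strong(refcount, &old, 0);
}

static void demo_ref_unfreeze(atomic_int *refcount, int count)
{
	atomic_store(refcount, count);
}

int main(void)
{
	atomic_int refs = 3;

	if (demo_ref_freeze(&refs, 3)) {
		puts("frozen: safe to change the page's state");
		demo_ref_unfreeze(&refs, 3);
	} else {
		puts("-EBUSY: unexpected extra reference, caller must back off");
	}
	return 0;
}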
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+	/*
+	 * The misc feature indicates, among other things, that importing a
+	 * shared page from a different protected VM will automatically also
+	 * transfer its ownership.
+	 */
+	if (uv_has_feature(BIT_UV_FEAT_MISC))
+		return false;
+	if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+		return false;
+	return atomic_read(&mm->context.protected_count) > 1;
+}
+
+/*
+ * Requests the Ultravisor to make a page accessible to a guest.
+ * If it is brought in for the first time, it will be cleared. If
+ * it has been exported before, it will be decrypted and integrity
+ * checked.
+ */
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+{
+	struct vm_area_struct *vma;
+	bool local_drain = false;
+	spinlock_t *ptelock;
+	unsigned long uaddr;
+	struct page *page;
+	pte_t *ptep;
+	int rc;
+
+again:
+	rc = -EFAULT;
+	mmap_read_lock(gmap->mm);
+
+	uaddr = __gmap_translate(gmap, gaddr);
+	if (IS_ERR_VALUE(uaddr))
+		goto out;
+	vma = vma_lookup(gmap->mm, uaddr);
+	if (!vma)
+		goto out;
+	/*
+	 * Secure pages cannot be huge, and userspace should not combine both.
+	 * In case userspace does it anyway, this will result in an -EFAULT for
+	 * the unpack. The guest thus never reaches secure mode. If
+	 * userspace plays dirty tricks with mapping huge pages later
+	 * on, this will result in a segmentation fault.
+	 */
+	if (is_vm_hugetlb_page(vma))
+		goto out;
+
+	rc = -ENXIO;
+	ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
+	if (!ptep)
+		goto out;
+	if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
+		page = pte_page(*ptep);
+		rc = -EAGAIN;
+		if (trylock_page(page)) {
+			if (should_export_before_import(uvcb, gmap->mm))
+				uv_convert_from_secure(page_to_phys(page));
+			rc = make_page_secure(page, uvcb);
+			unlock_page(page);
+		}
+	}
+	pte_unmap_unlock(ptep, ptelock);
+out:
+	mmap_read_unlock(gmap->mm);
+
+	if (rc == -EAGAIN) {
+		/*
+		 * If we are here because the UVC returned busy or partial
+		 * completion, this is just a useless check, but it is safe.
+		 */
+		wait_on_page_writeback(page);
+	} else if (rc == -EBUSY) {
+		/*
+		 * If we have tried a local drain and the page refcount
+		 * still does not match our expected safe value, try with a
+		 * system wide drain. This is needed if the pagevecs holding
+		 * the page are on a different CPU.
+		 */
+		if (local_drain) {
+			lru_add_drain_all();
+			/* We give up here, and let the caller try again */
+			return -EAGAIN;
+		}
+		/*
+		 * We are here if the page refcount does not match the
+		 * expected safe value. The main culprits are usually
+		 * pagevecs. With lru_add_drain() we drain the pagevecs
+		 * on the local CPU so that hopefully the refcount will
+		 * reach the expected safe value.
+		 */
+		lru_add_drain();
+		local_drain = true;
+		/* And now we try again immediately after draining */
+		goto again;
+	} else if (rc == -ENXIO) {
+		if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
+			return -EFAULT;
+		return -EAGAIN;
+	}
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_make_secure);
+
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+{
+	struct uv_cb_cts uvcb = {
+		.header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+		.header.len = sizeof(uvcb),
+		.guest_handle = gmap->guest_handle,
+		.gaddr = gaddr,
+	};
+
+	return gmap_make_secure(gmap, gaddr, &uvcb);
+}
+EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
+
+/**
+ * gmap_destroy_page - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ */
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+{
+	struct vm_area_struct *vma;
+	unsigned long uaddr;
+	struct page *page;
+	int rc;
+
+	rc = -EFAULT;
+	mmap_read_lock(gmap->mm);
+
+	uaddr = __gmap_translate(gmap, gaddr);
+	if (IS_ERR_VALUE(uaddr))
+		goto out;
+	vma = vma_lookup(gmap->mm, uaddr);
+	if (!vma)
+		goto out;
+	/*
+	 * Huge pages should not be able to become secure
+	 */
+	if (is_vm_hugetlb_page(vma))
+		goto out;
+
+	rc = 0;
+	/* we take an extra reference here */
+	page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
+	if (IS_ERR_OR_NULL(page))
+		goto out;
+	rc = uv_destroy_owned_page(page_to_phys(page));
+	/*
+	 * Fault handlers can race; it is possible that two CPUs will fault
+	 * on the same secure page. One CPU can destroy the page, reboot,
+	 * re-enter secure mode and import it, while the second CPU was
+	 * stuck at the beginning of the handler. At some point the second
+	 * CPU will be able to progress, and it will not be able to destroy
+	 * the page. In that case we do not want to terminate the process,
+	 * we instead try to export the page.
+	 */
+	if (rc)
+		rc = uv_convert_owned_from_secure(page_to_phys(page));
+	put_page(page);
+out:
+	mmap_read_unlock(gmap->mm);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_destroy_page);
+
+/*
+ * To be called with the page locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the page concurrently. Having two
+ * parallel calls to make_page_accessible is fine, as the UV calls will
+ * become a no-op if the page is already exported.
+ */
+int arch_make_page_accessible(struct page *page)
+{
+	int rc = 0;
+
+	/* Hugepage cannot be protected, so nothing to do */
+	if (PageHuge(page))
+		return 0;
+
+	/*
+	 * PG_arch_1 is used in 3 places:
+	 * 1. For kernel page tables during early boot
+	 * 2. For storage keys of huge pages and KVM
+	 * 3. As an indication that this page might be secure. This can
+	 *    overindicate, e.g. we set the bit before calling
+	 *    convert_to_secure.
+	 * As secure pages are never huge, all 3 variants can co-exist.
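Callers of gmap_make_secure() and gmap_convert_to_secure() are expected to treat -EAGAIN as transient and retry the same guest address, while any other error is final (the "We give up here, and let the caller try again" comment above spells this out). A hypothetical caller-side sketch of that contract, with an invented demo_* name, could look like this:

/* Illustrative only: loops on the transient -EAGAIN verdict. */
static int demo_convert_with_retry(struct gmap *gmap, unsigned long gaddr)
{
	int rc;

	do {
		rc = gmap_convert_to_secure(gmap, gaddr);
		if (rc == -EAGAIN)
			cond_resched();	/* drains, faults or writeback may need time */
	} while (rc == -EAGAIN);

	return rc;	/* 0 on success, e.g. -EFAULT on a bad mapping */
}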
+ */ + if (!test_bit(PG_arch_1, &page->flags)) + return 0; + + rc = uv_pin_shared(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + rc = uv_convert_from_secure(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + return rc; +} +EXPORT_SYMBOL_GPL(arch_make_page_accessible); + +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +static ssize_t uv_query_facilities(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); +} + +static struct kobj_attribute uv_query_facilities_attr = + __ATTR(facilities, 0444, uv_query_facilities, NULL); + +static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_se_hdr_ver_attr = + __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL); + +static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf); +} + +static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = + __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); + +static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len); +} + +static struct kobj_attribute uv_query_dump_cpu_len_attr = + __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); + +static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len); +} + +static struct kobj_attribute uv_query_dump_storage_state_len_attr = + __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); + +static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len); +} + +static struct kobj_attribute uv_query_dump_finalize_len_attr = + __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL); + +static ssize_t uv_query_feature_indications(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications); +} + +static struct kobj_attribute uv_query_feature_indications_attr = + __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); + +static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1); +} + +static struct kobj_attribute uv_query_max_guest_cpus_attr = + __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); + +static ssize_t uv_query_max_guest_vms(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf); +} + +static struct kobj_attribute uv_query_max_guest_vms_attr = + __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); + +static ssize_t uv_query_max_guest_addr(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr); +} + +static struct kobj_attribute uv_query_max_guest_addr_attr = + 
__ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); + +static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + +static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver); +} + +static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr = + __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL); + +static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf); +} + +static struct kobj_attribute uv_query_supp_add_secret_pcf_attr = + __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL); + +static ssize_t uv_query_supp_secret_types(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types); +} + +static struct kobj_attribute uv_query_supp_secret_types_attr = + __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL); + +static ssize_t uv_query_max_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_secrets); +} + +static struct kobj_attribute uv_query_max_secrets_attr = + __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL); + +static struct attribute *uv_query_attrs[] = { + &uv_query_facilities_attr.attr, + &uv_query_feature_indications_attr.attr, + &uv_query_max_guest_cpus_attr.attr, + &uv_query_max_guest_vms_attr.attr, + &uv_query_max_guest_addr_attr.attr, + &uv_query_supp_se_hdr_ver_attr.attr, + &uv_query_supp_se_hdr_pcf_attr.attr, + &uv_query_dump_storage_state_len_attr.attr, + &uv_query_dump_finalize_len_attr.attr, + &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, + &uv_query_supp_add_secret_req_ver_attr.attr, + &uv_query_supp_add_secret_pcf_attr.attr, + &uv_query_supp_secret_types_attr.attr, + &uv_query_max_secrets_attr.attr, + NULL, +}; + +static struct attribute_group uv_query_attr_group = { + .attrs = uv_query_attrs, +}; + +static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int val = 0; + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST + val = prot_virt_guest; +#endif + return sysfs_emit(buf, "%d\n", val); +} + +static ssize_t uv_is_prot_virt_host(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int val = 0; + +#if IS_ENABLED(CONFIG_KVM) + val = prot_virt_host; +#endif + + return sysfs_emit(buf, "%d\n", val); +} + +static struct kobj_attribute uv_prot_virt_guest = + __ATTR(prot_virt_guest, 0444, uv_is_prot_virt_guest, NULL); + +static struct kobj_attribute uv_prot_virt_host = + __ATTR(prot_virt_host, 0444, uv_is_prot_virt_host, NULL); + +static const struct attribute *uv_prot_virt_attrs[] = { + &uv_prot_virt_guest.attr, + &uv_prot_virt_host.attr, + NULL, 
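Every attribute above repeats one idiom: a show callback that formats a single uv_info field with sysfs_emit(), paired with a read-only kobj_attribute. A reduced kernel-style sketch of the pattern, with invented names and a registration step like the one in uv_info_init() below:

/* Sketch only: demo_value, demo_kobj and the file name are invented. */
static unsigned long demo_value;

static ssize_t demo_value_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", demo_value);
}

static struct kobj_attribute demo_value_attr =
	__ATTR(demo_value, 0444, demo_value_show, NULL);

static int __init demo_sysfs_init(void)
{
	struct kobject *demo_kobj;

	demo_kobj = kobject_create_and_add("demo", firmware_kobj);
	if (!demo_kobj)
		return -ENOMEM;
	/* creates a read-only /sys/firmware/demo/demo_value */
	return sysfs_create_file(demo_kobj, &demo_value_attr.attr);
}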
+}; + +static struct kset *uv_query_kset; +static struct kobject *uv_kobj; + +static int __init uv_info_init(void) +{ + int rc = -ENOMEM; + + if (!test_facility(158)) + return 0; + + uv_kobj = kobject_create_and_add("uv", firmware_kobj); + if (!uv_kobj) + return -ENOMEM; + + rc = sysfs_create_files(uv_kobj, uv_prot_virt_attrs); + if (rc) + goto out_kobj; + + uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); + if (!uv_query_kset) { + rc = -ENOMEM; + goto out_ind_files; + } + + rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); + if (!rc) + return 0; + + kset_unregister(uv_query_kset); +out_ind_files: + sysfs_remove_files(uv_kobj, uv_prot_virt_attrs); +out_kobj: + kobject_del(uv_kobj); + kobject_put(uv_kobj); + return rc; +} +device_initcall(uv_info_init); +#endif diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index bcc9bdb39ba2..bbaefd84f15e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -6,269 +6,253 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#include <linux/init.h> +#include <linux/binfmts.h> +#include <linux/compat.h> +#include <linux/elf.h> #include <linux/errno.h> -#include <linux/sched.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/unistd.h> #include <linux/slab.h> -#include <linux/user.h> -#include <linux/elf.h> -#include <linux/security.h> -#include <linux/memblock.h> -#include <linux/compat.h> -#include <asm/asm-offsets.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/mmu.h> -#include <asm/mmu_context.h> -#include <asm/sections.h> +#include <linux/smp.h> +#include <linux/time_namespace.h> +#include <linux/random.h> +#include <vdso/datapage.h> #include <asm/vdso.h> -#include <asm/facility.h> -extern char vdso64_start, vdso64_end; -static void *vdso64_kbase = &vdso64_start; -static unsigned int vdso64_pages; -static struct page **vdso64_pagelist; +extern char vdso64_start[], vdso64_end[]; +extern char vdso32_start[], vdso32_end[]; -/* - * Should the kernel map a VDSO page into processes and pass its - * address down to glibc upon exec()? 
- */ -unsigned int __read_mostly vdso_enabled = 1; - -static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page **vdso_pagelist; - unsigned long vdso_pages; - - vdso_pagelist = vdso64_pagelist; - vdso_pages = vdso64_pages; - - if (vmf->pgoff >= vdso_pages) - return VM_FAULT_SIGBUS; +static struct vm_special_mapping vvar_mapping; - vmf->page = vdso_pagelist[vmf->pgoff]; - get_page(vmf->page); - return 0; -} - -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *vma) -{ - unsigned long vdso_pages; - - vdso_pages = vdso64_pages; - - if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start) - return -EINVAL; +static union { + struct vdso_data data[CS_BASES]; + u8 page[PAGE_SIZE]; +} vdso_data_store __page_aligned_data; - if (WARN_ON_ONCE(current->mm != vma->vm_mm)) - return -EFAULT; +struct vdso_data *vdso_data = vdso_data_store.data; - current->mm->context.vdso_base = vma->vm_start; - return 0; -} - -static const struct vm_special_mapping vdso_mapping = { - .name = "[vdso]", - .fault = vdso_fault, - .mremap = vdso_mremap, +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, }; -static int __init vdso_setup(char *str) +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) { - bool enabled; - - if (!kstrtobool(str, &enabled)) - vdso_enabled = enabled; - return 1; + return (struct vdso_data *)(vvar_page); } -__setup("vdso=", vdso_setup); - -/* - * The vdso data page - */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; /* - * Setup vdso data page. + * The VVAR page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_data() for details. */ -static void __init vdso_init_data(struct vdso_data *vd) +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { - vd->ectg_available = test_facility(31); -} - -/* - * Allocate/free per cpu vdso data. - */ -#define SEGMENT_ORDER 2 + struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; -/* - * The initial vdso_data structure for the boot CPU. Eventually - * it is replaced with a properly allocated structure in vdso_init. - * This is necessary because a valid S390_lowcore.vdso_per_cpu_data - * pointer is required to be able to return from an interrupt or - * program check. See the exit paths in entry.S. - */ -struct vdso_data boot_vdso_data __initdata; + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + if (!vma_is_special_mapping(vma, &vvar_mapping)) + continue; + zap_vma_pages(vma); + break; + } + mmap_read_unlock(mm); + return 0; +} +#endif -void __init vdso_alloc_boot_cpu(struct lowcore *lowcore) +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) { - lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data; + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long addr, pfn; + vm_fault_t err; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + pfn = virt_to_pfn(vdso_data); + if (timens_page) { + /* + * Fault in VVAR page too, since it will be accessed + * to get clock data anyway. 
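The fault handler that follows dispatches on the page offset within the [vvar] area, and which physical page backs which offset depends on whether the task is in a time namespace. A condensed model of that dispatch (demo types and names, the real code returns pfns via vmf_insert_pfn()):

#include <stddef.h>

enum demo_vvar_pages { DEMO_DATA_PAGE, DEMO_TIMENS_PAGE };

static const void *demo_pick_page(int pgoff, const void *vdso_data,
				  const void *timens_page)
{
	switch (pgoff) {
	case DEMO_DATA_PAGE:
		/* root ns: real data; timens task: the namespace page */
		return timens_page ? timens_page : vdso_data;
	case DEMO_TIMENS_PAGE:
		/* only meaningful inside a time namespace */
		return timens_page ? vdso_data : NULL;	/* NULL -> SIGBUS */
	default:
		return NULL;
	}
}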
+ */ + addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = virt_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + return vmf_insert_pfn(vma, vmf->address, pfn); } -int vdso_alloc_per_cpu(struct lowcore *lowcore) +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *vma) { - unsigned long segment_table, page_table, page_frame; - struct vdso_per_cpu_data *vd; - - segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); - page_table = get_zeroed_page(GFP_KERNEL); - page_frame = get_zeroed_page(GFP_KERNEL); - if (!segment_table || !page_table || !page_frame) - goto out; - arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); - arch_set_page_dat(virt_to_page(page_table), 0); - - /* Initialize per-cpu vdso data page */ - vd = (struct vdso_per_cpu_data *) page_frame; - vd->cpu_nr = lowcore->cpu_nr; - vd->node_id = cpu_to_node(vd->cpu_nr); - - /* Set up page table for the vdso address space */ - memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES); - memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE); - - *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; - *(unsigned long *) page_table = _PAGE_PROTECT + page_frame; - - lowcore->vdso_asce = segment_table + - _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT; - lowcore->vdso_per_cpu_data = page_frame; - + current->mm->context.vdso_base = vma->vm_start; return 0; - -out: - free_page(page_frame); - free_page(page_table); - free_pages(segment_table, SEGMENT_ORDER); - return -ENOMEM; } -void vdso_free_per_cpu(struct lowcore *lowcore) -{ - unsigned long segment_table, page_table, page_frame; +static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; - segment_table = lowcore->vdso_asce & PAGE_MASK; - page_table = *(unsigned long *) segment_table; - page_frame = *(unsigned long *) page_table; +static struct vm_special_mapping vdso64_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +static struct vm_special_mapping vdso32_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; - free_page(page_frame); - free_page(page_table); - free_pages(segment_table, SEGMENT_ORDER); +int vdso_getcpu_init(void) +{ + set_tod_programmable_field(smp_processor_id()); + return 0; } +early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ -/* - * This is called from binfmt_elf, we create the special vma for the - * vDSO and insert it into the mm struct tree - */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) { + unsigned long vvar_start, vdso_text_start, vdso_text_len; + struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_pages; - unsigned long vdso_base; int rc; - if (!vdso_enabled) - return 0; - - if (is_compat_task()) - return 0; - - vdso_pages = vdso64_pages; - /* - * vDSO has a 
problem and was disabled, just don't "enable" it for - * the process - */ - if (vdso_pages == 0) - return 0; - - /* - * pick a base address for the vDSO in process space. We try to put - * it at vdso_base which is the "natural" base for it, but we might - * fail and end up putting it elsewhere. - */ - if (down_write_killable(&mm->mmap_sem)) + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0); - if (IS_ERR_VALUE(vdso_base)) { - rc = vdso_base; - goto out_up; - } - /* - * our vma flags don't have VM_WRITE so by default, the process - * isn't allowed to write those pages. - * gdb can break that with ptrace interface, and thus trigger COW - * on those pages but it's then your responsibility to never do that - * on the "data" page of the vDSO or you'll stop getting kernel - * updates and your nice userland gettimeofday will be totally dead. - * It's fine to use that for setting breakpoints in the vDSO code - * pages though. - */ - vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + if (is_compat_task()) { + vdso_text_len = vdso32_end - vdso32_start; + vdso_mapping = &vdso32_mapping; + } else { + vdso_text_len = vdso64_end - vdso64_start; + vdso_mapping = &vdso64_mapping; + } + vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); + rc = vvar_start; + if (IS_ERR_VALUE(vvar_start)) + goto out; + vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, + &vvar_mapping); + rc = PTR_ERR(vma); + if (IS_ERR(vma)) + goto out; + vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + /* VM_MAYWRITE for COW so gdb can set breakpoints */ + vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_mapping); + vdso_mapping); if (IS_ERR(vma)) { + do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); - goto out_up; + } else { + current->mm->context.vdso_base = vdso_text_start; + rc = 0; } +out: + mmap_write_unlock(mm); + return rc; +} - current->mm->context.vdso_base = vdso_base; - rc = 0; +static unsigned long vdso_addr(unsigned long start, unsigned long len) +{ + unsigned long addr, end, offset; -out_up: - up_write(&mm->mmap_sem); - return rc; + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. 
*/ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= VDSO_BASE) + end = VDSO_BASE; + end -= len; + + if (end > start) { + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + return addr; } -static int __init vdso_init(void) +unsigned long vdso_size(void) { - int i; + unsigned long size = VVAR_NR_PAGES * PAGE_SIZE; - vdso_init_data(vdso_data); + if (is_compat_task()) + size += vdso32_end - vdso32_start; + else + size += vdso64_end - vdso64_start; + return PAGE_ALIGN(size); +} - /* Calculate the size of the 64 bit vDSO */ - vdso64_pages = ((&vdso64_end - &vdso64_start - + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + unsigned long addr = VDSO_BASE; + unsigned long size = vdso_size(); - /* Make sure pages are in the correct state */ - vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), - GFP_KERNEL); - BUG_ON(vdso64_pagelist == NULL); - for (i = 0; i < vdso64_pages - 1; i++) { - struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); - get_page(pg); - vdso64_pagelist[i] = pg; - } - vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data); - vdso64_pagelist[vdso64_pages] = NULL; - if (vdso_alloc_per_cpu(&S390_lowcore)) - BUG(); + if (current->flags & PF_RANDOMIZE) + addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size); + return map_vdso(addr, size); +} - get_page(virt_to_page(vdso_data)); +static struct page ** __init vdso_setup_pages(void *start, void *end) +{ + int pages = (end - start) >> PAGE_SHIFT; + struct page **pagelist; + int i; + pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VDSO", __func__); + for (i = 0; i < pages; i++) + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + return pagelist; +} + +static int __init vdso_init(void) +{ + vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); + if (IS_ENABLED(CONFIG_COMPAT)) + vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); return 0; } -early_initcall(vdso_init); +arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..5167384843b9 --- /dev/null +++ b/arch/s390/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso32.lds diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile new file mode 100644 index 000000000000..caec7db6f966 --- /dev/null +++ b/arch/s390/kernel/vdso32/Makefile @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +KCOV_INSTRUMENT := n + +# Include the generic Makefile to check the built vdso. 
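The vdso_addr() logic shown above can be modeled in user space: align the stack-derived start, round the end up to a PMD boundary, clamp it to the fixed upper bound, then pick a random page-aligned slot in between. In this sketch PMD_SIZE and VDSO_BASE are made-up demo values, and rand() stands in for get_random_u32_below():

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	0x1000UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PMD_SIZE	0x100000UL		/* demo value */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define VDSO_BASE	0x20000000000UL		/* demo upper bound */

static unsigned long demo_vdso_addr(unsigned long start, unsigned long len)
{
	unsigned long addr, end, offset;

	start = (start + PAGE_SIZE - 1) & PAGE_MASK;	/* PAGE_ALIGN() */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= VDSO_BASE)
		end = VDSO_BASE;
	end -= len;

	if (end > start) {
		offset = rand() % (((end - start) >> 12) + 1);
		addr = start + (offset << 12);	/* 12 == demo PAGE_SHIFT */
	} else {
		addr = start;
	}
	return addr;
}

int main(void)
{
	printf("vdso at %#lx\n", demo_vdso_addr(0x3ffff000000UL, 5 * PAGE_SIZE));
	return 0;
}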
+include $(srctree)/lib/vdso/Makefile +obj-vdso32 = vdso_user_wrapper-32.o note-32.o + +# Build rules + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 += -m31 -s + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin + +LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \ + --hash-style=both --build-id=sha1 -melf_s390 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) + +obj-y += vdso32_wrapper.o +targets += vdso32.lds +CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,vdso_and_check) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj-vdso32): %-32.o: %.S FORCE + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso32cc = VDSO32C $@ + cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh new file mode 100755 index 000000000000..9c4f951e227d --- /dev/null +++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S new file mode 100644 index 000000000000..db19d0680a0a --- /dev/null +++ b/arch/s390/kernel/vdso32/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. 
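For context on the VDSOSYM step above: the sed rule in gen_vdso_offsets.sh turns nm output such as "000003e4 T __kernel_compat_sigreturn" into a header line like the first define below (the address here is invented). A consumer can then add the offset to the mapped vdso base; the accessor macro shown is a hypothetical stand-in, not necessarily the kernel's actual helper.

#define vdso32_offset_sigreturn	0x3e4	/* sample generated output */

#define DEMO_VDSO32_SYMBOL(base, name) \
	((unsigned long)(base) + vdso32_offset_##name)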
+ */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S new file mode 100644 index 000000000000..edf5ff1debe1 --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ + +#include <asm/page.h> +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +OUTPUT_ARCH(s390:31-bit) +ENTRY(_start) + +SECTIONS +{ + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + .got.plt ALIGN(8) : { *(.got.plt) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. 
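The constant defined just below for old linkers can be cross-checked against the official ELF definition with a trivial user-space program (glibc's elf.h assumed):

#include <elf.h>
#include <stdio.h>

int main(void)
{
	/* Prints 6474e550, matching the hard-coded linker-script value. */
	printf("PT_GNU_EH_FRAME = %x\n", PT_GNU_EH_FRAME);
	return 0;
}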
+ */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_compat_restart_syscall; + __kernel_compat_rt_sigreturn; + __kernel_compat_sigreturn; + local: *; + }; +} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 000000000000..de2fb930471a --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/s390/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S new file mode 100644 index 000000000000..2e645003fdaf --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> + +.macro vdso_syscall func,syscall + .globl __kernel_compat_\func + .type __kernel_compat_\func,@function + __ALIGN +__kernel_compat_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_compat_\func,.-__kernel_compat_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore index 3fd18cf9fec2..4ec80685fecc 100644 --- a/arch/s390/kernel/vdso64/.gitignore +++ b/arch/s390/kernel/vdso64/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso64.lds diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index bec19e7e6e1c..e3c9085f8fa7 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -1,44 +1,56 @@ # SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso, has to be asm only for now +# List of files in the vdso KCOV_INSTRUMENT := n -obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o +# Include the generic Makefile to check the built vdso. 
+include $(srctree)/lib/vdso/Makefile +obj-vdso64 = vdso_user_wrapper.o note.o +obj-cvdso64 = vdso64_generic.o getcpu.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) +CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) # Build rules -targets := $(obj-vdso64) vdso64.so vdso64.so.dbg +targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) +obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64)) KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_64 += -m64 -s +KBUILD_AFLAGS_64 += -m64 KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin -KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ - -Wl,--hash-style=both +KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin +ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \ + --hash-style=both --build-id=sha1 -T $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) obj-y += vdso64_wrapper.o -extra-y += vdso64.lds +targets += vdso64.lds CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) # Disable gcov profiling, ubsan and kasan for VDSO code GCOV_PROFILE := n UBSAN_SANITIZE := n KASAN_SANITIZE := n +KCSAN_SANITIZE := n # Force dependency (incbin is bad) $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + # link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE - $(call if_changed,vdso64ld) +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE + $(call if_changed,vdso_and_check) # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -49,18 +61,19 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(obj-vdso64): %.o: %.S FORCE $(call if_changed_dep,vdso64as) +$(obj-cvdso64): %.o: %.c FORCE + $(call if_changed_dep,vdso64cc) + # actual build commands -quiet_cmd_vdso64ld = VDSO64L $@ - cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $(filter %.lds %.o,$^) -o $@ quiet_cmd_vdso64as = VDSO64A $@ cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso64cc = VDSO64C $@ + cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso64.so: $(obj)/vdso64.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ -vdso_install: vdso64.so +include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S deleted file mode 100644 index 081435398e0a..000000000000 --- a/arch/s390/kernel/vdso64/clock_getres.S +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of clock_getres() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 
2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> - - .text - .align 4 - .globl __kernel_clock_getres - .type __kernel_clock_getres,@function -__kernel_clock_getres: - CFI_STARTPROC - larl %r1,4f - cghi %r2,__CLOCK_REALTIME_COARSE - je 0f - cghi %r2,__CLOCK_MONOTONIC_COARSE - je 0f - larl %r1,3f - cghi %r2,__CLOCK_REALTIME - je 0f - cghi %r2,__CLOCK_MONOTONIC - je 0f - cghi %r2,__CLOCK_THREAD_CPUTIME_ID - je 0f - cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ - jne 2f - larl %r5,_vdso_data - icm %r0,15,__LC_ECTG_OK(%r5) - jz 2f -0: ltgr %r3,%r3 - jz 1f /* res == NULL */ - lg %r0,0(%r1) - xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */ - stg %r0,8(%r3) /* store tp->tv_usec */ -1: lghi %r2,0 - br %r14 -2: lghi %r1,__NR_clock_getres /* fallback to svc */ - svc 0 - br %r14 - CFI_ENDPROC -3: .quad __CLOCK_REALTIME_RES -4: .quad __CLOCK_COARSE_RES - .size __kernel_clock_getres,.-__kernel_clock_getres diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S deleted file mode 100644 index 9d2ee79b90f2..000000000000 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ /dev/null @@ -1,163 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of clock_gettime() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> -#include <asm/ptrace.h> - - .text - .align 4 - .globl __kernel_clock_gettime - .type __kernel_clock_gettime,@function -__kernel_clock_gettime: - CFI_STARTPROC - aghi %r15,-16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - larl %r5,_vdso_data - cghi %r2,__CLOCK_REALTIME_COARSE - je 4f - cghi %r2,__CLOCK_REALTIME - je 5f - cghi %r2,-3 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ - je 9f - cghi %r2,__CLOCK_MONOTONIC_COARSE - je 3f - cghi %r2,__CLOCK_MONOTONIC - jne 12f - - /* CLOCK_MONOTONIC */ -0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 0b - stcke 0(%r15) /* Store TOD clock */ - lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - lg %r0,__VDSO_WTOM_SEC(%r5) - lg %r1,1(%r15) - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_WTOM_NSEC(%r5) - srlg %r1,%r1,0(%r2) /* >> tk->shift */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 0b - larl %r5,13f -1: clg %r1,0(%r5) - jl 2f - slg %r1,0(%r5) - aghi %r0,1 - j 1b -2: stg %r0,0(%r3) /* store tp->tv_sec */ - stg %r1,8(%r3) /* store tp->tv_nsec */ - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* CLOCK_MONOTONIC_COARSE */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -3: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 3b - lg %r0,__VDSO_WTOM_CRS_SEC(%r5) - lg %r1,__VDSO_WTOM_CRS_NSEC(%r5) - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 3b - j 2b - - /* CLOCK_REALTIME_COARSE */ -4: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? 
loop */ - jnz 4b - lg %r0,__VDSO_XTIME_CRS_SEC(%r5) - lg %r1,__VDSO_XTIME_CRS_NSEC(%r5) - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 4b - j 7f - - /* CLOCK_REALTIME */ -5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 5b - stcke 0(%r15) /* Store TOD clock */ - lg %r1,1(%r15) - lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ - slgr %r0,%r1 /* now - ts_steering_end */ - ltgr %r0,%r0 /* past end of steering ? */ - jm 17f - srlg %r0,%r0,15 /* 1 per 2^16 */ - tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ - jz 18f - lcgr %r0,%r0 /* negative TOD offset */ -18: algr %r1,%r0 /* add steering offset */ -17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ - srlg %r1,%r1,0(%r2) /* >> tk->shift */ - lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 5b - larl %r5,13f -6: clg %r1,0(%r5) - jl 7f - slg %r1,0(%r5) - aghi %r0,1 - j 6b -7: stg %r0,0(%r3) /* store tp->tv_sec */ - stg %r1,8(%r3) /* store tp->tv_nsec */ - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* CPUCLOCK_VIRT for this thread */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -9: lghi %r4,0 - icm %r0,15,__VDSO_ECTG_OK(%r5) - jz 12f - sacf 256 /* Magic ectg instruction */ - .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 - sacf 0 - algr %r1,%r0 /* r1 = cputime as TOD value */ - mghi %r1,1000 /* convert to nanoseconds */ - srlg %r1,%r1,12 /* r1 = cputime in nanosec */ - lgr %r4,%r1 - larl %r5,13f - srlg %r1,%r1,9 /* divide by 1000000000 */ - mlg %r0,8(%r5) - srlg %r0,%r0,11 /* r0 = tv_sec */ - stg %r0,0(%r3) - msg %r0,0(%r5) /* calculate tv_nsec */ - slgr %r4,%r0 /* r4 = tv_nsec */ - stg %r4,8(%r3) - lghi %r2,0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - - /* Fallback to system call */ - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD -12: lghi %r1,__NR_clock_gettime - svc 0 - aghi %r15,16 - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD - CFI_RESTORE 15 - br %r14 - CFI_ENDPROC - -13: .quad 1000000000 -14: .quad 19342813113834067 - .size __kernel_clock_gettime,.-__kernel_clock_gettime diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh new file mode 100755 index 000000000000..37f05cb38dad --- /dev/null +++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S deleted file mode 100644 index 3c04f7328500..000000000000 --- a/arch/s390/kernel/vdso64/getcpu.S +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of getcpu() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 
2016 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/dwarf.h> - - .text - .align 4 - .globl __kernel_getcpu - .type __kernel_getcpu,@function -__kernel_getcpu: - CFI_STARTPROC - sacf 256 - lm %r4,%r5,__VDSO_GETCPU_VAL(%r0) - sacf 0 - ltgr %r2,%r2 - jz 2f - st %r5,0(%r2) -2: ltgr %r3,%r3 - jz 3f - st %r4,0(%r3) -3: lghi %r2,0 - br %r14 - CFI_ENDPROC - .size __kernel_getcpu,.-__kernel_getcpu diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c new file mode 100644 index 000000000000..5c5d4a848b76 --- /dev/null +++ b/arch/s390/kernel/vdso64/getcpu.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright IBM Corp. 2020 */ + +#include <linux/compiler.h> +#include <linux/getcpu.h> +#include <asm/timex.h> +#include "vdso.h" + +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + union tod_clock clk; + + /* CPU number is stored in the programmable field of the TOD clock */ + store_tod_clock_ext(&clk); + if (cpu) + *cpu = clk.pf; + /* NUMA node is always zero */ + if (node) + *node = 0; + return 0; +} diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S deleted file mode 100644 index aebe10dc7c99..000000000000 --- a/arch/s390/kernel/vdso64/gettimeofday.S +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Userland implementation of gettimeofday() for 64 bits processes in a - * s390 kernel for use in the vDSO - * - * Copyright IBM Corp. 2008 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#include <asm/vdso.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> -#include <asm/dwarf.h> -#include <asm/ptrace.h> - - .text - .align 4 - .globl __kernel_gettimeofday - .type __kernel_gettimeofday,@function -__kernel_gettimeofday: - CFI_STARTPROC - aghi %r15,-16 - CFI_ADJUST_CFA_OFFSET 16 - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - larl %r5,_vdso_data -0: ltgr %r3,%r3 /* check if tz is NULL */ - je 1f - mvc 0(8,%r3),__VDSO_TIMEZONE(%r5) -1: ltgr %r2,%r2 /* check if tv is NULL */ - je 4f - lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ - tmll %r4,0x0001 /* pending update ? loop */ - jnz 0b - stcke 0(%r15) /* Store TOD clock */ - lg %r1,1(%r15) - lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ - slgr %r0,%r1 /* now - ts_steering_end */ - ltgr %r0,%r0 /* past end of steering ? */ - jm 6f - srlg %r0,%r0,15 /* 1 per 2^16 */ - tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? 
*/ - jz 7f - lcgr %r0,%r0 /* negative TOD offset */ -7: algr %r1,%r0 /* add steering offset */ -6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ - alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ - lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ - clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ - jne 0b - lgf %r5,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ - srlg %r1,%r1,0(%r5) /* >> tk->shift */ - larl %r5,5f -2: clg %r1,0(%r5) - jl 3f - slg %r1,0(%r5) - aghi %r0,1 - j 2b -3: stg %r0,0(%r2) /* store tv->tv_sec */ - slgr %r0,%r0 /* tv_nsec -> tv_usec */ - ml %r0,8(%r5) - srlg %r0,%r0,6 - stg %r0,8(%r2) /* store tv->tv_usec */ -4: lghi %r2,0 - aghi %r15,16 - CFI_ADJUST_CFA_OFFSET -16 - CFI_RESTORE 15 - br %r14 - CFI_ENDPROC -5: .quad 1000000000 - .long 274877907 - .size __kernel_gettimeofday,.-__kernel_gettimeofday diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h new file mode 100644 index 000000000000..34c7a2312f9d --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H +#define __ARCH_S390_KERNEL_VDSO64_VDSO_H + +#include <vdso/datapage.h> + +struct getcpu_cache; + +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); +int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); +int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); + +#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 7ddb116b5e2e..4461ea151e49 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -13,7 +13,11 @@ ENTRY(_start) SECTIONS { - . = VDSO64_LBASE + SIZEOF_HEADERS; + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -47,6 +51,7 @@ SECTIONS .rela.dyn ALIGN(8) : { *(.rela.dyn) } .got ALIGN(8) : { *(.got .toc) } + .got.plt ALIGN(8) : { *(.got.plt) } _end = .; PROVIDE(end = .); @@ -94,9 +99,6 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . 
= ALIGN(PAGE_SIZE); - PROVIDE(_vdso_data = .); - /DISCARD/ : { *(.note.GNU-stack) *(.branch_lt) @@ -136,7 +138,9 @@ VERSION __kernel_clock_gettime; __kernel_clock_getres; __kernel_getcpu; - + __kernel_restart_syscall; + __kernel_rt_sigreturn; + __kernel_sigreturn; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso64/vdso64_generic.c new file mode 100644 index 000000000000..a9aa75643c08 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64_generic.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../../lib/vdso/gettimeofday.c" +#include "vdso.h" + +int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_getres(clock, ts); +} diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S new file mode 100644 index 000000000000..57f62596e53b --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/vdso.h> +#include <asm/unistd.h> +#include <asm/asm-offsets.h> +#include <asm/dwarf.h> +#include <asm/ptrace.h> + +#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8) + +/* + * Older glibc version called vdso without allocating a stackframe. This wrapper + * is just used to allocate a stackframe. See + * https://sourceware.org/git/?p=glibc.git;a=commit;h=478593e6374f3818da39332260dc453cb19cfa1e + * for details. + */ +.macro vdso_func func + .globl __kernel_\func + .type __kernel_\func,@function + __ALIGN +__kernel_\func: + CFI_STARTPROC + aghi %r15,-WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE) + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + stg %r14,STACK_FRAME_OVERHEAD(%r15) + brasl %r14,__s390_vdso_\func + lg %r14,STACK_FRAME_OVERHEAD(%r15) + aghi %r15,WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_func gettimeofday +vdso_func clock_getres +vdso_func clock_gettime +vdso_func getcpu + +.macro vdso_syscall func,syscall + .globl __kernel_\func + .type __kernel_\func,@function + __ALIGN +__kernel_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 37695499717d..e32ef446f451 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -5,16 +5,22 @@ #include <asm/thread_info.h> #include <asm/page.h> +#include <asm/ftrace.lds.h> /* * Put .bss..swapper_pg_dir as the first thing in .bss. This will * make sure it has 16k alignment. */ -#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) +#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \ + *(.bss..invalid_pg_dir) + +#define RO_EXCEPTION_TABLE_ALIGN 16 /* Handle ro_after_init data on our own. 
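 * Defining RO_AFTER_INIT_DATA as empty keeps the generic RO_DATA macro in
 * asm-generic/vmlinux.lds.h from emitting .data..ro_after_init into the
 * read-only output section; it is placed at the start of the writable data
 * section below instead and only write-protected once init has completed.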
*/ #define RO_AFTER_INIT_DATA +#define RUNTIME_DISCARD_EXIT + #define EMITS_PT_NOTE #include <asm-generic/vmlinux.lds.h> @@ -40,11 +46,11 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT + FTRACE_HOTPATCH_TRAMPOLINES_TEXT *(.text.*_indirect_*) *(.fixup) *(.gnu.warning) @@ -63,13 +69,20 @@ SECTIONS *(.data..ro_after_init) JUMP_TABLE_DATA } :data - EXCEPTION_TABLE(16) . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) BOOT_DATA_PRESERVED + . = ALIGN(8); + .amode31.refs : { + _start_amode31_refs = .; + *(.amode31.refs) + _end_amode31_refs = .; + } + + . = ALIGN(PAGE_SIZE); _edata = .; /* End of data section */ /* will be freed after init */ @@ -122,6 +135,7 @@ SECTIONS /* * Table with the patch locations to undo expolines */ + . = ALIGN(4); .nospec_call_table : { __nospec_call_start = . ; *(.s390_indirect*) @@ -135,6 +149,32 @@ SECTIONS BOOT_DATA + /* + * .amode31 section for code, data, ex_table that need to stay + * below 2 GB, even when the kernel is relocated above 2 GB. + */ + . = ALIGN(PAGE_SIZE); + _samode31 = .; + .amode31.text : { + _stext_amode31 = .; + *(.amode31.text) + *(.amode31.text.*_indirect_*) + . = ALIGN(PAGE_SIZE); + _etext_amode31 = .; + } + . = ALIGN(16); + .amode31.ex_table : { + _start_amode31_ex_table = .; + KEEP(*(.amode31.ex_table)) + _stop_amode31_ex_table = .; + } + . = ALIGN(PAGE_SIZE); + .amode31.data : { + *(.amode31.data) + } + . = ALIGN(PAGE_SIZE); + _eamode31 = .; + /* early.c uses stsi, which requires page aligned data. */ . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) @@ -157,6 +197,7 @@ SECTIONS BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE) + . = ALIGN(PAGE_SIZE); _end = . ; /* @@ -176,15 +217,28 @@ SECTIONS QUAD(__dynsym_start) /* dynsym_start */ QUAD(__rela_dyn_start) /* rela_dyn_start */ QUAD(__rela_dyn_end) /* rela_dyn_end */ + QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) +#ifdef CONFIG_KASAN + QUAD(kasan_early_shadow_page) + QUAD(kasan_early_shadow_pte) + QUAD(kasan_early_shadow_pmd) + QUAD(kasan_early_shadow_pud) + QUAD(kasan_early_shadow_p4d) +#endif } :NONE /* Debugging sections. 
*/ STABS_DEBUG DWARF_DEBUG + ELF_DETAILS /* Sections to be discarded */ DISCARDS /DISCARD/ : { *(.eh_frame) + *(.interp) } } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 8df10d3c8f6c..e0a88dcaf5cb 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -7,13 +7,13 @@ */ #include <linux/kernel_stat.h> -#include <linux/sched/cputime.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/types.h> #include <linux/time.h> - +#include <asm/alternative.h> +#include <asm/cputime.h> #include <asm/vtimer.h> #include <asm/vtime.h> #include <asm/cpu_mf.h> @@ -130,13 +130,10 @@ static int do_account_vtime(struct task_struct *tsk) clock = S390_lowcore.last_update_clock; asm volatile( " stpt %0\n" /* Store current cpu timer value */ -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES " stckf %1" /* Store current tod clock value */ -#else - " stck %1" /* Store current tod clock value */ -#endif : "=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock)); + "=Q" (S390_lowcore.last_update_clock) + : : "cc"); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; @@ -216,41 +213,56 @@ void vtime_flush(struct task_struct *tsk) avg_steal = S390_lowcore.avg_steal_timer / 2; if ((s64) steal > 0) { S390_lowcore.steal_timer = 0; - account_steal_time(steal); + account_steal_time(cputime_to_nsecs(steal)); avg_steal += steal; } S390_lowcore.avg_steal_timer = avg_steal; } +static u64 vtime_delta(void) +{ + u64 timer = S390_lowcore.last_update_timer; + + S390_lowcore.last_update_timer = get_vtimer(); + + return timer - S390_lowcore.last_update_timer; +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { - u64 timer; + u64 delta = vtime_delta(); - timer = S390_lowcore.last_update_timer; - S390_lowcore.last_update_timer = get_vtimer(); - timer -= S390_lowcore.last_update_timer; - - if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) - S390_lowcore.guest_timer += timer; - else if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; - else if (in_serving_softirq()) - S390_lowcore.softirq_timer += timer; + if (tsk->flags & PF_VCPU) + S390_lowcore.guest_timer += delta; else - S390_lowcore.system_timer += timer; + S390_lowcore.system_timer += delta; - virt_timer_forward(timer); + virt_timer_forward(delta); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); - -void vtime_account_kernel(struct task_struct *tsk) -__attribute__((alias("vtime_account_irq_enter"))); EXPORT_SYMBOL_GPL(vtime_account_kernel); +void vtime_account_softirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.softirq_timer += delta; + + virt_timer_forward(delta); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.hardirq_timer += delta; + + virt_timer_forward(delta); +} + /* * Sorted add to a list. List is linear searched until first bigger * element is found. diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index d3db3d7ed077..72e9b7dcdf7d 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -7,7 +7,7 @@ source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION def_bool y prompt "KVM" - ---help--- + help Say Y here to get to see options for using your Linux host to run other operating systems inside virtual machines (guests). 
This option alone does not add any kernel code. @@ -20,20 +20,18 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM - select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL - select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC + select KVM_COMMON select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL - select SRCU select KVM_VFIO - ---help--- + select MMU_NOTIFIER + help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work on any 64bit machine. @@ -49,14 +47,10 @@ config KVM config KVM_S390_UCONTROL bool "Userspace controlled virtual machines" depends on KVM - ---help--- + help Allow CAP_SYS_ADMIN users to create KVM virtual machines that are controlled by userspace. If unsure, say N. -# OK, it's a little counter-intuitive to do this, but it puts it neatly under -# the virtualization menu. -source "drivers/vhost/Kconfig" - endif # VIRTUALIZATION diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 05ee90a5ea08..02217fb4ae10 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -3,12 +3,12 @@ # # Copyright IBM Corp. 2008 -KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o +include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm -kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o vsie.o +kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3fb54ec2cf3e..3c65b8258ae6 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -2,7 +2,7 @@ /* * handling diagnose instructions * - * Copyright IBM Corp. 2008, 2011 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -10,7 +10,6 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> -#include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/virtio-ccw.h> #include "kvm-s390.h" @@ -25,7 +24,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; - vcpu->stat.diagnose_10++; + vcpu->stat.instruction_diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end || start < 2 * PAGE_SIZE) @@ -75,7 +74,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx", vcpu->run->s.regs.gprs[rx]); - vcpu->stat.diagnose_258++; + vcpu->stat.instruction_diagnose_258++; if (vcpu->run->s.regs.gprs[rx] & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); @@ -146,18 +145,32 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) static int __diag_time_slice_end(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); - vcpu->stat.diagnose_44++; + vcpu->stat.instruction_diagnose_44++; kvm_vcpu_on_spin(vcpu, true); return 0; } +static int forward_cnt; +static unsigned long cur_slice; + +static int diag9c_forwarding_overrun(void) +{ + /* Reset the count on a new slice */ + if (time_after(jiffies, cur_slice)) { + cur_slice = jiffies; + forward_cnt = diag9c_forwarding_hz / HZ; + } + return forward_cnt-- <= 0 ? 1 : 0; +} + static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tcpu; + int tcpu_cpu; int tid; tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - vcpu->stat.diagnose_9c++; + vcpu->stat.instruction_diagnose_9c++; /* yield to self */ if (tid == vcpu->vcpu_id) @@ -168,9 +181,22 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) if (!tcpu) goto no_yield; - /* target already running */ - if (READ_ONCE(tcpu->cpu) >= 0) - goto no_yield; + /* target guest VCPU already running */ + tcpu_cpu = READ_ONCE(tcpu->cpu); + if (tcpu_cpu >= 0) { + if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) + goto no_yield; + + /* target host CPU already running */ + if (!vcpu_is_preempted(tcpu_cpu)) + goto no_yield; + smp_yield_cpu(tcpu_cpu); + VCPU_EVENT(vcpu, 5, + "diag time slice end directed to %d: yield forwarded", + tid); + vcpu->stat.diag_9c_forward++; + return 0; + } if (kvm_vcpu_yield_to(tcpu) <= 0) goto no_yield; @@ -179,7 +205,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) return 0; no_yield: VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid); - vcpu->stat.diagnose_9c_ignored++; + vcpu->stat.diag_9c_ignored++; return 0; } @@ -189,7 +215,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff; VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode); - vcpu->stat.diagnose_308++; + vcpu->stat.instruction_diagnose_308++; switch (subcode) { case 3: vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; @@ -201,6 +227,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) 
kvm_s390_vcpu_stop(vcpu); vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; @@ -217,7 +247,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) { int ret; - vcpu->stat.diagnose_500++; + vcpu->stat.instruction_diagnose_500++; /* No virtio-ccw notification? Get out quickly. */ if (!vcpu->kvm->arch.css_support || (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) @@ -271,7 +301,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) case 0x500: return __diag_virtio_hypercall(vcpu); default: - vcpu->stat.diagnose_other++; + vcpu->stat.instruction_diagnose_other++; return -EOPNOTSUPP; } } diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 07d30ffcfa41..5bfcc50c1a68 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -9,8 +9,9 @@ #include <linux/vmalloc.h> #include <linux/mm_types.h> #include <linux/err.h> - -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <linux/bitfield.h> +#include <asm/fault.h> #include <asm/gmap.h> #include "kvm-s390.h" #include "gaccess.h" @@ -261,77 +262,77 @@ struct aste { /* .. more fields there */ }; -int ipte_lock_held(struct kvm_vcpu *vcpu) +int ipte_lock_held(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) { + if (sclp.has_siif) { int rc; - read_lock(&vcpu->kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0; - read_unlock(&vcpu->kvm->arch.sca_lock); + read_lock(&kvm->arch.sca_lock); + rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + read_unlock(&kvm->arch.sca_lock); return rc; } - return vcpu->kvm->arch.ipte_lock_count != 0; + return kvm->arch.ipte_lock_count != 0; } -static void ipte_lock_simple(struct kvm_vcpu *vcpu) +static void ipte_lock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count++; - if (vcpu->kvm->arch.ipte_lock_count > 1) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count++; + if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.k) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_unlock_simple(struct kvm_vcpu *vcpu) +static void ipte_unlock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count--; - if (vcpu->kvm->arch.ipte_lock_count) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count--; + if (kvm->arch.ipte_lock_count) goto out; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); - wake_up(&vcpu->kvm->arch.ipte_wq); + read_unlock(&kvm->arch.sca_lock); + wake_up(&kvm->arch.ipte_wq); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_lock_siif(struct kvm_vcpu *vcpu) +static void ipte_lock_siif(struct kvm *kvm) { union ipte_control old, 
new, *ic; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.kg) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -339,15 +340,15 @@ retry: new.k = 1; new.kh++; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); } -static void ipte_unlock_siif(struct kvm_vcpu *vcpu) +static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; @@ -355,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu) if (!new.kh) new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); if (!new.kh) - wake_up(&vcpu->kvm->arch.ipte_wq); + wake_up(&kvm->arch.ipte_wq); } -void ipte_lock(struct kvm_vcpu *vcpu) +void ipte_lock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_lock_siif(vcpu); + if (sclp.has_siif) + ipte_lock_siif(kvm); else - ipte_lock_simple(vcpu); + ipte_lock_simple(kvm); } -void ipte_unlock(struct kvm_vcpu *vcpu) +void ipte_unlock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_unlock_siif(vcpu); + if (sclp.has_siif) + ipte_unlock_siif(kvm); else - ipte_unlock_simple(vcpu); + ipte_unlock_simple(kvm); } static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, @@ -465,61 +466,55 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, return 0; } -struct trans_exc_code_bits { - unsigned long addr : 52; /* Translation-exception Address */ - unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ - unsigned long : 2; - unsigned long b56 : 1; - unsigned long : 3; - unsigned long b60 : 1; - unsigned long b61 : 1; - unsigned long as : 2; /* ASCE Identifier */ -}; - -enum { - FSI_UNKNOWN = 0, /* Unknown wether fetch or store */ - FSI_STORE = 1, /* Exception was due to store operation */ - FSI_FETCH = 2 /* Exception was due to fetch operation */ -}; - enum prot_type { PROT_TYPE_LA = 0, PROT_TYPE_KEYC = 1, PROT_TYPE_ALC = 2, PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, + /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ + PROT_NONE, }; -static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, - u8 ar, enum gacc_mode mode, enum prot_type prot) +static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot, bool terminate) { struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - struct trans_exc_code_bits *tec; + union teid *teid; memset(pgm, 0, sizeof(*pgm)); pgm->code = code; - tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + teid = (union teid *)&pgm->trans_exc_code; switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_NONE: + /* We should never get here, acts like termination */ + WARN_ON_ONCE(1); + break; case PROT_TYPE_IEP: - tec->b61 = 1; - /* FALL THROUGH */ + teid->b61 = 1; + fallthrough; case PROT_TYPE_LA: - tec->b56 = 1; + teid->b56 = 1; break; case PROT_TYPE_KEYC: - tec->b60 = 1; + teid->b60 = 1; break; case PROT_TYPE_ALC: - tec->b60 = 1; - /* FALL THROUGH */ + teid->b60 = 1; + 
fallthrough; case PROT_TYPE_DAT: - tec->b61 = 1; + teid->b61 = 1; break; } - /* FALL THROUGH */ + if (terminate) { + teid->b56 = 0; + teid->b60 = 0; + teid->b61 = 0; + } + fallthrough; case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: case PGM_REGION_FIRST_TRANS: @@ -531,10 +526,10 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, * exc_access_id has to be set to 0 for some instructions. Both * cases have to be handled by the caller. */ - tec->addr = gva >> PAGE_SHIFT; - tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; - /* FALL THROUGH */ + teid->addr = gva >> PAGE_SHIFT; + teid->fsi = mode == GACC_STORE ? TEID_FSI_STORE : TEID_FSI_FETCH; + teid->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + fallthrough; case PGM_ALEN_TRANSLATION: case PGM_ALE_SEQUENCE: case PGM_ASTE_VALIDITY: @@ -551,6 +546,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, return code; } +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot) +{ + return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false); +} + static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, unsigned long ga, u8 ar, enum gacc_mode mode) { @@ -607,7 +608,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * Returns: - zero on success; @gpa contains the resulting absolute address * - a negative value if guest access failed due to e.g. broken * guest mapping - * - a positve value if an access exception happened. In this case + * - a positive value if an access exception happened. In this case * the returned value is the program interruption code as defined * by the architecture */ @@ -677,7 +678,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rfte.p; ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -695,7 +696,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rste.p; ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -723,7 +724,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rtte.fc0.p; ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -794,48 +795,270 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, - unsigned long *pages, unsigned long nr_pages, - const union asce asce, enum gacc_mode mode) +static int vm_check_access_key(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) +{ + u8 storage_key, access_control; + bool fetch_protected; + unsigned long hva; + int r; + + if (access_key == 0) + return 0; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + if (access_control == access_key) + return 0; + fetch_protected = storage_key & _PAGE_FP_BIT; + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + return 0; 
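+	/*
+	 * Key mismatch: a store is denied unconditionally here, a fetch is
+	 * denied when the page is fetch protected.
+	 */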
+ return PGM_PROTECTION; +} + +static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode, + union asce asce) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned long override; + + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* check if fetch protection override enabled */ + override = vcpu->arch.sie_block->gcr[0]; + override &= CR0_FETCH_PROTECTION_OVERRIDE; + /* not applicable if subject to DAT && private space */ + override = override && !(psw_bits(*psw).dat && asce.p); + return override; + } + return false; +} + +static bool fetch_prot_override_applies(unsigned long ga, unsigned int len) +{ + return ga < 2048 && ga + len <= 2048; +} + +static bool storage_prot_override_applicable(struct kvm_vcpu *vcpu) +{ + /* check if storage protection override enabled */ + return vcpu->arch.sie_block->gcr[0] & CR0_STORAGE_PROTECTION_OVERRIDE; +} + +static bool storage_prot_override_applies(u8 access_control) +{ + /* matches special storage protection override key (9) -> allow */ + return access_control == PAGE_SPO_ACC; +} + +static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) +{ + u8 storage_key, access_control; + unsigned long hva; + int r; + + /* access key 0 matches any storage key -> allow */ + if (access_key == 0) + return 0; + /* + * caller needs to ensure that gfn is accessible, so we can + * assume that this cannot fail + */ + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + /* access key matches storage key -> allow */ + if (access_control == access_key) + return 0; + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* it is a fetch and fetch protection is off -> allow */ + if (!(storage_key & _PAGE_FP_BIT)) + return 0; + if (fetch_prot_override_applicable(vcpu, mode, asce) && + fetch_prot_override_applies(ga, len)) + return 0; + } + if (storage_prot_override_applicable(vcpu) && + storage_prot_override_applies(access_control)) + return 0; + return PGM_PROTECTION; +} + +/** + * guest_range_to_gpas() - Calculate guest physical addresses of page fragments + * covering a logical range + * @vcpu: virtual cpu + * @ga: guest address, start of range + * @ar: access register + * @gpas: output argument, may be NULL + * @len: length of range in bytes + * @asce: address-space-control element to use for translation + * @mode: access mode + * @access_key: access key to match the range's storage keys against + * + * Translate a logical range to a series of guest absolute addresses, + * such that the concatenation of page fragments starting at each gpa make up + * the whole range. + * The translation is performed as if done by the cpu for the given @asce, @ar, + * @mode and state of the @vcpu. + * If the translation causes an exception, its program interruption code is + * returned and the &struct kvm_s390_pgm_info pgm member of @vcpu is modified + * such that a subsequent call to kvm_s390_inject_prog_vcpu() will inject + * a correct exception into the guest. + * The resulting gpas are stored into @gpas, unless it is NULL. + * + * Note: All fragments except the first one start at the beginning of a page. + * When deriving the boundaries of a fragment from a gpa, all but the last + * fragment end at the end of the page.
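+ *
+ * Example (illustrative, 4K pages): @ga = 0x10ff8, @len = 16 yields two
+ * fragments: 8 bytes covered by the gpa that 0x10ff8 translates to, and
+ * 8 bytes covered by the gpa that 0x11000 translates to.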
+ * + * Return: + * * 0 - success + * * <0 - translation could not be performed, for example if guest + * memory could not be accessed + * * >0 - an access exception occurred. In this case the returned value + * is the program interruption code and the contents of pgm may + * be used to inject an exception into the guest. + */ +static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + unsigned long *gpas, unsigned long len, + const union asce asce, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned int offset = offset_in_page(ga); + unsigned int fragment_len; int lap_enabled, rc = 0; enum prot_type prot; + unsigned long gpa; lap_enabled = low_address_protection_enabled(vcpu, asce); - while (nr_pages) { + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); ga = kvm_s390_logical_to_effective(vcpu, ga); if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); - ga &= PAGE_MASK; if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, pages, asce, mode, &prot); + rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); if (rc < 0) return rc; } else { - *pages = kvm_s390_real_to_abs(vcpu, ga); - if (kvm_is_error_gpa(vcpu->kvm, *pages)) + gpa = kvm_s390_real_to_abs(vcpu, ga); + if (kvm_is_error_gpa(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; + prot = PROT_NONE; + } } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); - ga += PAGE_SIZE; - pages++; - nr_pages--; + rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, + fragment_len); + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); + if (gpas) + *gpas++ = gpa; + offset = 0; + ga += fragment_len; + len -= fragment_len; + } + return 0; +} + +static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) +{ + const unsigned int offset = offset_in_page(gpa); + const gfn_t gfn = gpa_to_gfn(gpa); + int rc; + + if (mode == GACC_STORE) + rc = kvm_write_guest_page(kvm, gfn, data, offset, len); + else + rc = kvm_read_guest_page(kvm, gfn, data, offset, len); + return rc; +} + +static int +access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 access_key) +{ + struct kvm_memory_slot *slot; + bool writable; + gfn_t gfn; + hva_t hva; + int rc; + + gfn = gpa >> PAGE_SHIFT; + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a ro memslot, even tho that can't occur (they're unsupported). + * Don't try to actually handle that case. 
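+	 * Returning -EOPNOTSUPP (negative, not a PGM code) makes the caller
+	 * hand the error up rather than inject a program exception into the
+	 * guest.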
+ */ + if (!writable && mode == GACC_STORE) + return -EOPNOTSUPP; + hva += offset_in_page(gpa); + if (mode == GACC_STORE) + rc = copy_to_user_key((void __user *)hva, data, len, access_key); + else + rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + if (rc) + return PGM_PROTECTION; + if (mode == GACC_STORE) + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key) +{ + int offset = offset_in_page(gpa); + int fragment_len; + int rc; + + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); + rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + if (rc) + return rc; + offset = 0; + len -= fragment_len; + data += fragment_len; + gpa += fragment_len; } return 0; } -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode) +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; - unsigned long _len, nr_pages, gpa, idx; - unsigned long pages_array[2]; - unsigned long *pages; + unsigned long nr_pages, idx; + unsigned long gpa_array[2]; + unsigned int fragment_len; + unsigned long *gpas; + enum prot_type prot; int need_ipte_lock; union asce asce; + bool try_storage_prot_override; + bool try_fetch_prot_override; int rc; if (!len) @@ -845,55 +1068,199 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, if (rc) return rc; nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1; - pages = pages_array; - if (nr_pages > ARRAY_SIZE(pages_array)) - pages = vmalloc(array_size(nr_pages, sizeof(unsigned long))); - if (!pages) + gpas = gpa_array; + if (nr_pages > ARRAY_SIZE(gpa_array)) + gpas = vmalloc(array_size(nr_pages, sizeof(unsigned long))); + if (!gpas) return -ENOMEM; + try_fetch_prot_override = fetch_prot_override_applicable(vcpu, mode, asce); + try_storage_prot_override = storage_prot_override_applicable(vcpu); need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) - ipte_lock(vcpu); - rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); - for (idx = 0; idx < nr_pages && !rc; idx++) { - gpa = *(pages + idx) + (ga & ~PAGE_MASK); - _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); - if (mode == GACC_STORE) - rc = kvm_write_guest(vcpu->kvm, gpa, data, _len); + ipte_lock(vcpu->kvm); + /* + * Since we do the access further down ultimately via a move instruction + * that does key checking and returns an error in case of a protection + * violation, we don't need to do the check during address translation. + * Skip it by passing access key 0, which matches any storage key, + * obviating the need for any further checks. As a result the check is + * handled entirely in hardware on access, we only need to take care to + * forego key protection checking if fetch protection override applies or + * retry with the special key 9 in case of storage protection override. 
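+	 *
+	 * In short: translate with access key 0, copy with the caller's key
+	 * (or keyless when fetch protection override applies), and retry a
+	 * PGM_PROTECTION fault with PAGE_SPO_ACC (key 9) when storage
+	 * protection override is in effect.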
+ */ + rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode, 0); + if (rc) + goto out_unlock; + for (idx = 0; idx < nr_pages; idx++) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); + if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { + rc = access_guest_page(vcpu->kvm, mode, gpas[idx], + data, fragment_len); + } else { + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); + } + if (rc == PGM_PROTECTION && try_storage_prot_override) + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); + if (rc) + break; + len -= fragment_len; + data += fragment_len; + ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len); + } + if (rc > 0) { + bool terminate = (mode == GACC_STORE) && (idx > 0); + + if (rc == PGM_PROTECTION) + prot = PROT_TYPE_KEYC; else - rc = kvm_read_guest(vcpu->kvm, gpa, data, _len); - len -= _len; - ga += _len; - data += _len; + prot = PROT_NONE; + rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } +out_unlock: if (need_ipte_lock) - ipte_unlock(vcpu); - if (nr_pages > ARRAY_SIZE(pages_array)) - vfree(pages); + ipte_unlock(vcpu->kvm); + if (nr_pages > ARRAY_SIZE(gpa_array)) + vfree(gpas); return rc; } int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode) { - unsigned long _len, gpa; + unsigned int fragment_len; + unsigned long gpa; int rc = 0; while (len && !rc) { gpa = kvm_s390_real_to_abs(vcpu, gra); - _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); - if (mode) - rc = write_guest_abs(vcpu, gpa, data, _len); - else - rc = read_guest_abs(vcpu, gpa, data, _len); - len -= _len; - gra += _len; - data += _len; + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); + rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + len -= fragment_len; + gra += fragment_len; + data += fragment_len; } return rc; } /** - * guest_translate_address - translate guest logical into guest absolute address + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. + * @kvm: Virtual machine instance. + * @gpa: Absolute guest address of the location to be changed. + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a + * non power of two will result in failure. + * @old_addr: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old_addr contains the value at @gpa before the attempt to + * exchange the value. + * @new: The value to place at @gpa. + * @access_key: The access key to use for the guest access. + * @success: output value indicating if an exchange occurred. + * + * Atomically exchange the value at @gpa by @new, if it contains *@old. + * Honors storage keys. 
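+ *
+ * A sketch of a compare-and-swap loop on top of this helper; compute_next()
+ * stands in for whatever the caller does to derive the new value and is not
+ * part of this interface (error handling and the initial read of old elided):
+ *
+ *	do {
+ *		new = compute_next(old);
+ *		ret = cmpxchg_guest_abs_with_key(kvm, gpa, 8, &old, new,
+ *						 access_key, &success);
+ *	} while (!ret && !success);
+ *
+ * No reread is needed between iterations, since a failed compare leaves the
+ * current guest value in *@old_addr.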
+ * + * Return: * 0: successful exchange + * * >0: a program interruption code indicating the reason cmpxchg could + * not be attempted + * * -EINVAL: address misaligned or len not power of two + * * -EAGAIN: transient failure (len 1 or 2) + * * -EOPNOTSUPP: read-only memslot (should never occur) + */ +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, + __uint128_t *old_addr, __uint128_t new, + u8 access_key, bool *success) +{ + gfn_t gfn = gpa_to_gfn(gpa); + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + bool writable; + hva_t hva; + int ret; + + if (!IS_ALIGNED(gpa, len)) + return -EINVAL; + + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a read-only memslot, even though that cannot occur + * since those are unsupported. + * Don't try to actually handle that case. + */ + if (!writable) + return -EOPNOTSUPP; + + hva += offset_in_page(gpa); + /* + * The cmpxchg_user_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (len) { + case 1: { + u8 old; + + ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 2: { + u16 old; + + ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 4: { + u32 old; + + ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 8: { + u64 old; + + ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 16: { + __uint128_t old; + + ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + default: + return -EINVAL; + } + if (*success) + mark_page_dirty_in_slot(kvm, slot, gfn); + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (ret == -EFAULT) + ret = PGM_PROTECTION; + return ret; +} + +/** + * guest_translate_address_with_key - translate guest logical into guest absolute address + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @gpa: Guest physical address + * @mode: Translation access mode + * @access_key: access key to match the storage key with * * Parameter semantics are the same as the ones from guest_translate. * The memory contents at the guest address are not changed. @@ -901,11 +1268,10 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * Note: The IPTE lock is not taken during this function, so the caller * has to take care of this.
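 * Callers typically bracket the translation with ipte_lock(vcpu->kvm) and
 * ipte_unlock(vcpu->kvm); check_gva_range() below shows that pattern.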
*/ -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long *gpa, enum gacc_mode mode) +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - enum prot_type prot; union asce asce; int rc; @@ -913,49 +1279,62 @@ int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; - if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) { - if (mode == GACC_STORE) - return trans_exc(vcpu, PGM_PROTECTION, gva, 0, - mode, PROT_TYPE_LA); - } + return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode, + access_key); +} - if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */ - rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot); - if (rc > 0) - return trans_exc(vcpu, rc, gva, 0, mode, prot); - } else { - *gpa = kvm_s390_real_to_abs(vcpu, gva); - if (kvm_is_error_gpa(vcpu->kvm, *gpa)) - return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0); - } +/** + * check_gva_range - test a range of guest virtual addresses for accessibility + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @length: Length of test range + * @mode: Translation access mode + * @access_key: access key to match the storage keys with + */ +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long length, enum gacc_mode mode, u8 access_key) +{ + union asce asce; + int rc = 0; + + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); + if (rc) + return rc; + ipte_lock(vcpu->kvm); + rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, + access_key); + ipte_unlock(vcpu->kvm); return rc; } /** - * check_gva_range - test a range of guest virtual addresses for accessibility + * check_gpa_range - test a range of guest physical addresses for accessibility + * @kvm: virtual machine instance + * @gpa: guest physical address + * @length: length of test range + * @mode: access mode to test, relevant for storage keys + * @access_key: access key to match the storage keys with */ -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode) +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key) { - unsigned long gpa; - unsigned long currlen; + unsigned int fragment_len; int rc = 0; - ipte_lock(vcpu); - while (length > 0 && !rc) { - currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE)); - rc = guest_translate_address(vcpu, gva, ar, &gpa, mode); - gva += currlen; - length -= currlen; + while (length && !rc) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); + rc = vm_check_access_key(kvm, access_key, mode, gpa); + length -= fragment_len; + gpa += fragment_len; } - ipte_unlock(vcpu); - return rc; } /** * kvm_s390_check_low_addr_prot_real - check for low-address protection + * @vcpu: virtual cpu * @gra: Guest real address * * Checks whether an address is subject to low-address protection and set @@ -976,13 +1355,17 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * kvm_s390_shadow_tables - walk the guest page table and create shadow tables * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the page table address result + * @pgt: pointer to the beginning of the page table for the
given address if + * successful (return value 0), or to the first invalid DAT entry in + * case of exceptions (return value > 0) + * @dat_protection: referenced memory is write protected * @fake: pgt references contiguous guest memory block, not a pgtable */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, int *fake) { + struct kvm *kvm; struct gmap *parent; union asce asce; union vaddress vaddr; @@ -991,6 +1374,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, *fake = 0; *dat_protection = 0; + kvm = sg->private; parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; @@ -1034,6 +1418,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rfte.val = ptr; goto shadow_r2t; } + *pgt = ptr + vaddr.rfx * 8; rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); if (rc) return rc; @@ -1050,7 +1435,9 @@ shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r1_entry++; + } + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -1059,6 +1446,7 @@ shadow_r2t: rste.val = ptr; goto shadow_r3t; } + *pgt = ptr + vaddr.rsx * 8; rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); if (rc) return rc; @@ -1076,7 +1464,9 @@ shadow_r3t: rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r2_entry++; + } + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -1085,6 +1475,7 @@ shadow_r3t: rtte.val = ptr; goto shadow_sgt; } + *pgt = ptr + vaddr.rtx * 8; rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); if (rc) return rc; @@ -1111,7 +1502,9 @@ shadow_sgt: rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r3_entry++; + } + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -1120,6 +1513,7 @@ shadow_sgt: ste.val = ptr; goto shadow_pgt; } + *pgt = ptr + vaddr.sx * 8; rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); if (rc) return rc; @@ -1142,6 +1536,7 @@ shadow_pgt: rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_sg_entry++; } } /* Return the parent address of the page table */ @@ -1154,6 +1549,8 @@ shadow_pgt: * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap + * @datptr: will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags * * Returns: - 0 if the shadow fault was successfully resolved * - > 0 (pgm exception code) on exceptions while faulting @@ -1162,21 +1559,21 @@ shadow_pgt: * - -ENOMEM if out of memory */ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr) + unsigned long saddr, unsigned long *datptr) { union vaddress vaddr; union page_table_entry pte; - unsigned long pgt; + unsigned long pgt = 0; int dat_protection, fake; int rc; - down_read(&sg->mm->mmap_sem); + mmap_read_lock(sg->mm); /* * We don't want any guest-2 tables to change - so the parent * tables/pointers we read stay valid - unshadowing is however * always possible - only guest_table_lock protects us. 
*/ - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) @@ -1188,8 +1585,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } - if (!rc) - rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + + switch (rc) { + case PGM_SEGMENT_TRANSLATION: + case PGM_REGION_THIRD_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_FIRST_TRANS: + pgt |= PEI_NOT_PTE; + break; + case 0: + pgt += vaddr.px * 8; + rc = gmap_read_table(sg->parent, pgt, &pte.val); + } + if (datptr) + *datptr = pgt | dat_protection * PEI_DAT_PROT; if (!rc && pte.i) rc = PGM_PAGE_TRANSLATION; if (!rc && pte.z) @@ -1198,7 +1607,8 @@ shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - ipte_unlock(vcpu); - up_read(&sg->mm->mmap_sem); + vcpu->kvm->stat.gmap_shadow_pg_entry++; + ipte_unlock(vcpu->kvm); + mmap_read_unlock(sg->mm); return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index f4c51756c462..b320d12aa049 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,17 +18,14 @@ /** * kvm_s390_real_to_abs - convert guest real address to guest absolute address - * @vcpu - guest virtual cpu + * @prefix - guest prefix * @gra - guest real address * * Returns the guest absolute address that corresponds to the passed guest real - * address @gra of a virtual guest cpu by applying its prefix. + * address @gra by applying the given prefix. */ -static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, - unsigned long gra) +static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra) { - unsigned long prefix = kvm_s390_get_prefix(vcpu); - if (gra < 2 * PAGE_SIZE) gra += prefix; else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE) @@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, } /** + * kvm_s390_real_to_abs - convert guest real address to guest absolute address + * @vcpu - guest virtual cpu + * @gra - guest real address + * + * Returns the guest absolute address that corresponds to the passed guest real + * address @gra of a virtual guest cpu by applying its prefix. + */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gra) +{ + return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra); +} + +/** + * _kvm_s390_logical_to_effective - convert guest logical to effective address + * @psw: psw of the guest + * @ga: guest logical address + * + * Convert a guest logical address to an effective address by applying the + * rules of the addressing mode defined by bits 31 and 32 of the given PSW + * (extended/basic addressing mode). + * + * Depending on the addressing mode, the upper 40 bits (24 bit addressing + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing + * mode) of @ga will be zeroed and the remaining bits will be returned.
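+ *
+ * For example, in 24-bit mode 0x0000000012345678 becomes 0x345678 and in
+ * 31-bit mode 0xfffffffff2345678 becomes 0x72345678, while in 64-bit mode
+ * @ga is returned unchanged.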
+ */ +static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw, + unsigned long ga) +{ + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) + return ga; + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) + return ga & ((1UL << 31) - 1); + return ga & ((1UL << 24) - 1); +} + +/** * kvm_s390_logical_to_effective - convert guest logical to effective address * @vcpu: guest virtual cpu * @ga: guest logical address @@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, unsigned long ga) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) - return ga; - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) - return ga & ((1UL << 31) - 1); - return ga & ((1UL << 24) - 1); + return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga); } /* @@ -158,24 +186,37 @@ enum gacc_mode { GACC_IFETCH, }; -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, - u8 ar, unsigned long *gpa, enum gacc_mode mode); +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key); + int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode); + unsigned long length, enum gacc_mode mode, u8 access_key); + +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key); + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key); -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode); +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key); int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, + __uint128_t new, u8 access_key, bool *success); + /** - * write_guest - copy data from kernel space to guest space + * write_guest_with_key - copy data from kernel space to guest space * @vcpu: virtual cpu * @ga: guest address * @ar: access register * @data: source address in kernel space * @len: number of bytes to copy + * @access_key: access key the storage key needs to match * * Copy @len bytes from @data (kernel space) to @ga (guest address). * In order to copy data to guest space the PSW of the vcpu is inspected: @@ -186,8 +227,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * The addressing mode of the PSW is also inspected, so that address wrap * around is taken into account for 24-, 31- and 64-bit addressing mode, * if the to be copied data crosses page boundaries in guest address space. - * In addition also low address and DAT protection are inspected before - * copying any data (key protection is currently not implemented). + * In addition low address, DAT and key protection checks are performed before + * copying any data. * * This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu. * In case of an access exception (e.g. protection exception) pgm will contain @@ -215,10 +256,53 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * if data has been changed in guest space in case of an exception. 
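 *
 * An illustrative call that stores an 8 byte value at the logical address
 * in gpr 2, with storage keys matched against access key 5:
 *
 *	u64 val = 42;
 *	int rc;
 *
 *	rc = write_guest_with_key(vcpu, vcpu->run->s.regs.gprs[2], 0, &val,
 *				  sizeof(val), 5);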
*/ static inline __must_check +int write_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_STORE, + access_key); +} + +/** + * write_guest - copy data from kernel space to guest space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: source address in kernel space + * @len: number of bytes to copy + * + * The behaviour of write_guest is identical to write_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. + */ +static inline __must_check int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_STORE); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return write_guest_with_key(vcpu, ga, ar, data, len, access_key); +} + +/** + * read_guest_with_key - copy data from guest space to kernel space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: destination address in kernel space + * @len: number of bytes to copy + * @access_key: access key the storage key needs to match + * + * Copy @len bytes from @ga (guest address) to @data (kernel space). + * + * The behaviour of read_guest_with_key is identical to write_guest_with_key, + * except that data will be copied from guest space to kernel space. + */ +static inline __must_check +int read_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_FETCH, + access_key); } /** @@ -231,14 +315,16 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, * * Copy @len bytes from @ga (guest address) to @data (kernel space). * - * The behaviour of read_guest is identical to write_guest, except that - * data will be copied from guest space to kernel space. + * The behaviour of read_guest is identical to read_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. 
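
All of these accessors eventually feed guest real addresses through the prefixing rule of _kvm_s390_real_to_abs() near the top of this header. That rule also runs fine in isolation; a user-space sketch with a local page-size constant, not kernel API:

#include <stdio.h>

#define EX_PAGE_SIZE 0x1000UL

/* The first two 4K pages of real storage swap places with the two pages
 * at the prefix; everything else passes through unchanged. */
static unsigned long ex_real_to_abs(unsigned long prefix, unsigned long gra)
{
	if (gra < 2 * EX_PAGE_SIZE)
		gra += prefix;
	else if (gra >= prefix && gra < prefix + 2 * EX_PAGE_SIZE)
		gra -= prefix;
	return gra;
}

int main(void)
{
	unsigned long prefix = 0x20000;

	printf("%#lx\n", ex_real_to_abs(prefix, 0x0));     /* 0x20000 */
	printf("%#lx\n", ex_real_to_abs(prefix, 0x20000)); /* 0 */
	printf("%#lx\n", ex_real_to_abs(prefix, 0x50000)); /* 0x50000 */
	return 0;
}
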
*/ static inline __must_check int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return read_guest_with_key(vcpu, ga, ar, data, len, access_key); } /** @@ -259,7 +345,10 @@ static inline __must_check int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, unsigned long len) { - return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return access_guest_with_key(vcpu, ga, 0, data, len, GACC_IFETCH, + access_key); } /** @@ -354,12 +443,16 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, return access_guest_real(vcpu, gra, data, len, 0); } -void ipte_lock(struct kvm_vcpu *vcpu); -void ipte_unlock(struct kvm_vcpu *vcpu); -int ipte_lock_held(struct kvm_vcpu *vcpu); +void ipte_lock(struct kvm *kvm); +void ipte_unlock(struct kvm *kvm); +int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +/* MVPG PEI indication bits */ +#define PEI_DAT_PROT 2 +#define PEI_NOT_PTE 4 + int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr); + unsigned long saddr, unsigned long *datptr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 394a5f53805b..80879fc73c90 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -184,7 +184,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu, if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE) return -EINVAL; - wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL); + wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL_ACCOUNT); if (!wp_info->old_data) return -ENOMEM; /* try to backup the original value */ @@ -213,8 +213,8 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - bp_data = memdup_user(dbg->arch.hw_bp, - sizeof(*bp_data) * dbg->arch.nr_hw_bp); + bp_data = memdup_array_user(dbg->arch.hw_bp, dbg->arch.nr_hw_bp, + sizeof(*bp_data)); if (IS_ERR(bp_data)) return PTR_ERR(bp_data); @@ -234,7 +234,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, if (nr_wp > 0) { wp_info = kmalloc_array(nr_wp, sizeof(*wp_info), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!wp_info) { ret = -ENOMEM; goto error; @@ -243,7 +243,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, if (nr_bp > 0) { bp_info = kmalloc_array(nr_bp, sizeof(*bp_info), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!bp_info) { ret = -ENOMEM; goto error; @@ -349,7 +349,7 @@ static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu) if (!wp_info || !wp_info->old_data || wp_info->len <= 0) continue; - temp = kmalloc(wp_info->len, GFP_KERNEL); + temp = kmalloc(wp_info->len, GFP_KERNEL_ACCOUNT); if (!temp) continue; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a389fa85cca2..b16352083ff9 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -2,7 +2,7 @@ /* * in-kernel handling for sie intercepts * - * Copyright IBM Corp. 2008, 2014 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -12,10 +12,10 @@ #include <linux/errno.h> #include <linux/pagemap.h> -#include <asm/kvm_host.h> #include <asm/asm-offsets.h> #include <asm/irq.h> #include <asm/sysinfo.h> +#include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" @@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu) return rc; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); return -EOPNOTSUPP; @@ -213,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu) return 0; if (current->thread.per_flags & PER_FLAG_NO_TE) return 0; - itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba; + itdb = phys_to_virt(vcpu->arch.sie_block->itdba); rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb)); if (rc) return rc; @@ -224,6 +228,21 @@ static int handle_itdb(struct kvm_vcpu *vcpu) #define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER) +static bool should_handle_per_event(const struct kvm_vcpu *vcpu) +{ + if (!guestdbg_enabled(vcpu) || !per_event(vcpu)) + return false; + if (guestdbg_sstep_enabled(vcpu) && + vcpu->arch.sie_block->iprcc != PGM_PER) { + /* + * __vcpu_run() will exit after delivering the concurrently + * indicated condition. + */ + return false; + } + return true; +} + static int handle_prog(struct kvm_vcpu *vcpu) { psw_t psw; @@ -231,7 +250,14 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; - if (guestdbg_enabled(vcpu) && per_event(vcpu)) { + /* + * Intercept 8 indicates a loop of specification exceptions + * for protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EOPNOTSUPP; + + if (should_handle_per_event(vcpu)) { rc = kvm_s390_handle_per_event(vcpu); if (rc) return rc; @@ -258,11 +284,20 @@ static int handle_prog(struct kvm_vcpu *vcpu) /** * handle_external_interrupt - used for external interruption interceptions + * @vcpu: virtual cpu + * + * This interception occurs if: + * - the CPUSTAT_EXT_INT bit was already set when the external interrupt + * occurred. In this case, the interrupt needs to be injected manually to + * preserve interrupt priority. + * - the external new PSW has external interrupts enabled, which will cause an + * interruption loop. We drop to userspace in this case. + * + * The latter case can be detected by inspecting the external mask bit in the + * external new psw. * - * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if - * the new PSW does not have external interrupts disabled. In the first case, - * we've got to deliver the interrupt manually, and in the second case, we - * drop to userspace to handle the situation there. + * Under PV, only the latter case can occur, since interrupt priorities are + * handled in the ultravisor. 
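
The two bullet points above reduce to a single predicate over the external interrupt code and the new PSW mask. Restated as stand-alone C, with placeholder constants instead of the kernel's EXT_IRQ_* and PSW_MASK_EXT definitions:

#include <stdbool.h>
#include <stdint.h>

#define EX_IRQ_CLK_COMP  0x1004        /* placeholder value */
#define EX_IRQ_CPU_TIMER 0x1005        /* placeholder value */
#define EX_PSW_MASK_EXT  (1ULL << 56)  /* placeholder bit position */

/* True when delivering the interrupt would immediately re-raise it,
 * i.e. the situation handle_external_interrupt() punts to userspace. */
static bool ex_would_loop(uint16_t eic, uint64_t new_psw_mask)
{
	return (eic == EX_IRQ_CLK_COMP || eic == EX_IRQ_CPU_TIMER) &&
	       (new_psw_mask & EX_PSW_MASK_EXT);
}

int main(void)
{
	return ex_would_loop(EX_IRQ_CLK_COMP, EX_PSW_MASK_EXT) ? 0 : 1;
}
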
*/ static int handle_external_interrupt(struct kvm_vcpu *vcpu) { @@ -273,10 +308,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) vcpu->stat.exit_external_interrupt++; - rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); - if (rc) - return rc; - /* We can not handle clock comparator or timer interrupt with bad PSW */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + newpsw = vcpu->arch.sie_block->gpsw; + } else { + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + } + + /* + * Clock comparator or timer interrupt with external interrupt enabled + * will cause interrupt loop. Drop to userspace. + */ if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && (newpsw.mask & PSW_MASK_EXT)) return -EOPNOTSUPP; @@ -304,7 +347,8 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) } /** - * Handle MOVE PAGE partial execution interception. + * handle_mvpg_pei - Handle MOVE PAGE partial execution interception. + * @vcpu: virtual cpu * * This interception can only happen for guests with DAT disabled and * addresses that are currently not mapped in the host. Thus we try to @@ -318,18 +362,18 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); - /* Make sure that the source is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2], - reg2, &srcaddr, GACC_FETCH); + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg2], + reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); if (rc != 0) return rc; - /* Make sure that the destination is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1], - reg1, &dstaddr, GACC_STORE); + /* Ensure that the destination is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg1], + reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); @@ -360,8 +404,8 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) */ int handle_sthyi(struct kvm_vcpu *vcpu) { - int reg1, reg2, r = 0; - u64 code, addr, cc = 0, rc = 0; + int reg1, reg2, cc = 0, r = 0; + u64 code, addr, rc = 0; struct sthyi_sctns *sctns = NULL; if (!test_kvm_facility(vcpu->kvm, 74)) @@ -384,21 +428,28 @@ int handle_sthyi(struct kvm_vcpu *vcpu) goto out; } - if (addr & ~PAGE_MASK) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - sctns = (void *)get_zeroed_page(GFP_KERNEL); + sctns = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sctns) return -ENOMEM; cc = sthyi_fill(sctns, &rc); - + if (cc < 0) { + free_page((unsigned long)sctns); + return cc; + } out: if (!cc) { - r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); - if (r) { - free_page((unsigned long)sctns); - return kvm_s390_inject_prog_cond(vcpu, r); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE); + } else { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } } } @@ -444,6 +495,110 @@ static int handle_operexc(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } +static int handle_pv_spx(struct kvm_vcpu *vcpu) +{ +
u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block); + + kvm_s390_set_prefix(vcpu, pref); + trace_kvm_s390_handle_prefix(vcpu, 1, pref); + return 0; +} + +static int handle_pv_sclp(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + + spin_lock(&fi->lock); + /* + * 2 cases: + * a: an sccb answering interrupt was already pending or in flight. + * As the sccb value is not known we can simply set some value to + * trigger delivery of a saved SCCB. UV will then use its saved + * copy of the SCCB value. + * b: an error SCCB interrupt needs to be injected so we also inject + * a fake SCCB address. Firmware will use the proper one. + * This makes sure, that both errors and real sccb returns will only + * be delivered after a notification intercept (instruction has + * finished) but not after others. + */ + fi->srv_signal.ext_params |= 0x43000; + set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); + spin_unlock(&fi->lock); + return 0; +} + +static int handle_pv_uvc(struct kvm_vcpu *vcpu) +{ + struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block); + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm), + .gaddr = guest_uvcb->paddr, + }; + int rc; + + if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) { + WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n", + guest_uvcb->header.cmd); + return 0; + } + rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + /* + * If the unpin did not succeed, the guest will exit again for the UVC + * and we will retry the unpin. + */ + if (rc == -EINVAL) + return 0; + /* + * If we got -EAGAIN here, we simply return it. It will eventually + * get propagated all the way to userspace, which should then try + * again. + */ + return rc; +} + +static int handle_pv_notification(struct kvm_vcpu *vcpu) +{ + int ret; + + if (vcpu->arch.sie_block->ipa == 0xb210) + return handle_pv_spx(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb220) + return handle_pv_sclp(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb9a4) + return handle_pv_uvc(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) { + /* + * Besides external call, other SIGP orders also cause a + * 108 (pv notify) intercept. In contrast to external call, + * these orders need to be emulated and hence the appropriate + * place to handle them is in handle_instruction(). + * So first try kvm_s390_handle_sigp_pei() and if that isn't + * successful, go on with handle_instruction(). + */ + ret = kvm_s390_handle_sigp_pei(vcpu); + if (!ret) + return ret; + } + + return handle_instruction(vcpu); +} + +static bool should_handle_per_ifetch(const struct kvm_vcpu *vcpu, int rc) +{ + /* Process PER, also if the instruction is processed in user space. */ + if (!(vcpu->arch.sie_block->icptstatus & 0x02)) + return false; + if (rc != 0 && rc != -EOPNOTSUPP) + return false; + if (guestdbg_sstep_enabled(vcpu) && vcpu->arch.local_int.pending_irqs) + /* __vcpu_run() will exit after delivering the interrupt. */ + return false; + return true; +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { int rc, per_rc = 0; @@ -478,15 +633,35 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) rc = handle_partial_execution(vcpu); break; case ICPT_KSS: - rc = kvm_s390_skey_check_enable(vcpu); + /* Instruction will be redriven, skip the PER check. 
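
handle_pv_notification() above is a dispatch on the intercepted instruction's IPA. A table-driven restatement, runnable stand-alone; the handlers here are stand-ins for the kernel functions, and the opcode mnemonics in the comments are the conventional ones (SET PREFIX, SERVICE CALL, the UV-call instruction, SIGP):

#include <stdio.h>

static int ex_pv_spx(void)  { return 0; }
static int ex_pv_sclp(void) { return 0; }
static int ex_pv_uvc(void)  { return 0; }

struct ex_pv_dispatch {
	unsigned short ipa;
	int (*handler)(void);
};

static const struct ex_pv_dispatch ex_table[] = {
	{ 0xb210, ex_pv_spx },   /* SET PREFIX */
	{ 0xb220, ex_pv_sclp },  /* SERVICE CALL */
	{ 0xb9a4, ex_pv_uvc },   /* UV call */
};

static int ex_dispatch(unsigned short ipa)
{
	unsigned int i;

	if ((ipa >> 8) == 0xae)  /* SIGP: try the external-call fast path first */
		return 0;
	for (i = 0; i < sizeof(ex_table) / sizeof(ex_table[0]); i++)
		if (ex_table[i].ipa == ipa)
			return ex_table[i].handler();
	return -1;               /* fall back to full instruction emulation */
}

int main(void)
{
	printf("%d\n", ex_dispatch(0xb220)); /* 0 */
	return 0;
}
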
*/ + return kvm_s390_skey_check_enable(vcpu); + case ICPT_MCHKREQ: + case ICPT_INT_ENABLE: + /* + * PSW bit 13 or a CR (0, 6, 14) changed and we might + * now be able to deliver interrupts. The pre-run code + * will take care of this. + */ + rc = 0; + break; + case ICPT_PV_INSTR: + rc = handle_instruction(vcpu); + break; + case ICPT_PV_NOTIFY: + rc = handle_pv_notification(vcpu); + break; + case ICPT_PV_PREF: + rc = 0; + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu)); + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu) + PAGE_SIZE); break; default: return -EOPNOTSUPP; } - /* process PER, also if the instrution is processed in user space */ - if (vcpu->arch.sie_block->icptstatus & 0x02 && - (!rc || rc == -EOPNOTSUPP)) + if (should_handle_per_ifetch(vcpu, rc)) per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 165dea4c7f19..fc4007cc067a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2,7 +2,7 @@ /* * handling kvm guest interrupts * - * Copyright IBM Corp. 2008, 2015 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> */ @@ -28,9 +28,11 @@ #include <asm/switch_to.h> #include <asm/nmi.h> #include <asm/airq.h> +#include <asm/tpi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" +#include "pci.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -81,8 +83,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union esca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -93,8 +96,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union bsca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -124,16 +128,18 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl old = *sigp_ctrl; + union esca_sigp_ctrl old; + old = READ_ONCE(*sigp_ctrl); expect = old.value; rc = cmpxchg(&sigp_ctrl->value, old.value, 0); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl old = *sigp_ctrl; + union bsca_sigp_ctrl old; + old = READ_ONCE(*sigp_ctrl); expect = old.value; rc = cmpxchg(&sigp_ctrl->value, old.value, 0); } @@ -297,11 +303,6 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) return 0; } -static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) -{ - return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; -} - static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -312,11 +313,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) return READ_ONCE(gisa->ipm); } -static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) -{ - clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned 
long *) gisa); -} - static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -324,8 +320,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs | - vcpu->arch.local_int.pending_irqs; + unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; + + pending &= ~vcpu->kvm->arch.float_int.masked_irqs; + return pending; } static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) @@ -383,10 +382,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask); if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK)) __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask); - if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) + if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) { __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); + __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask); + } if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* PV guest cpus can have a single interruption injected at a time. */ + if (kvm_s390_pv_cpu_get_handle(vcpu) && + vcpu->arch.sie_block->iictl != IICTL_CODE_NONE) + active_mask &= ~(IRQ_PEND_EXT_II_MASK | + IRQ_PEND_IO_MASK | + IRQ_PEND_MCHK_MASK); /* * Check both floating and local interrupt's cr14 because * bit IRQ_PEND_MCHK_REP could be set in both cases. @@ -408,13 +415,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -479,19 +486,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu) static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_cputm++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, - (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); return rc ? 
-EFAULT : 0; } @@ -499,19 +510,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_ckc++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, - (u16 __user *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, + (u16 __user *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -553,6 +568,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, union mci mci; int rc; + /* + * All other possible payload for a machine check (e.g. the register + * contents in the save area) will be handled by the ultravisor, as + * the hypervisor does not have the needed information for + * protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK; + vcpu->arch.sie_block->mcic = mchk->mcic; + vcpu->arch.sie_block->faddr = mchk->failing_storage_address; + vcpu->arch.sie_block->edc = mchk->ext_damage_code; + return 0; + } + mci.val = mchk->mcic; /* take care of lazy register loading */ save_fpu_regs(); @@ -610,7 +639,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); /* Register-save areas */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); } else { @@ -669,7 +698,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) /* * We indicate floating repressible conditions along with * other pending conditions. Channel Report Pending and Channel - * Subsystem damage are the only two and and are indicated by + * Subsystem damage are the only two and are indicated by * bits in mcic and masked in cr14.
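
The same two-path shape recurs in every delivery routine touched by this patch: for protected guests the interrupt is described in SIE injection fields for the ultravisor to deliver, everyone else gets the classic lowcore PSW swap. A schematic only, with stand-in names for the kernel fields and an illustrative (not architected) encoding:

#include <stdbool.h>

enum { EX_IICTL_EXT = 1 };  /* illustrative, not the architected encoding */

struct ex_vcpu {
	bool protected_guest;
	unsigned int sie_iictl;  /* injection request, consumed by firmware */
	unsigned int sie_eic;    /* external-interrupt code */
};

static int ex_deliver_ext(struct ex_vcpu *v, unsigned int eic)
{
	if (v->protected_guest) {
		/* only request the injection; the ultravisor swaps the PSWs */
		v->sie_iictl = EX_IICTL_EXT;
		v->sie_eic = eic;
		return 0;
	}
	/*
	 * Non-protected path: store the old PSW and load the new one from
	 * guest lowcore, which is what the put_guest_lc()/write_guest_lc()/
	 * read_guest_lc() sequences above do.
	 */
	return 0;
}

int main(void)
{
	struct ex_vcpu v = { .protected_guest = true };

	return ex_deliver_ext(&v, 0x1004 /* illustrative code */);
}
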
*/ if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { @@ -696,17 +725,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); - rc = write_guest_lc(vcpu, - offsetof(struct lowcore, restart_old_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART; + } else { + rc = write_guest_lc(vcpu, + offsetof(struct lowcore, restart_old_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_RESTART, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -748,6 +781,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, cpu_addr, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG; + vcpu->arch.sie_block->extcpuaddr = cpu_addr; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG, (u16 *)__LC_EXT_INT_CODE); @@ -776,6 +815,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, extcall.code, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL; + vcpu->arch.sie_block->extcpuaddr = extcall.code; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL, (u16 *)__LC_EXT_INT_CODE); @@ -787,6 +832,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) return rc ? 
-EFAULT : 0; } +static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code) +{ + switch (code) { + case PGM_SPECIFICATION: + vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION; + break; + case PGM_OPERAND: + vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND; + break; + default: + return -EINVAL; + } + return 0; +} + static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -807,6 +867,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, pgm_info.code, 0); + /* PER is handled by the ultravisor */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER); + switch (pgm_info.code & ~PGM_PER) { case PGM_AFX_TRANSLATION: case PGM_ASX_TRANSLATION: @@ -818,7 +882,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) case PGM_PRIMARY_AUTHORITY: case PGM_SECONDARY_AUTHORITY: nullifying = true; - /* fall through */ + fallthrough; case PGM_SPACE_SWITCH: rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, (u64 *)__LC_TRANS_EXC_CODE); @@ -892,7 +956,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) /* bit 1+2 of the target are the ilc, so we can directly use ilen */ rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, - (u64 *) __LC_LAST_BREAK); + (u64 *) __LC_PGM_LAST_BREAK); rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_INT_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, @@ -902,20 +966,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +#define SCCB_MASK 0xFFFFFFF8 +#define SCCB_EVENT_PENDING 0x3 + +static int write_sclp(struct kvm_vcpu *vcpu, u32 parm) +{ + int rc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG; + vcpu->arch.sie_block->eiparams = parm; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, parm, + (u32 *)__LC_EXT_PARAMS); + + return rc ? 
-EFAULT : 0; +} + static int __must_check __deliver_service(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_ext_info ext; - int rc = 0; spin_lock(&fi->lock); - if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) || + !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { spin_unlock(&fi->lock); return 0; } ext = fi->srv_signal; memset(&fi->srv_signal, 0, sizeof(ext)); clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) + set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", @@ -924,16 +1017,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, ext.ext_params, 0); - rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= put_guest_lc(vcpu, ext.ext_params, - (u32 *)__LC_EXT_PARAMS); + return write_sclp(vcpu, ext.ext_params); +} - return rc ? -EFAULT : 0; +static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_ext_info ext; + + spin_lock(&fi->lock); + if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) { + spin_unlock(&fi->lock); + return 0; + } + ext = fi->srv_signal; + /* only clear the event bit */ + fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + spin_unlock(&fi->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event"); + vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, + ext.ext_params, 0); + + return write_sclp(vcpu, SCCB_EVENT_PENDING); } static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) @@ -1028,6 +1136,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io) { int rc; + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_IO; + vcpu->arch.sie_block->subchannel_id = io->subchannel_id; + vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr; + vcpu->arch.sie_block->io_int_parm = io->io_int_parm; + vcpu->arch.sie_block->io_int_word = io->io_int_word; + return 0; + } + rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID); rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR); rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM); @@ -1166,7 +1283,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) /* already expired? 
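
Together with the masked_irqs filtering added to pending_irqs_no_gisa() earlier in this patch, __deliver_service() above implements a small mask layer for protected guests: a delivered SCCB interrupt stays masked until the notification intercept (handle_pv_sclp() in intercept.c) re-arms it. The bookkeeping reduces to the following runnable sketch:

#include <stdio.h>

#define EX_IRQ_SERVICE (1UL << 0)

struct ex_float_int {
	unsigned long pending_irqs;
	unsigned long masked_irqs;
};

/* deliverable = pending & ~masked, mirroring pending_irqs_no_gisa() */
static unsigned long ex_deliverable(const struct ex_float_int *fi)
{
	return fi->pending_irqs & ~fi->masked_irqs;
}

int main(void)
{
	struct ex_float_int fi = { .pending_irqs = EX_IRQ_SERVICE };

	/* deliver once, then mask until the ultravisor notifies completion */
	fi.pending_irqs &= ~EX_IRQ_SERVICE;
	fi.masked_irqs |= EX_IRQ_SERVICE;

	/* a new injection while masked stays invisible to the delivery loop */
	fi.pending_irqs |= EX_IRQ_SERVICE;
	printf("%lu\n", ex_deliverable(&fi)); /* 0 */

	/* handle_pv_sclp() equivalent: unmask, keep it pending */
	fi.masked_irqs &= ~EX_IRQ_SERVICE;
	printf("%lu\n", ex_deliverable(&fi)); /* 1 */
	return 0;
}
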
*/ if (cputm >> 63) return 0; - return min(sltime, tod_to_ns(cputm)); + return min_t(u64, sltime, tod_to_ns(cputm)); } } else if (cpu_timer_interrupts_enabled(vcpu)) { sltime = kvm_s390_get_cpu_timer(vcpu); @@ -1213,10 +1330,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime); no_timer: - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + kvm_vcpu_halt(vcpu); + vcpu->valid_wakeup = false; __unset_cpu_idle(vcpu); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); hrtimer_cancel(&vcpu->arch.ckc_timer); return 0; @@ -1269,6 +1387,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc = 0; + bool delivered = false; unsigned long irq_type; unsigned long irqs; @@ -1329,6 +1448,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) case IRQ_PEND_EXT_SERVICE: rc = __deliver_service(vcpu); break; + case IRQ_PEND_EXT_SERVICE_EV: + rc = __deliver_service_ev(vcpu); + break; case IRQ_PEND_PFAULT_DONE: rc = __deliver_pfault_done(vcpu); break; @@ -1339,6 +1461,19 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) WARN_ONCE(1, "Unknown pending irq type %ld", irq_type); clear_bit(irq_type, &li->pending_irqs); } + delivered |= !rc; + } + + /* + * We delivered at least one interrupt and modified the PC. Force a + * singlestep event now. + */ + if (delivered && guestdbg_sstep_enabled(vcpu)) { + struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; + + debug_exit->addr = vcpu->arch.sie_block->gpsw.addr; + debug_exit->type = KVM_SINGLESTEP; + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; } set_intercept_indicators(vcpu); @@ -1421,7 +1556,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif) + if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) @@ -1668,7 +1803,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, goto out; } gisa_out: - tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL); + tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (tmp_inti) { tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0); tmp_inti->io.io_int_word = isc_to_int_word(isc); @@ -1681,9 +1816,6 @@ out: return inti; } -#define SCCB_MASK 0xFFFFFFF8 -#define SCCB_EVENT_PENDING 0x3 - static int __inject_service(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { @@ -1692,6 +1824,11 @@ static int __inject_service(struct kvm *kvm, kvm->stat.inject_service_signal++; spin_lock(&fi->lock); fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING; + + /* We always allow events, track them separately from the sccb ints */ + if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING) + set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + /* * Early versions of the QEMU s390 bios will inject several * service interrupts after another without handling a @@ -1773,6 +1910,12 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); + /* + * We do not use the lock checking variant as this is just a + * performance optimization and we do 
not hold the lock here. + * This is ok as the code will pick interrupts from both "lists" + * for delivery. + */ if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); gisa_set_ipm_gisc(gi->origin, isc); @@ -1834,7 +1977,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: if (!(type & KVM_S390_INT_IO_AI_MASK && - kvm->arch.gisa_int.origin)) + kvm->arch.gisa_int.origin) || + kvm_s390_pv_cpu_get_handle(dst_vcpu)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -1881,7 +2025,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti; int rc; - inti = kzalloc(sizeof(*inti), GFP_KERNEL); + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -1981,6 +2125,13 @@ int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu) return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); } +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + return test_bit(IRQ_PEND_RESTART, &li->pending_irqs); +} + void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -2080,6 +2231,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; int i; + mutex_lock(&kvm->lock); + if (!kvm_s390_pv_is_protected(kvm)) + fi->masked_irqs = 0; + mutex_unlock(&kvm->lock); spin_lock(&fi->lock); fi->pending_irqs = 0; memset(&fi->srv_signal, 0, sizeof(fi->srv_signal)); @@ -2146,7 +2301,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) n++; } } - if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) || + test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; @@ -2190,7 +2346,7 @@ static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; mutex_lock(&fi->ais_lock); ais.simm = fi->simm; @@ -2275,7 +2431,7 @@ static int enqueue_floating_irq(struct kvm_device *dev, return -EINVAL; while (len >= sizeof(struct kvm_s390_irq)) { - inti = kzalloc(sizeof(*inti), GFP_KERNEL); + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -2323,13 +2479,10 @@ static int register_io_adapter(struct kvm_device *dev, if (dev->kvm->arch.adapters[adapter_info.id] != NULL) return -EINVAL; - adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT); if (!adapter) return -ENOMEM; - INIT_LIST_HEAD(&adapter->maps); - init_rwsem(&adapter->maps_lock); - atomic_set(&adapter->nr_maps, 0); adapter->id = adapter_info.id; adapter->isc = adapter_info.isc; adapter->maskable = adapter_info.maskable; @@ -2354,87 +2507,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked) return ret; } -static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map; - int ret; - - if (!adapter || !addr) - return -EINVAL; - - map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) { - ret = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&map->list); - map->guest_addr = addr; - map->addr = gmap_translate(kvm->arch.gmap, addr); - if (map->addr 
== -EFAULT) { - ret = -EFAULT; - goto out; - } - ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); - if (ret < 0) - goto out; - BUG_ON(ret != 1); - down_write(&adapter->maps_lock); - if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) { - list_add_tail(&map->list, &adapter->maps); - ret = 0; - } else { - put_page(map->page); - ret = -EINVAL; - } - up_write(&adapter->maps_lock); -out: - if (ret) - kfree(map); - return ret; -} - -static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map, *tmp; - int found = 0; - - if (!adapter || !addr) - return -EINVAL; - - down_write(&adapter->maps_lock); - list_for_each_entry_safe(map, tmp, &adapter->maps, list) { - if (map->guest_addr == addr) { - found = 1; - atomic_dec(&adapter->nr_maps); - list_del(&map->list); - put_page(map->page); - kfree(map); - break; - } - } - up_write(&adapter->maps_lock); - - return found ? 0 : -EINVAL; -} - void kvm_s390_destroy_adapters(struct kvm *kvm) { int i; - struct s390_map_info *map, *tmp; - for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) { - if (!kvm->arch.adapters[i]) - continue; - list_for_each_entry_safe(map, tmp, - &kvm->arch.adapters[i]->maps, list) { - list_del(&map->list); - put_page(map->page); - kfree(map); - } + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) kfree(kvm->arch.adapters[i]); - } } static int modify_io_adapter(struct kvm_device *dev, @@ -2456,11 +2534,14 @@ static int modify_io_adapter(struct kvm_device *dev, if (ret > 0) ret = 0; break; + /* + * The following operations are no longer needed and therefore no-ops. + * The gpa to hva translation is done when an IRQ route is set up. The + * set_irq code uses get_user_pages_remote() to do the actual write. + */ case KVM_S390_IO_ADAPTER_MAP: - ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr); - break; case KVM_S390_IO_ADAPTER_UNMAP: - ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr); + ret = 0; break; default: ret = -EINVAL; @@ -2499,7 +2580,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) int ret = 0; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) return -EFAULT; @@ -2579,7 +2660,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_ais_all ais; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) return -EFAULT; @@ -2595,7 +2676,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; switch (attr->group) { @@ -2699,19 +2780,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) return swap ? 
(bit ^ (BITS_PER_LONG - 1)) : bit; } -static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter, - u64 addr) +static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { - struct s390_map_info *map; + struct page *page = NULL; - if (!adapter) - return NULL; - - list_for_each_entry(map, &adapter->maps, list) { - if (map->guest_addr == addr) - return map; - } - return NULL; + mmap_read_lock(kvm->mm); + get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, + &page, NULL); + mmap_read_unlock(kvm->mm); + return page; } static int adapter_indicators_set(struct kvm *kvm, @@ -2720,30 +2797,35 @@ static int adapter_indicators_set(struct kvm *kvm, { unsigned long bit; int summary_set, idx; - struct s390_map_info *info; + struct page *ind_page, *summary_page; void *map; - info = get_map_info(adapter, adapter_int->ind_addr); - if (!info) + ind_page = get_map_page(kvm, adapter_int->ind_addr); + if (!ind_page) return -1; - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap); - set_bit(bit, map); - idx = srcu_read_lock(&kvm->srcu); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); - info = get_map_info(adapter, adapter_int->summary_addr); - if (!info) { - srcu_read_unlock(&kvm->srcu, idx); + summary_page = get_map_page(kvm, adapter_int->summary_addr); + if (!summary_page) { + put_page(ind_page); return -1; } - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->summary_offset, - adapter->swap); + + idx = srcu_read_lock(&kvm->srcu); + map = page_address(ind_page); + bit = get_ind_bit(adapter_int->ind_addr, + adapter_int->ind_offset, adapter->swap); + set_bit(bit, map); + mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + set_page_dirty_lock(ind_page); + map = page_address(summary_page); + bit = get_ind_bit(adapter_int->summary_addr, + adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); + mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); + + put_page(ind_page); + put_page(summary_page); return summary_set ? 
0 : 1; } @@ -2765,9 +2847,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, adapter = get_io_adapter(kvm, e->adapter.adapter_id); if (!adapter) return -1; - down_read(&adapter->maps_lock); ret = adapter_indicators_set(kvm, adapter, &e->adapter); - up_read(&adapter->maps_lock); if ((ret > 0) && !adapter->masked) { ret = kvm_s390_inject_airq(kvm, adapter); if (ret == 0) @@ -2818,23 +2898,27 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int ret; + u64 uaddr; switch (ue->type) { + /* we store the userspace addresses instead of the guest addresses */ case KVM_IRQ_ROUTING_S390_ADAPTER: e->set = set_adapter_int; - e->adapter.summary_addr = ue->u.adapter.summary_addr; - e->adapter.ind_addr = ue->u.adapter.ind_addr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.summary_addr = uaddr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.ind_addr = uaddr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; - ret = 0; - break; + return 0; default: - ret = -EINVAL; + return -EINVAL; } - - return ret; } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, @@ -2983,18 +3067,19 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; + u8 vcpu_isc_mask; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; - deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); - if (deliverable_mask) { + vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask & vcpu_isc_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; @@ -3015,7 +3100,7 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) __airqs_kick_single_vcpu(kvm, pending_mask); hrtimer_forward_now(timer, ns_to_ktime(gi->expires)); return HRTIMER_RESTART; - }; + } return HRTIMER_NORESTART; } @@ -3027,9 +3112,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) static void process_gib_alert_list(void) { struct kvm_s390_gisa_interrupt *gi; + u32 final, gisa_phys, origin = 0UL; struct kvm_s390_gisa *gisa; struct kvm *kvm; - u32 final, origin = 0UL; do { /* @@ -3055,9 +3140,10 @@ static void process_gib_alert_list(void) * interruptions asap. 
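
The alert-list handling described above relies on a convention worth spelling out: a GISA whose next_alert link points at itself is not on the list; the walk advances past each node and then self-links it again so it can be alerted anew. A pointer-based model of that invariant (the real structure links 32-bit physical addresses, and NULL here models the list terminator):

#include <stdio.h>

struct ex_gisa {
	struct ex_gisa *next_alert;  /* self-pointer means "not alerted" */
	int id;
};

static void ex_process_alert_list(struct ex_gisa *origin)
{
	while (origin) {
		struct ex_gisa *gisa = origin;

		origin = gisa->next_alert;  /* advance before unlinking */
		gisa->next_alert = gisa;    /* self-link: off the list again */
		printf("kick vcpus of gisa %d\n", gisa->id);
	}
}

int main(void)
{
	struct ex_gisa a = { .id = 1 }, b = { .id = 2 };

	a.next_alert = &b;   /* a -> b -> end */
	b.next_alert = NULL;
	ex_process_alert_list(&a);
	printf("%d %d\n", a.next_alert == &a, b.next_alert == &b); /* 1 1 */
	return 0;
}
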
*/ while (origin & GISA_ADDR_MASK) { - gisa = (struct kvm_s390_gisa *)(u64)origin; + gisa_phys = origin; + gisa = phys_to_virt(gisa_phys); origin = gisa->next_alert; - gisa->next_alert = (u32)(u64)gisa; + gisa->next_alert = gisa_phys; kvm = container_of(gisa, struct sie_page2, gisa)->kvm; gi = &kvm->arch.gisa_int; if (hrtimer_active(&gi->timer)) @@ -3091,23 +3177,67 @@ void kvm_s390_gisa_init(struct kvm *kvm) hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); gi->timer.function = gisa_vcpu_kicker; memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); - gi->origin->next_alert = (u32)(u64)gi->origin; + gi->origin->next_alert = (u32)virt_to_phys(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } +void kvm_s390_gisa_enable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + u32 gisa_desc; + + if (gi->origin) + return; + kvm_s390_gisa_init(kvm); + gisa_desc = kvm_s390_get_gisa_desc(kvm); + if (!gisa_desc) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->gd = gisa_desc; + vcpu->arch.sie_block->eca |= ECA_AIV; + VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", + vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); + mutex_unlock(&vcpu->mutex); + } +} + void kvm_s390_gisa_destroy(struct kvm *kvm) { struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_s390_gisa *gisa = gi->origin; if (!gi->origin) return; - if (gi->alert.mask) - KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", - kvm, gi->alert.mask); - while (gisa_in_alert_list(gi->origin)) - cpu_relax(); + WARN(gi->alert.mask != 0x00, + "unexpected non zero alert.mask 0x%02x", + gi->alert.mask); + gi->alert.mask = 0x00; + if (gisa_set_iam(gi->origin, gi->alert.mask)) + process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; + VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); +} + +void kvm_s390_gisa_disable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + + if (!gi->origin) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->eca &= ~ECA_AIV; + vcpu->arch.sie_block->gd = 0U; + mutex_unlock(&vcpu->mutex); + VCPU_EVENT(vcpu, 3, "AIV disabled for cpu %03u", vcpu->vcpu_id); + } + kvm_s390_gisa_destroy(kvm); } /** @@ -3193,29 +3323,111 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) +static void aen_host_forward(unsigned long si) +{ + struct kvm_s390_gisa_interrupt *gi; + struct zpci_gaite *gaite; + struct kvm *kvm; + + gaite = (struct zpci_gaite *)aift->gait + + (si * sizeof(struct zpci_gaite)); + if (gaite->count == 0) + return; + if (gaite->aisb != 0) + set_bit_inv(gaite->aisbo, phys_to_virt(gaite->aisb)); + + kvm = kvm_s390_pci_si_to_kvm(aift, si); + if (!kvm) + return; + gi = &kvm->arch.gisa_int; + + if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) || + !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) { + gisa_set_ipm_gisc(gi->origin, gaite->gisc); + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + kvm->stat.aen_forward++; + } +} + +static void aen_process_gait(u8 isc) { + bool found = false, first = true; + union zpci_sic_iib iib = {{0}}; + unsigned long si, flags; + + spin_lock_irqsave(&aift->gait_lock, flags); + + if (!aift->gait) { + spin_unlock_irqrestore(&aift->gait_lock, 
flags); + return; + } + + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv)); + if (si == -1UL) { + if (first || found) { + /* Re-enable interrupts. */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, + &iib); + first = found = false; + } else { + /* Interrupts on and all bits processed */ + break; + } + found = false; + si = 0; + /* Scan again after re-enabling interrupts */ + continue; + } + found = true; + aen_host_forward(si); + } + + spin_unlock_irqrestore(&aift->gait_lock, flags); +} + +static void gib_alert_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info; + inc_irq_stat(IRQIO_GAL); - process_gib_alert_list(); + + if ((info->forward || info->error) && + IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + aen_process_gait(info->isc); + if (info->aism != 0) + process_gib_alert_list(); + } else { + process_gib_alert_list(); + } } static struct airq_struct gib_alert_irq = { .handler = gib_alert_irq_handler, - .lsi_ptr = &gib_alert_irq.lsi_mask, }; void kvm_s390_gib_destroy(void) { if (!gib) return; + if (kvm_s390_pci_interp_allowed() && aift) { + mutex_lock(&aift->aift_lock); + kvm_s390_pci_aen_exit(); + mutex_unlock(&aift->aift_lock); + } chsc_sgib(0); unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); gib = NULL; } -int kvm_s390_gib_init(u8 nisc) +int __init kvm_s390_gib_init(u8 nisc) { + u32 gib_origin; int rc = 0; if (!css_general_characteristics.aiv) { @@ -3223,7 +3435,7 @@ int kvm_s390_gib_init(u8 nisc) goto out; } - gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); if (!gib) { rc = -ENOMEM; goto out; @@ -3235,9 +3447,12 @@ int kvm_s390_gib_init(u8 nisc) rc = -EIO; goto out_free_gib; } + /* adapter interrupts used for AP (applicable here) don't use the LSI */ + *gib_alert_irq.lsi_ptr = 0xff; gib->nisc = nisc; - if (chsc_sgib((u32)(u64)gib)) { + gib_origin = virt_to_phys(gib); + if (chsc_sgib(gib_origin)) { pr_err("Associating the GIB with the AIV facility failed\n"); free_page((unsigned long)gib); gib = NULL; @@ -3245,6 +3460,14 @@ int kvm_s390_gib_init(u8 nisc) goto out_unreg_gal; } + if (kvm_s390_pci_interp_allowed()) { + if (kvm_s390_pci_aen_init(nisc)) { + pr_err("Initializing AEN for PCI failed\n"); + rc = -EIO; + goto out_unreg_gal; + } + } + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); goto out; diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h deleted file mode 100644 index 484608c71dd0..000000000000 --- a/arch/s390/kvm/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 irqchip routines - * - * Copyright IBM Corp. 2014 - * - * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> - */ -#ifndef __KVM_IRQ_H -#define __KVM_IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} - -#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d9e6bf3d54f0..ea63ac769889 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2,11 +2,10 @@ /* * hosting IBM Z kernel virtual machines (s390x) * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> - * Heiko Carstens <heiko.carstens@de.ibm.com> * Christian Ehrhardt <ehrhardt@de.ibm.com> * Jason J. 
Herne <jjherne@us.ibm.com> */ @@ -31,11 +30,12 @@ #include <linux/bitmap.h> #include <linux/sched/signal.h> #include <linux/string.h> +#include <linux/pgtable.h> +#include <linux/mmu_notifier.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> #include <asm/stp.h> -#include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/nmi.h> #include <asm/switch_to.h> @@ -44,8 +44,11 @@ #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/ap.h> +#include <asm/uv.h> +#include <asm/fpu/api.h> #include "kvm-s390.h" #include "gaccess.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -56,118 +59,137 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "userspace_handled", VCPU_STAT(exit_userspace) }, - { "exit_null", VCPU_STAT(exit_null) }, - { "exit_validity", VCPU_STAT(exit_validity) }, - { "exit_stop_request", VCPU_STAT(exit_stop_request) }, - { "exit_external_request", VCPU_STAT(exit_external_request) }, - { "exit_io_request", VCPU_STAT(exit_io_request) }, - { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) }, - { "exit_instruction", VCPU_STAT(exit_instruction) }, - { "exit_pei", VCPU_STAT(exit_pei) }, - { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, - { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, - { "exit_operation_exception", VCPU_STAT(exit_operation_exception) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, - { "instruction_lctl", VCPU_STAT(instruction_lctl) }, - { "instruction_stctl", VCPU_STAT(instruction_stctl) }, - { "instruction_stctg", VCPU_STAT(instruction_stctg) }, - { "deliver_ckc", VCPU_STAT(deliver_ckc) }, - { "deliver_cputm", VCPU_STAT(deliver_cputm) }, - { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, - { "deliver_external_call", VCPU_STAT(deliver_external_call) }, - { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, - { "deliver_virtio", VCPU_STAT(deliver_virtio) }, - { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, - { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) }, - { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, - { "deliver_program", VCPU_STAT(deliver_program) }, - { "deliver_io", VCPU_STAT(deliver_io) }, - { "deliver_machine_check", VCPU_STAT(deliver_machine_check) }, - { "exit_wait_state", VCPU_STAT(exit_wait_state) }, - { "inject_ckc", VCPU_STAT(inject_ckc) }, - { "inject_cputm", VCPU_STAT(inject_cputm) }, - { "inject_external_call", VCPU_STAT(inject_external_call) }, - { "inject_float_mchk", VM_STAT(inject_float_mchk) }, - { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) }, - { "inject_io", VM_STAT(inject_io) }, - { "inject_mchk", VCPU_STAT(inject_mchk) }, - { "inject_pfault_done", VM_STAT(inject_pfault_done) }, - { "inject_program", VCPU_STAT(inject_program) }, - { "inject_restart", VCPU_STAT(inject_restart) }, - { "inject_service_signal", VM_STAT(inject_service_signal) }, - { "inject_set_prefix", VCPU_STAT(inject_set_prefix) }, - { "inject_stop_signal", 
VCPU_STAT(inject_stop_signal) }, - { "inject_pfault_init", VCPU_STAT(inject_pfault_init) }, - { "inject_virtio", VM_STAT(inject_virtio) }, - { "instruction_epsw", VCPU_STAT(instruction_epsw) }, - { "instruction_gs", VCPU_STAT(instruction_gs) }, - { "instruction_io_other", VCPU_STAT(instruction_io_other) }, - { "instruction_lpsw", VCPU_STAT(instruction_lpsw) }, - { "instruction_lpswe", VCPU_STAT(instruction_lpswe) }, - { "instruction_pfmf", VCPU_STAT(instruction_pfmf) }, - { "instruction_ptff", VCPU_STAT(instruction_ptff) }, - { "instruction_stidp", VCPU_STAT(instruction_stidp) }, - { "instruction_sck", VCPU_STAT(instruction_sck) }, - { "instruction_sckpf", VCPU_STAT(instruction_sckpf) }, - { "instruction_spx", VCPU_STAT(instruction_spx) }, - { "instruction_stpx", VCPU_STAT(instruction_stpx) }, - { "instruction_stap", VCPU_STAT(instruction_stap) }, - { "instruction_iske", VCPU_STAT(instruction_iske) }, - { "instruction_ri", VCPU_STAT(instruction_ri) }, - { "instruction_rrbe", VCPU_STAT(instruction_rrbe) }, - { "instruction_sske", VCPU_STAT(instruction_sske) }, - { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) }, - { "instruction_essa", VCPU_STAT(instruction_essa) }, - { "instruction_stsi", VCPU_STAT(instruction_stsi) }, - { "instruction_stfl", VCPU_STAT(instruction_stfl) }, - { "instruction_tb", VCPU_STAT(instruction_tb) }, - { "instruction_tpi", VCPU_STAT(instruction_tpi) }, - { "instruction_tprot", VCPU_STAT(instruction_tprot) }, - { "instruction_tsch", VCPU_STAT(instruction_tsch) }, - { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, - { "instruction_sie", VCPU_STAT(instruction_sie) }, - { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, - { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, - { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, - { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, - { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) }, - { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) }, - { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, - { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) }, - { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) }, - { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) }, - { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, - { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) }, - { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) }, - { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) }, - { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) }, - { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) }, - { "instruction_diag_10", VCPU_STAT(diagnose_10) }, - { "instruction_diag_44", VCPU_STAT(diagnose_44) }, - { "instruction_diag_9c", VCPU_STAT(diagnose_9c) }, - { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) }, - { "instruction_diag_258", VCPU_STAT(diagnose_258) }, - { "instruction_diag_308", VCPU_STAT(diagnose_308) }, - { "instruction_diag_500", VCPU_STAT(diagnose_500) }, - { "instruction_diag_other", VCPU_STAT(diagnose_other) }, - { NULL } +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, inject_io), + STATS_DESC_COUNTER(VM, inject_float_mchk), + STATS_DESC_COUNTER(VM, inject_pfault_done), + 
STATS_DESC_COUNTER(VM, inject_service_signal), + STATS_DESC_COUNTER(VM, inject_virtio), + STATS_DESC_COUNTER(VM, aen_forward), + STATS_DESC_COUNTER(VM, gmap_shadow_reuse), + STATS_DESC_COUNTER(VM, gmap_shadow_create), + STATS_DESC_COUNTER(VM, gmap_shadow_r1_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r2_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r3_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_sg_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_pg_entry), }; -struct kvm_s390_tod_clock_ext { - __u8 epoch_idx; - __u64 tod; - __u8 reserved[7]; -} __packed; +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, exit_userspace), + STATS_DESC_COUNTER(VCPU, exit_null), + STATS_DESC_COUNTER(VCPU, exit_external_request), + STATS_DESC_COUNTER(VCPU, exit_io_request), + STATS_DESC_COUNTER(VCPU, exit_external_interrupt), + STATS_DESC_COUNTER(VCPU, exit_stop_request), + STATS_DESC_COUNTER(VCPU, exit_validity), + STATS_DESC_COUNTER(VCPU, exit_instruction), + STATS_DESC_COUNTER(VCPU, exit_pei), + STATS_DESC_COUNTER(VCPU, halt_no_poll_steal), + STATS_DESC_COUNTER(VCPU, instruction_lctl), + STATS_DESC_COUNTER(VCPU, instruction_lctlg), + STATS_DESC_COUNTER(VCPU, instruction_stctl), + STATS_DESC_COUNTER(VCPU, instruction_stctg), + STATS_DESC_COUNTER(VCPU, exit_program_interruption), + STATS_DESC_COUNTER(VCPU, exit_instr_and_program), + STATS_DESC_COUNTER(VCPU, exit_operation_exception), + STATS_DESC_COUNTER(VCPU, deliver_ckc), + STATS_DESC_COUNTER(VCPU, deliver_cputm), + STATS_DESC_COUNTER(VCPU, deliver_external_call), + STATS_DESC_COUNTER(VCPU, deliver_emergency_signal), + STATS_DESC_COUNTER(VCPU, deliver_service_signal), + STATS_DESC_COUNTER(VCPU, deliver_virtio), + STATS_DESC_COUNTER(VCPU, deliver_stop_signal), + STATS_DESC_COUNTER(VCPU, deliver_prefix_signal), + STATS_DESC_COUNTER(VCPU, deliver_restart_signal), + STATS_DESC_COUNTER(VCPU, deliver_program), + STATS_DESC_COUNTER(VCPU, deliver_io), + STATS_DESC_COUNTER(VCPU, deliver_machine_check), + STATS_DESC_COUNTER(VCPU, exit_wait_state), + STATS_DESC_COUNTER(VCPU, inject_ckc), + STATS_DESC_COUNTER(VCPU, inject_cputm), + STATS_DESC_COUNTER(VCPU, inject_external_call), + STATS_DESC_COUNTER(VCPU, inject_emergency_signal), + STATS_DESC_COUNTER(VCPU, inject_mchk), + STATS_DESC_COUNTER(VCPU, inject_pfault_init), + STATS_DESC_COUNTER(VCPU, inject_program), + STATS_DESC_COUNTER(VCPU, inject_restart), + STATS_DESC_COUNTER(VCPU, inject_set_prefix), + STATS_DESC_COUNTER(VCPU, inject_stop_signal), + STATS_DESC_COUNTER(VCPU, instruction_epsw), + STATS_DESC_COUNTER(VCPU, instruction_gs), + STATS_DESC_COUNTER(VCPU, instruction_io_other), + STATS_DESC_COUNTER(VCPU, instruction_lpsw), + STATS_DESC_COUNTER(VCPU, instruction_lpswe), + STATS_DESC_COUNTER(VCPU, instruction_pfmf), + STATS_DESC_COUNTER(VCPU, instruction_ptff), + STATS_DESC_COUNTER(VCPU, instruction_sck), + STATS_DESC_COUNTER(VCPU, instruction_sckpf), + STATS_DESC_COUNTER(VCPU, instruction_stidp), + STATS_DESC_COUNTER(VCPU, instruction_spx), + STATS_DESC_COUNTER(VCPU, instruction_stpx), + STATS_DESC_COUNTER(VCPU, instruction_stap), + STATS_DESC_COUNTER(VCPU, instruction_iske), + 
STATS_DESC_COUNTER(VCPU, instruction_ri), + STATS_DESC_COUNTER(VCPU, instruction_rrbe), + STATS_DESC_COUNTER(VCPU, instruction_sske), + STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock), + STATS_DESC_COUNTER(VCPU, instruction_stsi), + STATS_DESC_COUNTER(VCPU, instruction_stfl), + STATS_DESC_COUNTER(VCPU, instruction_tb), + STATS_DESC_COUNTER(VCPU, instruction_tpi), + STATS_DESC_COUNTER(VCPU, instruction_tprot), + STATS_DESC_COUNTER(VCPU, instruction_tsch), + STATS_DESC_COUNTER(VCPU, instruction_sie), + STATS_DESC_COUNTER(VCPU, instruction_essa), + STATS_DESC_COUNTER(VCPU, instruction_sthyi), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running), + STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call), + STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_start), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_arch), + STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix), + STATS_DESC_COUNTER(VCPU, instruction_sigp_restart), + STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_10), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_44), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_9c), + STATS_DESC_COUNTER(VCPU, diag_9c_ignored), + STATS_DESC_COUNTER(VCPU, diag_9c_forward), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_258), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_308), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_500), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), + STATS_DESC_COUNTER(VCPU, pfault_sync) +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; /* allow nested virtualization in KVM (if enabled by user space) */ static int nested; @@ -184,6 +206,24 @@ static u8 halt_poll_max_steal = 10; module_param(halt_poll_max_steal, byte, 0644); MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling"); +/* if set to true, the GISA will be initialized and used if available */ +static bool use_gisa = true; +module_param(use_gisa, bool, 0644); +MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); + +/* maximum diag9c forwarding per second */ +unsigned int diag9c_forwarding_hz; +module_param(diag9c_forwarding_hz, uint, 0644); +MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); + +/* + * allow asynchronous deinit for protected guests; enable by default since + * the feature is opt-in anyway + */ +static int async_destroy = 1; +module_param(async_destroy, int, 0444); +MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests"); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. 
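The facility-list sizing comment here, and the facility tests throughout this file, hinge on s390's facility bitmap convention: bits are numbered from the most-significant bit of the first doubleword, which is why the kernel pairs stfle_fac_list with "inverted" bit helpers such as set_bit_inv(). A minimal standalone sketch of that numbering, with hypothetical names (the real test_facility() lives in asm/facility.h):

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: s390 facility bit N lives in doubleword N/64,
 * counted from the most-significant bit, hence the 63 - (N % 64)
 * inversion. SIZE_INTERNAL in the patch caps the stored list at 16
 * doublewords, i.e. facilities 0..1023.
 */
static int test_fac_bit(const uint64_t *fac_list, unsigned int nr,
			unsigned int doublewords)
{
	unsigned int word = nr / 64;
	unsigned int bit = 63 - (nr % 64);	/* MSB-first numbering */

	if (word >= doublewords)
		return 0;	/* beyond the stored part of the list */
	return (fac_list[word] >> bit) & 1;
}

int main(void)
{
	uint64_t fac_list[16] = { 0 };

	/* pretend the machine reported facility 129 (vector support) */
	fac_list[129 / 64] |= 1ULL << (63 - (129 % 64));
	printf("facility 129: %d\n", test_fac_bit(fac_list, 129, 16));
	return 0;
}
```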
If we ever need to go beyond @@ -207,7 +247,7 @@ static unsigned long kvm_s390_fac_size(void) BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64); BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64); BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) > - sizeof(S390_lowcore.stfle_fac_list)); + sizeof(stfle_fac_list)); return SIZE_INTERNAL; } @@ -220,21 +260,13 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; +debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - -int kvm_arch_check_processor_compat(void) -{ - return 0; -} - +/* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); +static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -269,7 +301,7 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; unsigned long long *delta = v; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -293,25 +325,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void) -{ - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_register(&s390_epoch_delta_notifier, - &kvm_clock_notifier); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, - &kvm_clock_notifier); -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); @@ -319,37 +332,37 @@ static void allow_cpu_feat(unsigned long nr) static inline int plo_test_bit(unsigned char nr) { - register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; + unsigned long function = (unsigned long)nr | 0x100; int cc; asm volatile( + " lgr 0,%[function]\n" /* Parameter registers are ignored for "test bit" */ " plo 0,0,0,0(0)\n" " ipm %0\n" " srl %0,28\n" : "=d" (cc) - : "d" (r0) - : "cc"); + : [function] "d" (function) + : "cc", "0"); return cc == 0; } static __always_inline void __insn32_query(unsigned int opcode, u8 *query) { - register unsigned long r0 asm("0") = 0; /* query function */ - register unsigned long r1 asm("1") = (unsigned long) query; - asm volatile( - /* Parameter regs are ignored */ + " lghi 0,0\n" + " lgr 1,%[query]\n" + /* Parameter registers are ignored */ " .insn rrf,%[opc] << 16,2,4,6,0\n" : - : "d" (r0), "a" (r1), [opc] "i" (opcode) - : "cc", "memory"); + : [query] "d" ((unsigned long)query), [opc] "i" (opcode) + : "cc", "memory", "0", "1"); } #define INSN_SORTL 0xb938 #define INSN_DFLTCC 0xb939 -static void kvm_s390_cpu_feat_init(void) +static void __init kvm_s390_cpu_feat_init(void) { int i; @@ -452,7 +465,7 @@ static void kvm_s390_cpu_feat_init(void) */ } -int kvm_arch_init(void *opaque) +static int __init __kvm_s390_init(void) { int rc = -ENOMEM; @@ -460,8 +473,13 @@ int kvm_arch_init(void *opaque) if (!kvm_s390_dbf) return -ENOMEM; - if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) - 
goto out; + kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf_uv) + goto err_kvm_uv; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || + debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) + goto err_debug_view; kvm_s390_cpu_feat_init(); @@ -469,24 +487,54 @@ int kvm_arch_init(void *opaque) rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { pr_err("A FLIC registration call failed with rc=%d\n", rc); - goto out; + goto err_flic; + } + + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + rc = kvm_s390_pci_init(); + if (rc) { + pr_err("Unable to allocate AIFT for PCI\n"); + goto err_pci; + } } rc = kvm_s390_gib_init(GAL_ISC); if (rc) - goto out; + goto err_gib; + + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; -out: - kvm_arch_exit(); +err_gib: + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); +err_pci: +err_flic: +err_debug_view: + debug_unregister(kvm_s390_dbf_uv); +err_kvm_uv: + debug_unregister(kvm_s390_dbf); return rc; } -void kvm_arch_exit(void) +static void __kvm_s390_exit(void) { + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + kvm_s390_gib_destroy(); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); debug_unregister(kvm_s390_dbf); + debug_unregister(kvm_s390_dbf_uv); } /* Section: device related */ @@ -515,7 +563,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -529,8 +576,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: case KVM_CAP_S390_AIS_MIGRATION: + case KVM_CAP_S390_VCPU_RESETS: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; + case KVM_CAP_SET_GUEST_DEBUG2: + r = KVM_GUESTDBG_VALID_MASK; + break; case KVM_CAP_S390_HPAGE_1M: r = 0; if (hpage && !kvm_is_ucontrol(kvm)) @@ -539,6 +593,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_MEM_OP: r = MEM_OP_MAX_SIZE; break; + case KVM_CAP_S390_MEM_OP_EXTENSION: + /* + * Flag bits indicating which extensions are supported. + * If r > 0, the base extension must also be supported/indicated, + * in order to maintain backwards compatibility. 
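The mask assembled just below is meant to be consumed bitwise by userspace through KVM_CHECK_EXTENSION, with the base bit guaranteed to be set whenever any further extension bit is reported. A sketch of the caller side, assuming an s390 host and a linux/kvm.h recent enough to define these constants:

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int mask;

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* return value is a bit mask, not a boolean */
	mask = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_S390_MEM_OP_EXTENSION);
	if (mask > 0 && (mask & KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG))
		printf("absolute cmpxchg memop supported\n");
	close(kvm);
	return 0;
}
```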
+ */ + r = KVM_S390_MEMOP_EXTENSION_CAP_BASE | + KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG; + break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: @@ -547,12 +610,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; + if (ext == KVM_CAP_NR_VCPUS) + r = min_t(unsigned int, num_online_cpus(), r); break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; case KVM_CAP_S390_VECTOR_REGISTERS: - r = MACHINE_HAS_VX; + r = test_facility(129); break; case KVM_CAP_S390_RI: r = test_facility(64); @@ -563,14 +628,45 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED: + r = is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED_DUMP: { + u64 pv_cmds_dump[] = { + BIT_UVC_CMD_DUMP_INIT, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE, + BIT_UVC_CMD_DUMP_CPU, + BIT_UVC_CMD_DUMP_COMPLETE, + }; + int i; + + r = is_prot_virt_host(); + + for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) { + if (!test_bit_inv(pv_cmds_dump[i], + (unsigned long *)&uv_info.inst_calls_list)) { + r = 0; + break; + } + } + break; + } + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } return r; } -static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { int i; gfn_t cur_gfn, last_gfn; @@ -611,9 +707,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, { int r; unsigned long n; - struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int is_dirty = 0; + int is_dirty; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -624,14 +719,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - kvm_s390_sync_dirty_log(kvm, memslot); - r = kvm_get_dirty_log(kvm, log, &is_dirty); + r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot); if (r) goto out; @@ -648,7 +736,7 @@ out: static void icpt_operexc_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -678,7 +766,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) mutex_lock(&kvm->lock); if (kvm->created_vcpus) { r = -EBUSY; - } else if (MACHINE_HAS_VX) { + } else if (cpu_has_vx()) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); if (test_facility(134)) { @@ -697,6 +785,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) set_kvm_facility(kvm->arch.model.fac_mask, 152); set_kvm_facility(kvm->arch.model.fac_list, 152); } + if (test_facility(192)) { + set_kvm_facility(kvm->arch.model.fac_mask, 192); + set_kvm_facility(kvm->arch.model.fac_list, 192); + } r = 0; } else r = -EINVAL; @@ -753,9 +845,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; - down_write(&kvm->mm->mmap_sem); + mmap_write_lock(kvm->mm); kvm->mm->context.allow_gmap_hpage_1m = 1; - up_write(&kvm->mm->mmap_sem); + mmap_write_unlock(kvm->mm); /* * We might have to create fake 4k page * tables. 
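The locking conversion visible just above (down_write(&kvm->mm->mmap_sem) becoming mmap_write_lock(kvm->mm), and likewise for the read side throughout this file) is mechanical. Roughly how the wrapper API maps onto the renamed rwsem, simplified from include/linux/mmap_lock.h; the real helpers also add tracepoints and assertions:

```c
/* Simplified sketch; the field itself was renamed from mmap_sem to
 * mmap_lock as part of the same tree-wide change.
 */
static inline void mmap_read_lock(struct mm_struct *mm)
{
	down_read(&mm->mmap_lock);
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
	up_read(&mm->mmap_lock);
}

static inline void mmap_write_lock(struct mm_struct *mm)
{
	down_write(&mm->mmap_lock);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
	up_write(&mm->mmap_lock);
}
```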
To avoid that the hardware works on @@ -779,6 +871,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -898,7 +1004,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_s390_vcpu_block_all(kvm); @@ -981,9 +1087,45 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu) +{ + /* Only set the ECB bits after guest requests zPCI interpretation */ + if (!vcpu->kvm->arch.use_zpci_interp) + return; + + vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI; + vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; +} + +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + if (!kvm_s390_pci_interp_allowed()) + return; + + /* + * If host is configured for PCI and the necessary facilities are + * available, turn on interpretation for the life of this guest + */ + kvm->arch.use_zpci_interp = 1; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_pci_setup(vcpu); + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) { - int cx; + unsigned long cx; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(cx, vcpu, kvm) @@ -999,13 +1141,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) struct kvm_memory_slot *ms; struct kvm_memslots *slots; unsigned long ram_pages = 0; - int slotnr; + int bkt; /* migration mode already enabled */ if (kvm->arch.migration_mode) return 0; slots = kvm_memslots(kvm); - if (!slots || !slots->used_slots) + if (!slots || kvm_memslots_empty(slots)) return -EINVAL; if (!kvm->arch.use_cmma) { @@ -1013,8 +1155,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) return 0; } /* mark all the pages in active slots as dirty */ - for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { - ms = slots->memslots + slotnr; + kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; /* @@ -1081,6 +1222,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm, return 0; } +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); + static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_tod_clock gtod; @@ -1090,7 +1233,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) return -EINVAL; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", gtod.epoch_idx, gtod.tod); @@ -1121,7 +1264,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) sizeof(gtod.tod))) return -EFAULT; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, 
"SET: TOD base: 0x%llx", gtod.tod); return 0; } @@ -1133,6 +1276,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) if (attr->flags) return -EINVAL; + mutex_lock(&kvm->lock); + /* + * For protected guests, the TOD is managed by the ultravisor, so trying + * to change it will never bring the expected results. + */ + if (kvm_s390_pv_is_protected(kvm)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + switch (attr->attr) { case KVM_S390_VM_TOD_EXT: ret = kvm_s390_set_tod_ext(kvm, attr); @@ -1147,23 +1300,26 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) ret = -ENXIO; break; } + +out_unlock: + mutex_unlock(&kvm->lock); return ret; } static void kvm_s390_get_tod_clock(struct kvm *kvm, struct kvm_s390_vm_tod_clock *gtod) { - struct kvm_s390_tod_clock_ext htod; + union tod_clock clk; preempt_disable(); - get_tod_clock_ext((char *)&htod); + store_tod_clock_ext(&clk); - gtod->tod = htod.tod + kvm->arch.epoch; + gtod->tod = clk.tod + kvm->arch.epoch; gtod->epoch_idx = 0; if (test_kvm_facility(kvm, 139)) { - gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx; - if (gtod->tod < htod.tod) + gtod->epoch_idx = clk.ei + kvm->arch.epdx; + if (gtod->tod < clk.tod) gtod->epoch_idx += 1; } @@ -1243,7 +1399,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) ret = -EBUSY; goto out; } - proc = kzalloc(sizeof(*proc), GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1295,8 +1451,7 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, mutex_unlock(&kvm->lock); return -EBUSY; } - bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_from_arr64(kvm->arch.cpu_feat, data.feat, KVM_S390_VM_CPU_FEAT_NR_BITS); mutex_unlock(&kvm->lock); VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", data.feat[0], @@ -1382,6 +1537,39 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, return 0; } +#define KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK \ +( \ + ((struct kvm_s390_vm_cpu_uv_feat){ \ + .ap = 1, \ + .ap_intr = 1, \ + }) \ + .feat \ +) + +static int kvm_s390_set_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *ptr = (void __user *)attr->addr; + unsigned long data, filter; + + filter = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (get_user(data, &ptr->feat)) + return -EFAULT; + if (!bitmap_subset(&data, &filter, KVM_S390_VM_CPU_UV_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + kvm->arch.model.uv_feat_guest.feat = data; + mutex_unlock(&kvm->lock); + + VM_EVENT(kvm, 3, "SET: guest UV-feat: 0x%16.16lx", data); + + return 0; +} + static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1396,6 +1584,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: ret = kvm_s390_set_processor_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_set_uv_feat(kvm, attr); + break; } return ret; } @@ -1405,7 +1596,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_processor *proc; int ret = 0; - proc = kzalloc(sizeof(*proc), GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1433,7 +1624,7 
@@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_machine *mach; int ret = 0; - mach = kzalloc(sizeof(*mach), GFP_KERNEL); + mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT); if (!mach) { ret = -ENOMEM; goto out; @@ -1442,8 +1633,8 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) mach->ibc = sclp.ibc; memcpy(&mach->fac_mask, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); - memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, - sizeof(S390_lowcore.stfle_fac_list)); + memcpy((unsigned long *)&mach->fac_list, stfle_fac_list, + sizeof(stfle_fac_list)); VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx", kvm->arch.model.ibc, kvm->arch.model.cpuid); @@ -1467,8 +1658,7 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm, { struct kvm_s390_vm_cpu_feat data; - bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_to_arr64(data.feat, kvm->arch.cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", @@ -1483,9 +1673,7 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, { struct kvm_s390_vm_cpu_feat data; - bitmap_copy((unsigned long *) data.feat, - kvm_s390_available_cpu_feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_to_arr64(data.feat, kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; VM_EVENT(kvm, 3, "GET: host feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", @@ -1631,6 +1819,33 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, return 0; } +static int kvm_s390_get_processor_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat = kvm->arch.model.uv_feat_guest.feat; + + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + +static int kvm_s390_get_machine_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat; + + BUILD_BUG_ON(sizeof(*dst) != sizeof(uv_info.uv_feature_indications)); + + feat = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1654,10 +1869,67 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_SUBFUNC: ret = kvm_s390_get_machine_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_get_processor_uv_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + ret = kvm_s390_get_machine_uv_feat(kvm, attr); + break; } return ret; } +/** + * kvm_s390_update_topology_change_report - update CPU topology change report + * @kvm: guest KVM description + * @val: set or clear the MTCR bit + * + * Updates the Multiprocessor Topology-Change-Report bit to signal + * the guest with a topology change. + * This is only relevant if the topology facility is present. + * + * The SCA version, bsca or esca, doesn't matter as offset is the same. 
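The helper documented here flips a single bit in the SCA, a structure shared with running vCPUs, so it uses the classic lock-free cmpxchg retry loop instead of taking a write lock. The same shape in standalone C11, with a hypothetical layout (the real sca_utility is a big-endian __u16 with mtcr as its most-significant bit):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

union utility {
	uint16_t val;
	struct {
		uint16_t mtcr : 1;	/* layout is illustrative only */
		uint16_t reserved : 15;
	};
};

static void set_mtcr(_Atomic uint16_t *shared, bool on)
{
	union utility old, new;

	old.val = atomic_load(shared);
	do {
		/* modify a private copy, then try to publish it */
		new = old;
		new.mtcr = on;
		/* on failure, old.val is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(shared, &old.val, new.val));
}
```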
+ */ +static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) +{ + union sca_utility new, old; + struct bsca_block *sca; + + read_lock(&kvm->arch.sca_lock); + sca = kvm->arch.sca; + do { + old = READ_ONCE(sca->utility); + new = old; + new.mtcr = val; + } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +} + +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1678,6 +1950,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1703,6 +1978,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1749,6 +2027,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_FEAT: case KVM_S390_VM_CPU_MACHINE_SUBFUNC: case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: ret = 0; break; default: @@ -1776,6 +2056,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 
0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -1784,7 +2067,7 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } -static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1801,11 +2084,11 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -1819,7 +2102,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) break; } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (!r) { r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, @@ -1832,7 +2115,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) return r; } -static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1846,7 +2129,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; @@ -1863,7 +2146,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) goto out; i = 0; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); srcu_idx = srcu_read_lock(&kvm->srcu); while (i < args->count) { unlocked = false; @@ -1881,7 +2164,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) { - r = fixup_user_fault(current, current->mm, hva, + r = fixup_user_fault(current->mm, hva, FAULT_FLAG_WRITE, &unlocked); if (r) break; @@ -1890,7 +2173,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) i++; } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); out: kvfree(keys); return r; @@ -1905,38 +2188,6 @@ out: /* for consistency */ #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) -/* - * Similar to gfn_to_memslot, but returns the index of a memslot also when the - * address falls in a hole. In that case the index of one of the memslots - * bordering the hole is returned. 
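For reference, the approximate lookup being deleted here assumed the old flat memslot array, which KVM kept sorted by descending base_gfn; the replacement gfn_to_memslot_approx() simply defers to ____gfn_to_memslot(slots, gfn, true) on the new gfn rb-tree. The deleted search, restated as standalone C:

```c
#include <stddef.h>

struct slot {
	unsigned long base_gfn;
	unsigned long npages;
};

/*
 * Find the slot containing gfn, or, if gfn falls in a hole, one of
 * the slots bordering that hole (exactly the deleted helper's
 * contract, minus the lru_slot fast path).
 */
static size_t approx_slot(const struct slot *slots, size_t used,
			  unsigned long gfn)
{
	size_t start = 0, end = used, mid;

	while (start < end) {
		mid = start + (end - start) / 2;
		if (gfn >= slots[mid].base_gfn)
			end = mid;	/* array sorted by descending base_gfn */
		else
			start = mid + 1;
	}
	return start;	/* exact slot, or a neighbour of the hole */
}
```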
- */ -static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) -{ - int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); - struct kvm_memory_slot *memslots = slots->memslots; - - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return slot; - - while (start < end) { - slot = start + (end - start) / 2; - - if (gfn >= memslots[slot].base_gfn) - end = slot; - else - start = slot + 1; - } - - if (gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); - } - - return start; -} - static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, u8 *res, unsigned long bufsize) { @@ -1960,27 +2211,36 @@ static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, return 0; } +static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, + gfn_t gfn) +{ + return ____gfn_to_memslot(slots, gfn, true); +} + static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, unsigned long cur_gfn) { - int slotidx = gfn_to_memslot_approx(slots, cur_gfn); - struct kvm_memory_slot *ms = slots->memslots + slotidx; + struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); unsigned long ofs = cur_gfn - ms->base_gfn; + struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; if (ms->base_gfn + ms->npages <= cur_gfn) { - slotidx--; + mnode = rb_next(mnode); /* If we are above the highest slot, wrap around */ - if (slotidx < 0) - slotidx = slots->used_slots - 1; + if (!mnode) + mnode = rb_first(&slots->gfn_tree); - ms = slots->memslots + slotidx; + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); ofs = 0; } + + if (cur_gfn < ms->base_gfn) + ofs = 0; + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); - while ((slotidx > 0) && (ofs >= ms->npages)) { - slotidx--; - ms = slots->memslots + slotidx; - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + while (ofs >= ms->npages && (mnode = rb_next(mnode))) { + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); + ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); } return ms->base_gfn + ofs; } @@ -1992,6 +2252,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *ms; + if (unlikely(kvm_memslots_empty(slots))) + return 0; + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); ms = gfn_to_memslot(kvm, cur_gfn); args->count = 0; @@ -1999,7 +2262,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, if (!ms) return 0; next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages; + mem_end = kvm_s390_get_gfn_end(slots); while (args->count < bufsize) { hva = gfn_to_hva(kvm, cur_gfn); @@ -2073,14 +2336,14 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (!values) return -ENOMEM; - down_read(&kvm->mm->mmap_sem); + mmap_read_lock(kvm->mm); srcu_idx = srcu_read_lock(&kvm->srcu); if (peek) ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); else ret = kvm_s390_get_cmma(kvm, args, values, bufsize); srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(&kvm->mm->mmap_sem); + mmap_read_unlock(kvm->mm); if (kvm->arch.migration_mode) args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); @@ -2130,7 +2393,7 @@ static int 
kvm_s390_set_cmma_bits(struct kvm *kvm, goto out; } - down_read(&kvm->mm->mmap_sem); + mmap_read_lock(kvm->mm); srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -2145,20 +2408,579 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, set_pgste_bits(kvm->mm, hva, mask, pgstev); } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(&kvm->mm->mmap_sem); + mmap_read_unlock(kvm->mm); if (!kvm->mm->context.uses_cmm) { - down_write(&kvm->mm->mmap_sem); + mmap_write_lock(kvm->mm); kvm->mm->context.uses_cmm = 1; - up_write(&kvm->mm->mmap_sem); + mmap_write_unlock(kvm->mm); } out: vfree(bits); return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +/** + * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to + * non protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Does not stop in case of error, tries to convert as many + * CPUs as possible. In case of error, the RC and RRC of the last error are + * returned. + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + u16 _rc, _rrc; + int ret = 0; + + /* + * We ignore failures and try to destroy as many CPUs as possible. + * At the same time we must not free the assigned resources when + * this fails, as the ultravisor has still access to that memory. + * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak + * behind. + * We want to return the first failure rc and rrc, though. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) { + *rc = _rc; + *rrc = _rrc; + ret = -EIO; + } + mutex_unlock(&vcpu->mutex); + } + /* Ensure that we re-enable gisa if the non-PV guest used it but the PV guest did not. */ + if (use_gisa) + kvm_s390_gisa_enable(kvm); + return ret; +} + +/** + * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM + * to protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Tries to undo the conversion in case of error. + * + * Return: 0 in case of success, otherwise -EIO + */ +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + unsigned long i; + int r = 0; + u16 dummy; + + struct kvm_vcpu *vcpu; + + /* Disable the GISA if the ultravisor does not support AIV. */ + if (!uv_has_feature(BIT_UV_FEAT_AIV)) + kvm_s390_gisa_disable(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + r = kvm_s390_pv_create_cpu(vcpu, rc, rrc); + mutex_unlock(&vcpu->mutex); + if (r) + break; + } + if (r) + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + return r; +} + +/* + * Here we provide user space with a direct interface to query UV + * related data like UV maxima and available features as well as + * feature specific data. + * + * To facilitate future extension of the data structures we'll try to + * write data up to the maximum requested length. 
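The comment ending here describes a forward-compatible ABI: userspace states the largest struct it can accept (len_max), the handler below returns how many bytes are actually valid, and the ioctl path stores that in len_written. A hypothetical consumer, with illustrative struct names rather than the exact uapi layout:

```c
#include <stddef.h>
#include <stdio.h>

struct pv_info_header {
	unsigned int id;
	unsigned int len_max;		/* set by userspace before the call */
	unsigned int len_written;	/* set by the kernel on return */
};

struct pv_info_vm {
	struct pv_info_header header;
	unsigned long long max_cpus;
	unsigned long long feature_indication;	/* a "newer" field */
};

static void consume(const struct pv_info_vm *info)
{
	printf("max_cpus: %llu\n", info->max_cpus);
	/* only trust fields the kernel says it actually filled in */
	if (info->header.len_written >=
	    offsetof(struct pv_info_vm, feature_indication) +
	    sizeof(info->feature_indication))
		printf("uv features: %#llx\n", info->feature_indication);
}
```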
+ */ +static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) +{ + ssize_t len_min; + + switch (info->header.id) { + case KVM_PV_INFO_VM: { + len_min = sizeof(info->header) + sizeof(info->vm); + + if (info->header.len_max < len_min) + return -EINVAL; + + memcpy(info->vm.inst_calls_list, + uv_info.inst_calls_list, + sizeof(uv_info.inst_calls_list)); + + /* It's max cpuid not max cpus, so it's off by one */ + info->vm.max_cpus = uv_info.max_guest_cpu_id + 1; + info->vm.max_guests = uv_info.max_num_sec_conf; + info->vm.max_guest_addr = uv_info.max_sec_stor_addr; + info->vm.feature_indication = uv_info.uv_feature_indications; + + return len_min; + } + case KVM_PV_INFO_DUMP: { + len_min = sizeof(info->header) + sizeof(info->dump); + + if (info->header.len_max < len_min) + return -EINVAL; + + info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len; + info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len; + info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len; + return len_min; + } + default: + return -EINVAL; + } +} + +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, + struct kvm_s390_pv_dmp dmp) +{ + int r = -EINVAL; + void __user *result_buff = (void __user *)dmp.buff_addr; + + switch (dmp.subcmd) { + case KVM_PV_DUMP_INIT: { + if (kvm->arch.pv.dumping) + break; + + /* + * Block SIE entry as concurrent dump UVCs could lead + * to validities. + */ + kvm_s390_vcpu_block_all(kvm); + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", + cmd->rc, cmd->rrc); + if (!r) { + kvm->arch.pv.dumping = true; + } else { + kvm_s390_vcpu_unblock_all(kvm); + r = -EINVAL; + } + break; + } + case KVM_PV_DUMP_CONFIG_STOR_STATE: { + if (!kvm->arch.pv.dumping) + break; + + /* + * gaddr is an output parameter since we might stop + * early. As dmp will be copied back in our caller, we + * don't need to do it ourselves. + */ + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_DUMP_COMPLETE: { + if (!kvm->arch.pv.dumping) + break; + + r = -EINVAL; + if (dmp.buff_len < uv_info.conf_dump_finalize_len) + break; + + r = kvm_s390_pv_dump_complete(kvm, result_buff, + &cmd->rc, &cmd->rrc); + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) +{ + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; + int r = 0; + u16 dummy; + + if (need_lock) + mutex_lock(&kvm->lock); + + switch (cmd->cmd) { + case KVM_PV_ENABLE: { + r = -EINVAL; + if (kvm_s390_pv_is_protected(kvm)) + break; + + /* + * FMT 4 SIE needs esca. 
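The KVM_PV_DUMP subcommands handled above form a small state machine keyed off kvm->arch.pv.dumping: INIT must run first (and blocks SIE entry to avoid validity intercepts), while STOR_STATE and COMPLETE are only valid mid-dump. A standalone condensation of that ordering contract, with hypothetical names:

```c
#include <errno.h>
#include <stdbool.h>

enum dump_cmd { DUMP_INIT, DUMP_STOR_STATE, DUMP_COMPLETE };

static bool dumping;	/* stands in for kvm->arch.pv.dumping */

static int dump_step(enum dump_cmd cmd)
{
	switch (cmd) {
	case DUMP_INIT:
		if (dumping)
			return -EINVAL;	/* already mid-dump */
		dumping = true;		/* the real code also blocks vcpus */
		return 0;
	case DUMP_STOR_STATE:
		return dumping ? 0 : -EINVAL;
	case DUMP_COMPLETE:
		if (!dumping)
			return -EINVAL;
		dumping = false;	/* simplification of the real teardown */
		return 0;
	}
	return -EINVAL;
}

int main(void)
{
	return (dump_step(DUMP_INIT) == 0 &&
		dump_step(DUMP_STOR_STATE) == 0 &&
		dump_step(DUMP_COMPLETE) == 0) ? 0 : 1;
}
```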
As we never switch back to bsca from + * esca, we need no cleanup in the error cases below + */ + r = sca_switch_to_extended(kvm); + if (r) + break; + + mmap_write_lock(current->mm); + r = gmap_mark_unmergeable(); + mmap_write_unlock(current->mm); + if (r) + break; + + r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc); + if (r) + break; + + r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc); + if (r) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + + /* we need to block service interrupts from now on */ + set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; + case KVM_PV_DISABLE: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_SET_SEC_PARMS: { + struct kvm_s390_pv_sec_parm parms = {}; + void *hdr; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&parms, argp, sizeof(parms))) + break; + + /* Currently restricted to 8KB */ + r = -EINVAL; + if (parms.length > PAGE_SIZE * 2) + break; + + r = -ENOMEM; + hdr = vmalloc(parms.length); + if (!hdr) + break; + + r = -EFAULT; + if (!copy_from_user(hdr, (void __user *)parms.origin, + parms.length)) + r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length, + &cmd->rc, &cmd->rrc); + + vfree(hdr); + break; + } + case KVM_PV_UNPACK: { + struct kvm_s390_pv_unp unp = {}; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !mm_is_protected(kvm->mm)) + break; + + r = -EFAULT; + if (copy_from_user(&unp, argp, sizeof(unp))) + break; + + r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_VERIFY: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc, + cmd->rrc); + break; + } + case KVM_PV_PREP_RESET: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_UNSHARE_ALL: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = 
uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_INFO: { + struct kvm_s390_pv_info info = {}; + ssize_t data_len; + + /* + * No need to check the VM protection here. + * + * Maybe user space wants to query some of the data + * when the VM is still unprotected. If we see the + * need to fence a new data command we can still + * return an error in the info handler. + */ + + r = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info.header))) + break; + + r = -EINVAL; + if (info.header.len_max < sizeof(info.header)) + break; + + data_len = kvm_s390_handle_pv_info(&info); + if (data_len < 0) { + r = data_len; + break; + } + /* + * If a data command struct is extended (multiple + * times) this can be used to determine how much of it + * is valid. + */ + info.header.len_written = data_len; + + r = -EFAULT; + if (copy_to_user(argp, &info, data_len)) + break; + + r = 0; + break; + } + case KVM_PV_DUMP: { + struct kvm_s390_pv_dmp dmp; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&dmp, argp, sizeof(dmp))) + break; + + r = kvm_s390_pv_dmp(kvm, cmd, dmp); + if (r) + break; + + if (copy_to_user(argp, &dmp, sizeof(dmp))) { + r = -EFAULT; + break; + } + + break; + } + default: + r = -ENOTTY; + } + if (need_lock) + mutex_unlock(&kvm->lock); + + return r; +} + +static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags) +{ + if (mop->flags & ~supported_flags || !mop->size) + return -EINVAL; + if (mop->size > MEM_OP_MAX_SIZE) + return -E2BIG; + if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { + if (mop->key > 0xf) + return -EINVAL; + } else { + mop->key = 0; + } + return 0; +} + +static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r, srcu_idx; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | + KVM_S390_MEMOP_F_CHECK_ONLY); + if (r) + return r; + + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? 
GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); + goto out_unlock; + } + if (acc_mode == GACC_FETCH) { + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_FETCH, mop->key); + if (r) + goto out_unlock; + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_unlock; + } + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_STORE, mop->key); + } + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + vfree(tmpbuf); + return r; +} + +static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void __user *old_addr = (void __user *)mop->old_addr; + union { + __uint128_t quad; + char raw[sizeof(__uint128_t)]; + } old = { .quad = 0}, new = { .quad = 0 }; + unsigned int off_in_quad = sizeof(new) - mop->size; + int r, srcu_idx; + bool success; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + /* + * This validates off_in_quad. Checking that size is a power + * of two is not necessary, as cmpxchg_guest_abs_with_key + * takes care of that + */ + if (mop->size > sizeof(new)) + return -EINVAL; + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + return -EFAULT; + if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + return -EFAULT; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, + new.quad, mop->key, &success); + if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) + r = -EFAULT; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return r; +} + +static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. 
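kvm_s390_vm_mem_op_cmpxchg() above right-aligns operands of up to 16 bytes inside a __uint128_t: on big-endian s390, copying into &raw[sizeof(quad) - size] places the operand in the numeric low-order bytes of the quad, which is what cmpxchg_guest_abs_with_key() expects. The offset arithmetic in isolation:

```c
#include <stddef.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char quad[16] = { 0 };	/* stands in for the raw[] view */
	unsigned char val[4] = { 0xde, 0xad, 0xbe, 0xef };
	size_t size = sizeof(val);
	size_t off_in_quad = sizeof(quad) - size;	/* 12 for a 4-byte op */

	/* right-align the operand, as the memop handler does */
	memcpy(&quad[off_in_quad], val, size);
	printf("operand occupies bytes %zu..%zu of the quad\n",
	       off_in_quad, sizeof(quad) - 1);
	return 0;
}
```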
+ */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; + + switch (mop->op) { + case KVM_S390_MEMOP_ABSOLUTE_READ: + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + return kvm_s390_vm_mem_op_abs(kvm, mop); + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + return kvm_s390_vm_mem_op_cmpxchg(kvm, mop); + default: + return -EINVAL; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; @@ -2254,6 +3076,54 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); break; } + case KVM_S390_PV_COMMAND: { + struct kvm_pv_cmd args; + + /* protvirt means user cpu state */ + kvm_s390_set_user_cpu_state_ctrl(kvm); + r = 0; + if (!is_prot_virt_host()) { + r = -EINVAL; + break; + } + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + if (args.flags) { + r = -EINVAL; + break; + } + /* must be called without kvm->lock */ + r = kvm_s390_handle_pv(kvm, &args); + if (copy_to_user(argp, &args, sizeof(args))) { + r = -EFAULT; + break; + } + break; + } + case KVM_S390_MEM_OP: { + struct kvm_s390_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_vm_mem_op(kvm, &mem_op); + else + r = -EFAULT; + break; + } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -2298,12 +3168,26 @@ static void kvm_s390_set_crycb_format(struct kvm *kvm) kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } +/* + * kvm_arch_crypto_set_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be set. + * @apm: the mask identifying the accessible AP adapters + * @aqm: the mask identifying the accessible AP domains + * @adm: the mask identifying the accessible AP control domains + * + * Set the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. + */ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm) { struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { @@ -2334,13 +3218,23 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); +/* + * kvm_arch_crypto_clear_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be cleared. + * + * Clear the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. 
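The crypto-mask helpers here drop their own mutex_lock/mutex_unlock pairs and instead document that the caller must hold kvm->lock. kvm_s390_vcpu_pci_enable_interp() earlier in this diff enforces the same kind of contract with lockdep_assert_held(); a portable analogue of making such a contract checkable, assuming pthreads (pthread mutexes cannot be queried for ownership, so the sketch tracks the owner by hand):

```c
#include <assert.h>
#include <pthread.h>

struct guarded {
	pthread_mutex_t lock;
	pthread_t owner;
	int masks_set;
};

static void set_masks_locked(struct guarded *g)
{
	/* contract: g->lock is held by the calling thread */
	assert(pthread_equal(g->owner, pthread_self()));
	g->masks_set = 1;
}

int main(void)
{
	struct guarded g = { .lock = PTHREAD_MUTEX_INITIALIZER };

	pthread_mutex_lock(&g.lock);
	g.owner = pthread_self();
	set_masks_locked(&g);	/* legal: lock is held */
	pthread_mutex_unlock(&g.lock);
	return 0;
}
```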
+ */ void kvm_arch_crypto_clear_masks(struct kvm *kvm) { - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); memset(&kvm->arch.crypto.crycb->apcb0, 0, @@ -2352,7 +3246,6 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm) /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); @@ -2369,6 +3262,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm) { kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; kvm_s390_set_crycb_format(kvm); + init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem); if (!test_kvm_facility(kvm, 76)) return; @@ -2391,9 +3285,17 @@ static void sca_dispose(struct kvm *kvm) kvm->arch.sca = NULL; } +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_clear_list(kvm); + + __kvm_arch_free_vm(kvm); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - gfp_t alloc_flags = GFP_KERNEL; + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; int i, rc; char debug_name[16]; static unsigned long sca_offset; @@ -2438,7 +3340,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) BUILD_BUG_ON(sizeof(struct sie_page2) != 4096); kvm->arch.sie_page2 = - (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + (struct sie_page2 *) get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); if (!kvm->arch.sie_page2) goto out_err; @@ -2446,10 +3348,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; for (i = 0; i < kvm_s390_fac_size(); i++) { - kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] & + kvm->arch.model.fac_mask[i] = stfle_fac_list[i] & (kvm_s390_fac_base[i] | kvm_s390_fac_ext[i]); - kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] & + kvm->arch.model.fac_list[i] = stfle_fac_list[i] & kvm_s390_fac_base[i]; } kvm->arch.model.subfuncs = kvm_s390_available_subfunc; @@ -2471,8 +3373,17 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; + kvm->arch.model.uv_feat_guest.feat = 0; + kvm_s390_crypto_init(kvm); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + mutex_lock(&kvm->lock); + kvm_s390_pci_init_list(kvm); + kvm_s390_vcpu_pci_enable_interp(kvm); + mutex_unlock(&kvm->lock); + } + mutex_init(&kvm->arch.float_int.ais_lock); spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) @@ -2503,7 +3414,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.use_skf = sclp.has_skey; spin_lock_init(&kvm->arch.start_stop_lock); kvm_s390_vsie_init(kvm); - kvm_s390_gisa_init(kvm); + if (use_gisa) + kvm_s390_gisa_init(kvm); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -2517,46 +3431,50 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + u16 rc, rrc; + VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); kvm_s390_clear_local_irqs(vcpu); kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) sca_del_vcpu(vcpu); + kvm_s390_update_topology_change_report(vcpu->kvm, 1); if (kvm_is_ucontrol(vcpu->kvm)) gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); + /* We can not hold the vcpu mutex here, we are already dying */ + if 
(kvm_s390_pv_cpu_get_handle(vcpu)) + kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); -} - -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); } void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_free_vcpus(kvm); + u16 rc, rrc; + + kvm_destroy_vcpus(kvm); sca_dispose(kvm); - debug_unregister(kvm->arch.dbf); kvm_s390_gisa_destroy(kvm); + /* + * We are already at the end of life and kvm->lock is not taken. + * This is ok as the file descriptor is closed by now and nobody + * can mess with the pv state. + */ + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); + /* + * Remove the mmu notifier only when the whole KVM VM is torn down, + * and only if one was registered to begin with. If the VM is + * currently not protected, but has been previously been protected, + * then it's possible that the notifier is still registered. + */ + if (kvm->arch.pv.mmu_notifier.ops) + mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm); + + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) gmap_remove(kvm->arch.gmap); @@ -2599,28 +3517,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { if (!kvm_s390_use_sca_entries()) { - struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; return; } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); } read_unlock(&vcpu->kvm->arch.sca_lock); @@ -2649,15 +3569,20 @@ static int sca_switch_to_extended(struct kvm *kvm) struct bsca_block *old_sca = kvm->arch.sca; struct esca_block *new_sca; struct kvm_vcpu *vcpu; - unsigned int vcpu_idx; + unsigned long vcpu_idx; u32 scaol, scaoh; + phys_addr_t new_sca_phys; - new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO); + if (kvm->arch.use_esca) + return 0; + + new_sca = 
alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!new_sca) return -ENOMEM; - scaoh = (u32)((u64)(new_sca) >> 32); - scaol = (u32)(u64)(new_sca) & ~0x3fU; + new_sca_phys = virt_to_phys(new_sca); + scaoh = new_sca_phys >> 32; + scaol = new_sca_phys & ESCA_SCAOL_MASK; kvm_s390_vcpu_block_all(kvm); write_lock(&kvm->arch.sca_lock); @@ -2696,46 +3621,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (!sclp.has_esca || !sclp.has_64bscao) return false; - mutex_lock(&kvm->lock); rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); - mutex_unlock(&kvm->lock); return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | - KVM_SYNC_GPRS | - KVM_SYNC_ACRS | - KVM_SYNC_CRS | - KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; - kvm_s390_set_prefix(vcpu, 0); - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; - if (test_kvm_facility(vcpu->kvm, 82)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; - if (test_kvm_facility(vcpu->kvm, 133)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; - if (test_kvm_facility(vcpu->kvm, 156)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; - /* fprs can be synchronized via vrs, even if the guest has no vx. With - * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. - */ - if (MACHINE_HAS_VX) - vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; - else - vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; - - if (kvm_is_ucontrol(vcpu->kvm)) - return __kvm_ucontrol_vcpu_init(vcpu); - - return 0; -} - /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu) { @@ -2844,35 +3734,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) } -static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) -{ - /* this equals initial cpu reset in pop, but we don't switch to ESA */ - vcpu->arch.sie_block->gpsw.mask = 0UL; - vcpu->arch.sie_block->gpsw.addr = 0UL; - kvm_s390_set_prefix(vcpu, 0); - kvm_s390_set_cpu_timer(vcpu, 0); - vcpu->arch.sie_block->ckc = 0UL; - vcpu->arch.sie_block->todpr = 0; - memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64)); - vcpu->arch.sie_block->gcr[0] = CR0_UNUSED_56 | - CR0_INTERRUPT_KEY_SUBMASK | - CR0_MEASUREMENT_ALERT_SUBMASK; - vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 | - CR14_UNUSED_33 | - CR14_EXTERNAL_DAMAGE_SUBMASK; - /* make sure the new fpc will be lazily loaded */ - save_fpu_regs(); - current->thread.fpu.fpc = 0; - vcpu->arch.sie_block->gbea = 1; - vcpu->arch.sie_block->pp = 0; - vcpu->arch.sie_block->fpf &= ~FPF_BPBC; - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) - kvm_s390_vcpu_stop(vcpu); - kvm_s390_clear_local_irqs(vcpu); -} - void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { mutex_lock(&vcpu->kvm->lock); @@ -2941,15 +3802,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) { - free_page(vcpu->arch.sie_block->cbrlo); + free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo)); vcpu->arch.sie_block->cbrlo = 0; } int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) { - vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL); - if (!vcpu->arch.sie_block->cbrlo) + void *cbrlo_page = (void 
*)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!cbrlo_page) return -ENOMEM; + + vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page); return 0; } @@ -2959,12 +3823,13 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) - vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; + vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list); } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; + u16 uvrc, uvrrc; atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | @@ -2982,8 +3847,12 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= ECB_SRSI; + if (test_kvm_facility(vcpu->kvm, 11)) + vcpu->arch.sie_block->ecb |= ECB_PTF; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; @@ -3011,9 +3880,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); } - vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) - | SDNXC; - vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; + vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC; + vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb); if (sclp.has_kss) kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); @@ -3032,62 +3900,102 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + kvm_s390_vcpu_pci_setup(vcpu); + + mutex_lock(&vcpu->kvm->lock); + if (kvm_s390_pv_is_protected(vcpu->kvm)) { + rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); + if (rc) + kvm_s390_vcpu_unsetup_cmma(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); + return rc; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - struct kvm_vcpu *vcpu; - struct sie_page *sie_page; - int rc = -EINVAL; - if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) - goto out; - - rc = -ENOMEM; + return -EINVAL; + return 0; +} - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct sie_page *sie_page; + int rc; BUILD_BUG_ON(sizeof(struct sie_page) != 4096); - sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); + sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sie_page) - goto out_free_cpu; + return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; - vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); /* the real guest size will always be smaller than msl */ vcpu->arch.sie_block->mso = 0; vcpu->arch.sie_block->msl = sclp.hamax; - vcpu->arch.sie_block->icpua = id; + vcpu->arch.sie_block->icpua = vcpu->vcpu_id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; - if (vcpu->arch.sie_block->gd && sclp.has_gisaf) - vcpu->arch.sie_block->gd |= GISA_FORMAT1; + vcpu->arch.sie_block->gd = kvm_s390_get_gisa_desc(vcpu->kvm); seqcount_init(&vcpu->arch.cputm_seqcount); - rc = 
kvm_vcpu_init(vcpu, kvm, id); + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | + KVM_SYNC_GPRS | + KVM_SYNC_ACRS | + KVM_SYNC_CRS | + KVM_SYNC_ARCH0 | + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; + kvm_s390_set_prefix(vcpu, 0); + if (test_kvm_facility(vcpu->kvm, 64)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; + /* fprs can be synchronized via vrs, even if the guest has no vx. With + * cpu_has_vx(), (load|store)_fpu_regs() will work with vrs format. + */ + if (cpu_has_vx()) + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; + else + vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; + + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = __kvm_ucontrol_vcpu_init(vcpu); + if (rc) + goto out_free_sie_block; + } + + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + + rc = kvm_s390_vcpu_setup(vcpu); if (rc) - goto out_free_sie_block; - VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, - vcpu->arch.sie_block); - trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); + goto out_ucontrol_uninit; - return vcpu; + kvm_s390_update_topology_change_report(vcpu->kvm, 1); + return 0; + +out_ucontrol_uninit: + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_remove(vcpu->arch.gmap); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); -out_free_cpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(rc); + return rc; } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); return kvm_s390_vcpu_has_irq(vcpu, 0); } @@ -3139,7 +4047,7 @@ void exit_sie(struct kvm_vcpu *vcpu) /* Kick a guest cpu out of SIE to process a request synchronously */ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) { - kvm_make_request(req, vcpu); + __kvm_make_request(req, vcpu); kvm_s390_vcpu_request(vcpu); } @@ -3149,7 +4057,9 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; unsigned long prefix; - int i; + unsigned long i; + + trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); if (gmap_is_shadow(gmap)) return; @@ -3162,7 +4072,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", start, end); - kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } } } @@ -3171,7 +4081,7 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >= - halt_poll_max_steal) { + READ_ONCE(halt_poll_max_steal)) { vcpu->stat.halt_no_poll_steal++; return true; } @@ -3287,10 +4197,76 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, return r; } -static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +static void kvm_arch_vcpu_ioctl_normal_reset(struct kvm_vcpu *vcpu) { - kvm_s390_vcpu_initial_reset(vcpu); - return 0; + 
vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_RI; + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + memset(vcpu->run->s.regs.riccb, 0, sizeof(vcpu->run->s.regs.riccb)); + + kvm_clear_async_pf_completion_queue(vcpu); + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) + kvm_s390_vcpu_stop(vcpu); + kvm_s390_clear_local_irqs(vcpu); +} + +static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +{ + /* Initial reset is a superset of the normal reset */ + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + + /* + * This equals initial cpu reset in pop, but we don't switch to ESA. + * We do not only reset the internal data, but also ... + */ + vcpu->arch.sie_block->gpsw.mask = 0; + vcpu->arch.sie_block->gpsw.addr = 0; + kvm_s390_set_prefix(vcpu, 0); + kvm_s390_set_cpu_timer(vcpu, 0); + vcpu->arch.sie_block->ckc = 0; + memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); + vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; + vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; + + /* ... the data in sync regs */ + memset(vcpu->run->s.regs.crs, 0, sizeof(vcpu->run->s.regs.crs)); + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.crs[0] = CR0_INITIAL_MASK; + vcpu->run->s.regs.crs[14] = CR14_INITIAL_MASK; + vcpu->run->psw_addr = 0; + vcpu->run->psw_mask = 0; + vcpu->run->s.regs.todpr = 0; + vcpu->run->s.regs.cputm = 0; + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.pp = 0; + vcpu->run->s.regs.gbea = 1; + vcpu->run->s.regs.fpc = 0; + /* + * Do not reset these registers in the protected case, as some of + * them are overlaid and they are not accessible in this case + * anyway. + */ + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->todpr = 0; + } +} + +static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Clear reset is a superset of the initial reset */ + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + + memset(®s->gprs, 0, sizeof(regs->gprs)); + memset(®s->vrs, 0, sizeof(regs->vrs)); + memset(®s->acrs, 0, sizeof(regs->acrs)); + memset(®s->gscb, 0, sizeof(regs->gscb)); + + regs->etoken = 0; + regs->etoken_extension = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -3339,18 +4315,13 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - if (test_fp_ctl(fpu->fpc)) { - ret = -EINVAL; - goto out; - } vcpu->run->s.regs.fpc = fpu->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); else memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); -out: vcpu_put(vcpu); return ret; } @@ -3359,9 +4330,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { vcpu_load(vcpu); - /* make sure we have the latest values */ - save_fpu_regs(); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp((freg_t *) fpu->fprs, (__vector128 *) vcpu->run->s.regs.vrs); else @@ -3460,18 +4429,24 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); /* user space knows about this interface - let it control the state */ - vcpu->kvm->arch.user_cpu_state_ctrl = 1; + kvm_s390_set_user_cpu_state_ctrl(vcpu->kvm); switch (mp_state->mp_state) { case KVM_MP_STATE_STOPPED: - kvm_s390_vcpu_stop(vcpu); + rc = kvm_s390_vcpu_stop(vcpu); break; case KVM_MP_STATE_OPERATING: - kvm_s390_vcpu_start(vcpu); + rc = 
kvm_s390_vcpu_start(vcpu); break; case KVM_MP_STATE_LOAD: + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + rc = -ENXIO; + break; + } + rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD); + break; case KVM_MP_STATE_CHECK_STOP: - /* fall through - CHECK_STOP and LOAD are not supported yet */ + fallthrough; /* CHECK_STOP and LOAD are not supported yet */ default: rc = -ENXIO; } @@ -3492,19 +4467,19 @@ retry: if (!kvm_request_pending(vcpu)) return 0; /* - * We use MMU_RELOAD just to re-arm the ipte notifier for the + * If the guest prefix changed, re-arm the ipte notifier for the * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { + if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; rc = gmap_mprotect_notify(vcpu->arch.gmap, kvm_s390_get_prefix(vcpu), PAGE_SIZE * 2, PROT_WRITE); if (rc) { - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; } goto retry; @@ -3557,30 +4532,26 @@ retry: goto retry; } - /* nothing to do, just clear the request */ - kvm_clear_request(KVM_REQ_UNHALT, vcpu); /* we left the vsie handler, nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu); return 0; } -void kvm_s390_set_tod_clock(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod) +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) { struct kvm_vcpu *vcpu; - struct kvm_s390_tod_clock_ext htod; - int i; + union tod_clock clk; + unsigned long i; - mutex_lock(&kvm->lock); preempt_disable(); - get_tod_clock_ext((char *)&htod); + store_tod_clock_ext(&clk); - kvm->arch.epoch = gtod->tod - htod.tod; + kvm->arch.epoch = gtod->tod - clk.tod; kvm->arch.epdx = 0; if (test_kvm_facility(kvm, 139)) { - kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx; + kvm->arch.epdx = gtod->epoch_idx - clk.ei; if (kvm->arch.epoch > gtod->tod) kvm->arch.epdx -= 1; } @@ -3593,7 +4564,15 @@ void kvm_s390_set_tod_clock(struct kvm *kvm, kvm_s390_vcpu_unblock_all(kvm); preempt_enable(); +} + +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) +{ + if (!mutex_trylock(&kvm->lock)) + return 0; + __kvm_s390_set_tod_clock(kvm, gtod); mutex_unlock(&kvm->lock); + return 1; } /** @@ -3629,11 +4608,13 @@ static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, } } -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token); __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token); + + return true; } void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, @@ -3649,7 +4630,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, /* s390 will always inject the page directly */ } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { /* * s390 will always inject the page directly, @@ -3658,33 +4639,31 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) return true; } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct 
kvm_arch_async_pf arch; - int rc; if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) - return 0; + return false; if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != vcpu->arch.pfault_compare) - return 0; + return false; if (psw_extint_disabled(vcpu)) - return 0; + return false; if (kvm_s390_vcpu_has_irq(vcpu, 0)) - return 0; + return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) - return 0; + return false; if (!vcpu->arch.gmap->pfault_enabled) - return 0; + return false; hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); hva += current->thread.gmap_addr & ~PAGE_MASK; if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) - return 0; + return false; - rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); - return rc; + return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); } static int vcpu_pre_run(struct kvm_vcpu *vcpu) @@ -3704,12 +4683,9 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) if (need_resched()) schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) return rc; } @@ -3722,7 +4698,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); @@ -3816,27 +4792,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) current->thread.gmap_pfault = 0; if (kvm_arch_setup_async_pf(vcpu)) return 0; + vcpu->stat.pfault_sync++; return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1); } return vcpu_post_run_fault_in_sie(vcpu); } +#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { int rc, exit_reason; + struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; /* * We try to hold kvm->srcu during most of vcpu_run (except when run- * ning the guest), so that memslots (and other stuff) are protected */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); do { rc = vcpu_pre_run(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) break; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between * guest_enter and guest_exit should be no uaccess. @@ -3845,23 +4824,46 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sie_page->pv_grregs, + vcpu->run->s.regs.gprs, + sizeof(sie_page->pv_grregs)); + } + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(vcpu->run->s.regs.gprs, + sie_page->pv_grregs, + sizeof(sie_page->pv_grregs)); + /* + * We're not allowed to inject interrupts on intercepts + * that leave the guest state in an "in-between" state + * where the next SIE entry will do a continuation. + * Fence interrupts in our "internal" PSW. 
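For reference, the three interrupt classes fenced here correspond to individual system-mask bits of the guest PSW; the values below follow the architecture (and asm/ptrace.h) but are reproduced purely for illustration — PSW_INT_MASK introduced above is simply their union:

#define PSW_MASK_IO	0x0200000000000000UL	/* PSW bit 6:  I/O interruptions */
#define PSW_MASK_EXT	0x0100000000000000UL	/* PSW bit 7:  external interruptions */
#define PSW_MASK_MCHECK	0x0004000000000000UL	/* PSW bit 13: machine checks */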
+ */ + if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR || + vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) { + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + } + } local_irq_disable(); __enable_cpu_timer_accounting(vcpu); guest_exit_irqoff(); local_irq_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = vcpu_post_run(vcpu, exit_reason); } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return rc; } -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; struct runtime_instr_cb *riccb; struct gs_cb *gscb; @@ -3869,16 +4871,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; - if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) - kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); - if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { - memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); - /* some control register changes require a tlb flush */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { - kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); - vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr; vcpu->arch.sie_block->pp = kvm_run->s.regs.pp; vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea; @@ -3890,6 +4883,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + VCPU_EVENT(vcpu, 3, "setting cpnc to %d", vcpu->arch.diag318_info.cpnc); + } /* * If userspace sets the riccb (e.g. after migration) to a valid state, * we should enable RI here instead of doing the lazy enablement. @@ -3919,23 +4917,9 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? 
FPF_BPBC : 0; } - save_access_regs(vcpu->arch.host_acrs); - restore_access_regs(vcpu->run->s.regs.acrs); - /* save host (userspace) fprs/vrs */ - save_fpu_regs(); - vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; - vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; - if (MACHINE_HAS_VX) - current->thread.fpu.regs = vcpu->run->s.regs.vrs; - else - current->thread.fpu.regs = vcpu->run->s.regs.fprs; - current->thread.fpu.fpc = vcpu->run->s.regs.fpc; - if (test_fp_ctl(current->thread.fpu.fpc)) - /* User space provided an invalid FPC, let's clear it */ - current->thread.fpu.fpc = 0; if (MACHINE_HAS_GS) { preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); if (current->thread.gs_cb) { vcpu->arch.host_gscb = current->thread.gs_cb; save_gs_cb(vcpu->arch.host_gscb); @@ -3948,25 +4932,93 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) preempt_enable(); } /* SIE will load etoken directly from SDNX and therefore kvm_run */ +} + +static void sync_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + /* some control register changes require a tlb flush */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); + vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; + } + save_access_regs(vcpu->arch.host_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); + /* save host (userspace) fprs/vrs */ + save_fpu_regs(); + vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; + vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; + if (cpu_has_vx()) + current->thread.fpu.regs = vcpu->run->s.regs.vrs; + else + current->thread.fpu.regs = vcpu->run->s.regs.fprs; + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; + + /* Sync fmt2 only data */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { + sync_regs_fmt2(vcpu); + } else { + /* + * In several places we have to modify our internal view to + * not do things that are disallowed by the ultravisor. For + * example we must not inject interrupts after specific exits + * (e.g. 112 prefix page not secure). We do this by turning + * off the machine check, external and I/O interrupt bits + * of our PSW copy. To avoid getting validity intercepts, we + * do only accept the condition code from userspace. 
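On the userspace side this synchronization is driven through the shared kvm_run page: registers are written into run->s.regs and flagged in kvm_dirty_regs before KVM_RUN, and sync_regs()/store_regs() move them into and out of the SIE block. A rough fragment (kvm_fd, vcpu_fd and the values are placeholders; needs the usual <linux/kvm.h>, <sys/mman.h> includes):

struct kvm_run *run = mmap(NULL, ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0),
			   PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0);

/* hand a modified prefix and a guest register back to the kernel */
run->s.regs.prefix = new_prefix;
run->s.regs.gprs[2] = 0x2000;
run->kvm_dirty_regs |= KVM_SYNC_PREFIX | KVM_SYNC_GPRS;

ioctl(vcpu_fd, KVM_RUN, 0);
/* kvm_dirty_regs is cleared by sync_regs() once consumed */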
+ */ + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC; + vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask & + PSW_MASK_CC; + } kvm_run->kvm_dirty_regs = 0; } -static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void store_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; + kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; + kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; + if (MACHINE_HAS_GS) { + preempt_disable(); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); + if (!vcpu->arch.host_gscb) + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); + vcpu->arch.host_gscb = NULL; + preempt_enable(); + } + /* SIE will save etoken directly into SDNX and therefore kvm_run */ +} + +static void store_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu); kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc; - kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; - kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; - kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; - kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); /* Save guest register state */ @@ -3975,25 +5027,24 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Restore will be done lazily at return */ current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; - if (MACHINE_HAS_GS) { - __ctl_set_bit(2, 4); - if (vcpu->arch.gs_enabled) - save_gs_cb(current->thread.gs_cb); - preempt_disable(); - current->thread.gs_cb = vcpu->arch.host_gscb; - restore_gs_cb(vcpu->arch.host_gscb); - preempt_enable(); - if (!vcpu->arch.host_gscb) - __ctl_clear_bit(2, 4); - vcpu->arch.host_gscb = NULL; - } - /* SIE will save etoken directly into SDNX and therefore kvm_run */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) + store_regs_fmt2(vcpu); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int rc; + /* + * Running a VM while dumping always has the potential to + * produce inconsistent dump data. But for PV vcpus a SIE + * entry while dumping could also lead to a fatal validity + * intercept which we absolutely want to avoid. 
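This also shapes the userspace run loop: both a pending signal and the immediate_exit fast path surface as EINTR, while a VM in dump state is rejected with EINVAL outright, per the check below. A trimmed sketch of the usual loop (error handling reduced to err() from <err.h>):

for (;;) {
	if (ioctl(vcpu_fd, KVM_RUN, 0) == -1) {
		if (errno == EINTR)
			continue;	/* interrupted by a signal; retry */
		err(1, "KVM_RUN");	/* includes EINVAL while dumping */
	}

	switch (run->exit_reason) {
	case KVM_EXIT_S390_SIEIC:
		/* intercept needs userspace handling */
		break;
	default:
		break;
	}
}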
+ */ + if (vcpu->kvm->arch.pv.dumping) + return -EINVAL; + if (kvm_run->immediate_exit) return -EINTR; @@ -4011,6 +5062,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_sigset_activate(vcpu); + /* + * no need to check the return value of vcpu_start as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); } else if (is_vcpu_stopped(vcpu)) { @@ -4020,7 +5075,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - sync_regs(vcpu, kvm_run); + sync_regs(vcpu); enable_cpu_timer_accounting(vcpu); might_fault(); @@ -4042,7 +5097,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } disable_cpu_timer_accounting(vcpu); - store_regs(vcpu, kvm_run); + store_regs(vcpu); kvm_sigset_deactivate(vcpu); @@ -4079,7 +5134,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) gpa -= __LC_FPREGS_SAVE_AREA; /* manually convert vector registers if necessary */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, fprs, 128); @@ -4132,7 +5187,7 @@ static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) static void __disable_ibs_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -4148,20 +5203,29 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; if (!is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + /* Let's tell the UV that we want to change into the operating state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) + if (!is_vcpu_stopped(kvm_get_vcpu(vcpu->kvm, i))) started_vcpus++; } @@ -4172,44 +5236,67 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) /* * As we are starting a second VCPU, we have to disable * the IBS facility on all VCPUs to remove potentially - * oustanding ENABLE requests. + * outstanding ENABLE requests. */ __disable_ibs_on_all_vcpus(vcpu->kvm); } kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED); /* + * The real PSW might have changed due to a RESTART interpreted by the + * ultravisor. We block all interrupts and let the next sie exit + * refresh our view. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + /* * Another VCPU might have used IBS while we were offline. * Let's play safe and flush the VCPU at startup. 
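Userspace reaches these start/stop primitives through the KVM_SET_MP_STATE vcpu ioctl, which is also where the error codes introduced above become visible. A minimal sketch:

struct kvm_mp_state state = { .mp_state = KVM_MP_STATE_OPERATING };

/* for a protected vcpu this can now fail if the Ultravisor refuses
 * the transition, see kvm_s390_pv_set_cpu_state() above */
if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &state) == -1)
	err(1, "KVM_SET_MP_STATE");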
*/ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; struct kvm_vcpu *started_vcpu = NULL; if (is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); - /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ - kvm_s390_clear_stop_irq(vcpu); + /* Let's tell the UV that we want to change into the stopped state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + /* + * Set the VCPU to STOPPED and THEN clear the interrupt flag, + * now that the SIGP STOP and SIGP STOP AND STORE STATUS orders + * have been fully processed. This will ensure that the VCPU + * is kept BUSY if another VCPU is inquiring with SIGP SENSE. + */ kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED); + kvm_s390_clear_stop_irq(vcpu); + __disable_ibs_on_vcpu(vcpu); for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) { + struct kvm_vcpu *tmp = kvm_get_vcpu(vcpu->kvm, i); + + if (!is_vcpu_stopped(tmp)) { started_vcpus++; - started_vcpu = vcpu->kvm->vcpus[i]; + started_vcpu = tmp; } } @@ -4222,7 +5309,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) } spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, @@ -4249,64 +5336,116 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, +static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; - void *tmpbuf = NULL; - int r, srcu_idx; - const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION - | KVM_S390_MEMOP_F_CHECK_ONLY; + void *sida_addr; + int r = 0; - if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) + if (mop->flags || !mop->size) return -EINVAL; - - if (mop->size > MEM_OP_MAX_SIZE) + if (mop->size + mop->sida_offset < mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) return -E2BIG; + if (!kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + + sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset; + switch (mop->op) { + case KVM_S390_MEMOP_SIDA_READ: + if (copy_to_user(uaddr, sida_addr, mop->size)) + r = -EFAULT; + + break; + case KVM_S390_MEMOP_SIDA_WRITE: + if (copy_from_user(sida_addr, uaddr, mop->size)) + r = -EFAULT; + break; + } + return r; +} + +static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | + KVM_S390_MEMOP_F_CHECK_ONLY | + KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + if (mop->ar >= NUM_ACRS) + return -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } + acc_mode = mop->op 
== KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + acc_mode, mop->key); + goto out_inject; + } + if (acc_mode == GACC_FETCH) { + r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + if (r) + goto out_inject; + if (copy_to_user(uaddr, tmpbuf, mop->size)) { + r = -EFAULT; + goto out_free; + } + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_free; + } + r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + } + +out_inject: + if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) + kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + +out_free: + vfree(tmpbuf); + return r; +} + +static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + int r, srcu_idx; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_FETCH); - break; - } - r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); - if (r == 0) { - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } - break; case KVM_S390_MEMOP_LOGICAL_WRITE: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_STORE); - break; - } - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - break; - } - r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + r = kvm_s390_vcpu_mem_op(vcpu, mop); + break; + case KVM_S390_MEMOP_SIDA_READ: + case KVM_S390_MEMOP_SIDA_WRITE: + /* we are locked against sida going away by the vcpu->mutex */ + r = kvm_s390_vcpu_sida_op(vcpu, mop); break; default: r = -EINVAL; } srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - - if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) - kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); - - vfree(tmpbuf); return r; } @@ -4315,6 +5454,7 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; + int rc; switch (ioctl) { case KVM_S390_IRQ: { @@ -4322,7 +5462,8 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, if (copy_from_user(&s390irq, argp, sizeof(s390irq))) return -EFAULT; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; @@ -4332,10 +5473,67 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, return -EFAULT; if (s390int_to_s390irq(&s390int, &s390irq)) return -EINVAL; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } + default: + rc = -ENOIOCTLCMD; + break; } - return -ENOIOCTLCMD; + + /* + * To simplify single stepping of userspace-emulated instructions, + * KVM_EXIT_S390_SIEIC exit sets KVM_GUESTDBG_EXIT_PENDING (see + * should_handle_per_ifetch()). However, if userspace emulation injects + * an interrupt, it needs to be cleared, so that KVM_EXIT_DEBUG happens + * after (and not before) the interrupt delivery. 
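The injection this refers to arrives through the KVM_S390_IRQ vcpu ioctl handled above; a minimal userspace sketch, with the restart order chosen arbitrarily for illustration:

struct kvm_s390_irq irq = {
	.type = KVM_S390_RESTART,
};

/* asynchronous: may be issued while another thread sits in KVM_RUN */
if (ioctl(vcpu_fd, KVM_S390_IRQ, &irq) == -1)
	err(1, "KVM_S390_IRQ");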
+ */ + if (!rc) + vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; + + return rc; +} + +static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu, + struct kvm_pv_cmd *cmd) +{ + struct kvm_s390_pv_dmp dmp; + void *data; + int ret; + + /* Dump initialization is a prerequisite */ + if (!vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp))) + return -EFAULT; + + /* We only handle this subcmd right now */ + if (dmp.subcmd != KVM_PV_DUMP_CPU) + return -EINVAL; + + /* CPU dump length is the same as create cpu storage donation. */ + if (dmp.buff_len != uv_info.guest_cpu_stor_len) + return -EINVAL; + + data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc); + + VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x", + vcpu->vcpu_id, cmd->rc, cmd->rrc); + + if (ret) + ret = -EINVAL; + + /* On success copy over the dump data */ + if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len)) + ret = -EFAULT; + + kvfree(data); + return ret; } long kvm_arch_vcpu_ioctl(struct file *filp, @@ -4345,13 +5543,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int idx; long r; + u16 rc, rrc; vcpu_load(vcpu); switch (ioctl) { case KVM_S390_STORE_STATUS: idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_s390_vcpu_store_status(vcpu, arg); + r = kvm_s390_store_status_unloaded(vcpu, arg); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case KVM_S390_SET_INITIAL_PSW: { @@ -4363,12 +5562,43 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); break; } + case KVM_S390_CLEAR_RESET: + r = 0; + kvm_arch_vcpu_ioctl_clear_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x", + rc, rrc); + } + break; case KVM_S390_INITIAL_RESET: - r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); + r = 0; + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_INITIAL, + &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x", + rc, rrc); + } + break; + case KVM_S390_NORMAL_RESET: + r = 0; + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + break; r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) break; @@ -4431,7 +5661,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_s390_mem_op mem_op; if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) - r = kvm_s390_guest_mem_op(vcpu, &mem_op); + r = kvm_s390_vcpu_memsida_op(vcpu, &mem_op); else r = -EFAULT; break; @@ -4470,6 +5700,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, irq_state.len); break; } + case KVM_S390_PV_CPU_COMMAND: { + struct kvm_pv_cmd cmd; + + r = -EINVAL; + if (!is_prot_virt_host()) + break; + + r = -EFAULT; + if (copy_from_user(&cmd, argp, sizeof(cmd))) + break; + + r = -EINVAL; + if (cmd.flags) + break; + + /* We only handle this cmd right now */ + if (cmd.cmd != 
KVM_PV_DUMP) + break; + + r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd); + + /* Always copy over UV rc / rrc data */ + if (copy_to_user((__u8 __user *)argp, &cmd.rc, + sizeof(cmd.rc) + sizeof(cmd.rrc))) + r = -EFAULT; + break; + } default: r = -ENOTTY; } @@ -4491,38 +5748,63 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) { - return 0; + return true; } /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) { - /* A few sanity checks. We can have memory slots which have to be - located/ended at a segment boundary (1MB). The memory in userland is - ok to be fragmented into various different vmas. It is okay to mmap() - and munmap() stuff in this slot after doing this call at any time */ + gpa_t size; - if (mem->userspace_addr & 0xffffful) + /* When we are protected, we should not change the memory slots */ + if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; - if (mem->memory_size & 0xffffful) - return -EINVAL; + if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { + /* + * A few sanity checks. We can have memory slots which have to be + * located/ended at a segment boundary (1MB). The memory in userland is + * ok to be fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any time + */ - if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit) - return -EINVAL; + if (new->userspace_addr & 0xffffful) + return -EINVAL; + + size = new->npages * PAGE_SIZE; + if (size & 0xffffful) + return -EINVAL; + + if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) + return -EINVAL; + } + + if (!kvm->arch.migration_mode) + return 0; + + /* + * Turn off migration mode when: + * - userspace creates a new memslot with dirty logging off, + * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and + * dirty logging is turned off. + * Migration mode expects dirty page logging being enabled to store + * its dirty bitmap. 
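Both constraints meet in the userspace slot setup: the size and the userspace address must be aligned to the 1 MB segment size (the 0xfffff masks above), and keeping migration mode alive requires the slot to have dirty logging enabled. A sketch, assuming mem points to a buffer that is itself segment aligned:

#define SEG_SIZE (1UL << 20)	/* 1 MB segment */

struct kvm_userspace_memory_region region = {
	.slot = 0,
	.flags = KVM_MEM_LOG_DIRTY_PAGES,	/* keeps migration mode on */
	.guest_phys_addr = 0,
	.memory_size = 256 * SEG_SIZE,
	.userspace_addr = (uint64_t)(uintptr_t)mem,	/* segment aligned */
};

if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) == -1)
	err(1, "KVM_SET_USER_MEMORY_REGION");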
+ */ + if (change != KVM_MR_DELETE && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + WARN(kvm_s390_vm_stop_migration(kvm), + "Failed to stop migration mode"); return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -4538,10 +5820,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, old->npages * PAGE_SIZE); if (rc) break; - /* FALLTHROUGH */ + fallthrough; case KVM_MR_CREATE: - rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, - mem->guest_phys_addr, mem->memory_size); + rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, + new->base_gfn * PAGE_SIZE, + new->npages * PAGE_SIZE); break; case KVM_MR_FLAGS_ONLY: break; @@ -4560,14 +5843,9 @@ static inline unsigned long nonhyp_mask(int i) return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); } -void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) -{ - vcpu->valid_wakeup = false; -} - static int __init kvm_s390_init(void) { - int i; + int i, r; if (!sclp.has_sief2) { pr_info("SIE is not available\n"); @@ -4581,14 +5859,25 @@ static int __init kvm_s390_init(void) for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= - S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i); + stfle_fac_list[i] & nonhyp_mask(i); + + r = __kvm_s390_init(); + if (r) + return r; - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) { + __kvm_s390_exit(); + return r; + } + return 0; } static void __exit kvm_s390_exit(void) { kvm_exit(); + + __kvm_s390_exit(); } module_init(kvm_s390_init); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 6d9448dbd052..a7ea80cfa445 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -2,7 +2,7 @@ /* * definition for kvm on s390 * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -15,6 +15,7 @@ #include <linux/hrtimer.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/lockdep.h> #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> @@ -22,9 +23,21 @@ /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 -#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +#define IS_ITDB_VALID(vcpu) \ + ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; +extern debug_info_t *kvm_s390_dbf_uv; + +#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \ + d_args); \ + debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \ + "%d: " d_string "\n", (d_kvm)->userspace_pid, \ + d_args); \ +} while (0) + #define KVM_EVENT(d_loglevel, d_string, d_args...)\ do { \ debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ @@ -67,7 +80,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -93,7 +106,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) prefix); vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) @@ -196,6 +209,67 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm) +{ + if (kvm->arch.user_cpu_state_ctrl) + return; + + VM_EVENT(kvm, 3, "%s", "ENABLE: Userspace CPU state control"); + kvm->arch.user_cpu_state_ctrl = 1; +} + +/* get the end gfn of the last (highest gfn) memslot */ +static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) +{ + struct rb_node *node; + struct kvm_memory_slot *ms; + + if (WARN_ON(kvm_memslots_empty(slots))) + return 0; + + node = rb_last(&slots->gfn_tree); + ms = container_of(node, struct kvm_memory_slot, gfn_node[slots->node_idx]); + return ms->base_gfn + ms->npages; +} + +static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) +{ + u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); + + if (gd && sclp.has_gisaf) + gd |= GISA_FORMAT1; + return gd; +} + +/* implemented in pv.c */ +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc); +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 
state); +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc); + +static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) +{ + return kvm->arch.pv.handle; +} + +static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv.handle; +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -281,13 +355,12 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ -void kvm_s390_set_tod_clock(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod); +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); @@ -297,13 +370,14 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; WARN_ON(!mutex_is_locked(&kvm->lock)); @@ -313,7 +387,7 @@ static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -373,6 +447,7 @@ void kvm_s390_destroy_adapters(struct kvm *kvm); int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu); extern struct kvm_device_ops kvm_flic_ops; int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu); +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu); void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu); int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, void __user *buf, int len); @@ -381,7 +456,9 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, void kvm_s390_gisa_init(struct kvm *kvm); void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); -int kvm_s390_gib_init(u8 nisc); +void kvm_s390_gisa_disable(struct kvm *kvm); +void kvm_s390_gisa_enable(struct kvm *kvm); +int __init kvm_s390_gib_init(u8 nisc); void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ @@ -426,4 +503,22 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, * @kvm: the KVM guest */ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); + +/** + * kvm_s390_vcpu_pci_enable_interp + * + * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store + * interpretation as well as adapter interruption forwarding. 
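The two hunks above also switch the vCPU iteration index from int to unsigned long, matching the kvm_for_each_vcpu() iterator. A minimal caller sketch following the same pattern (illustrative only; the function name is invented, kvm_s390_vcpu_wakeup() is declared earlier in this header):

/* Illustrative only: iterate vCPUs with an unsigned long index. */
static void example_wake_all_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_s390_vcpu_wakeup(vcpu);
}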
+ * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm); + +/** + * diag9c_forwarding_hz + * + * Set the maximum number of diag9c forwarding operations per second + */ +extern unsigned int diag9c_forwarding_hz; + #endif diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c new file mode 100644 index 000000000000..ffa7739c7a28 --- /dev/null +++ b/arch/s390/kvm/pci.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <asm/pci.h> +#include <asm/pci_insn.h> +#include <asm/pci_io.h> +#include <asm/sclp.h> +#include "pci.h" +#include "kvm-s390.h" + +struct zpci_aift *aift; + +static inline int __set_irq_noiib(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return zpci_set_irq_ctrl(ctl, isc, &iib); +} + +void kvm_s390_pci_aen_exit(void) +{ + unsigned long flags; + struct kvm_zdev **gait_kzdev; + + lockdep_assert_held(&aift->aift_lock); + + /* + * Contents of the aipb remain registered for the life of the host + * kernel; the information is preserved in zpci_aipb and zpci_aif_sbv + * in case the KVM module is inserted again later. Clear the AIFT + * information and free anything not registered with underlying + * firmware. + */ + spin_lock_irqsave(&aift->gait_lock, flags); + gait_kzdev = aift->kzdev; + aift->gait = NULL; + aift->sbv = NULL; + aift->kzdev = NULL; + spin_unlock_irqrestore(&aift->gait_lock, flags); + + kfree(gait_kzdev); +} + +static int zpci_setup_aipb(u8 nisc) +{ + struct page *page; + int size, rc; + + zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + if (!zpci_aipb) + return -ENOMEM; + + aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); + if (!aift->sbv) { + rc = -ENOMEM; + goto free_aipb; + } + zpci_aif_sbv = aift->sbv; + size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES * + sizeof(struct zpci_gaite))); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size); + if (!page) { + rc = -ENOMEM; + goto free_sbv; + } + aift->gait = (struct zpci_gaite *)page_to_virt(page); + + zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector); + zpci_aipb->aipb.gait = virt_to_phys(aift->gait); + zpci_aipb->aipb.afi = nisc; + zpci_aipb->aipb.faal = ZPCI_NR_DEVICES; + + /* Setup Adapter Event Notification Interpretation */ + if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) { + rc = -EIO; + goto free_gait; + } + + return 0; + +free_gait: + free_pages((unsigned long)aift->gait, size); +free_sbv: + airq_iv_release(aift->sbv); + zpci_aif_sbv = NULL; +free_aipb: + kfree(zpci_aipb); + zpci_aipb = NULL; + + return rc; +} + +static int zpci_reset_aipb(u8 nisc) +{ + /* + * AEN registration can only happen once per system boot. If + * an aipb already exists, then AEN was already registered and + * we can re-use the aipb contents. This can only happen if + * the KVM module was removed and re-inserted. However, we must + * ensure that the same forwarding ISC is used, as this is assigned + * during KVM module load. 
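To make the setup/teardown pairing concrete, here is a hypothetical module-lifecycle sketch (not part of the patch) built on kvm_s390_pci_aen_init(), defined just below, and kvm_s390_pci_aen_exit() above; note the exit path expects aift_lock to be held, per the lockdep assertion:

/* Hypothetical lifecycle sketch; function name invented. */
static int example_aen_lifecycle(u8 nisc)
{
	int rc;

	rc = kvm_s390_pci_aen_init(nisc);	/* first load sets up the aipb */
	if (rc)
		return rc;

	/* ... forward adapter events while the module is loaded ... */

	mutex_lock(&aift->aift_lock);
	kvm_s390_pci_aen_exit();	/* aipb stays registered for a reload */
	mutex_unlock(&aift->aift_lock);
	return 0;
}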
+ */ + if (zpci_aipb->aipb.afi != nisc) + return -EINVAL; + + aift->sbv = zpci_aif_sbv; + aift->gait = phys_to_virt(zpci_aipb->aipb.gait); + + return 0; +} + +int kvm_s390_pci_aen_init(u8 nisc) +{ + int rc = 0; + + /* If already enabled for AEN, bail out now */ + if (aift->gait || aift->sbv) + return -EPERM; + + mutex_lock(&aift->aift_lock); + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), + GFP_KERNEL); + if (!aift->kzdev) { + rc = -ENOMEM; + goto unlock; + } + + if (!zpci_aipb) + rc = zpci_setup_aipb(nisc); + else + rc = zpci_reset_aipb(nisc); + if (rc) + goto free_zdev; + + /* Enable floating IRQs */ + if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) { + rc = -EIO; + kvm_s390_pci_aen_exit(); + } + + goto unlock; + +free_zdev: + kfree(aift->kzdev); +unlock: + mutex_unlock(&aift->aift_lock); + return rc; +} + +/* Modify PCI: Register floating adapter interruption forwarding */ +static int kvm_zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {}; + u8 status; + + fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruption forwarding */ +static int kvm_zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? 
-EIO : 0; +} + +static inline void unaccount_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + + if (user) + atomic_long_sub(nr_pages, &user->locked_vm); + if (current->mm) + atomic64_sub(nr_pages, ¤t->mm->pinned_vm); +} + +static inline int account_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + unsigned long page_limit, cur_pages, new_pages; + + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + atomic64_add(nr_pages, ¤t->mm->pinned_vm); + + return 0; +} + +static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, + bool assist) +{ + struct page *pages[1], *aibv_page, *aisb_page = NULL; + unsigned int msi_vecs, idx; + struct zpci_gaite *gaite; + unsigned long hva, bit; + struct kvm *kvm; + phys_addr_t gaddr; + int rc = 0, gisc, npages, pcount = 0; + + /* + * Interrupt forwarding is only applicable if the device is already + * enabled for interpretation + */ + if (zdev->gisa == 0) + return -EINVAL; + + kvm = zdev->kzdev->kvm; + msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi); + + /* Get the associated forwarding ISC - if invalid, return the error */ + gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc); + if (gisc < 0) + return gisc; + + /* Replace AIBV address */ + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto out; + } + aibv_page = pages[0]; + pcount++; + gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK); + fib->fmt0.aibv = gaddr; + + /* Pin the guest AISB if one was specified */ + if (fib->fmt0.sum == 1) { + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, + pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto unpin1; + } + aisb_page = pages[0]; + pcount++; + } + + /* Account for pinned pages, roll back on failure */ + if (account_mem(pcount)) + goto unpin2; + + /* AISB must be allocated before we can fill in GAITE */ + mutex_lock(&aift->aift_lock); + bit = airq_iv_alloc_bit(aift->sbv); + if (bit == -1UL) + goto unlock; + zdev->aisb = bit; /* store the summary bit number */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | + AIRQ_IV_BITLOCK | + AIRQ_IV_GUESTVEC, + phys_to_virt(fib->fmt0.aibv)); + + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + + /* If assist not requested, host will get all alerts */ + if (assist) + gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + else + gaite->gisa = 0; + + gaite->gisc = fib->fmt0.isc; + gaite->count++; + gaite->aisbo = fib->fmt0.aisbo; + gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb & + ~PAGE_MASK)); + aift->kzdev[zdev->aisb] = zdev->kzdev; + spin_unlock_irq(&aift->gait_lock); + + /* Update guest FIB for re-issue */ + fib->fmt0.aisbo = zdev->aisb & 63; + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.isc = gisc; + + /* Save some guest fib values in the host for later use */ + zdev->kzdev->fib.fmt0.isc = 
fib->fmt0.isc; + zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv; + mutex_unlock(&aift->aift_lock); + + /* Issue the clp to setup the irq now */ + rc = kvm_zpci_set_airq(zdev); + return rc; + +unlock: + mutex_unlock(&aift->aift_lock); +unpin2: + if (fib->fmt0.sum == 1) + unpin_user_page(aisb_page); +unpin1: + unpin_user_page(aibv_page); +out: + return rc; +} + +static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) +{ + struct kvm_zdev *kzdev = zdev->kzdev; + struct zpci_gaite *gaite; + struct page *vpage = NULL, *spage = NULL; + int rc, pcount = 0; + u8 isc; + + if (zdev->gisa == 0) + return -EINVAL; + + mutex_lock(&aift->aift_lock); + + /* + * If the clear fails due to an error, leave now unless we know this + * device is about to go away (force) -- In that case clear the GAITE + * regardless. + */ + rc = kvm_zpci_clear_airq(zdev); + if (rc && !force) + goto out; + + if (zdev->kzdev->fib.fmt0.aibv == 0) + goto out; + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + isc = gaite->gisc; + gaite->count--; + if (gaite->count == 0) { + /* Release guest AIBV and AISB */ + vpage = phys_to_page(kzdev->fib.fmt0.aibv); + if (gaite->aisb != 0) + spage = phys_to_page(gaite->aisb); + /* Clear the GAIT entry */ + gaite->aisb = 0; + gaite->gisc = 0; + gaite->aisbo = 0; + gaite->gisa = 0; + aift->kzdev[zdev->aisb] = NULL; + /* Clear zdev info */ + airq_iv_free_bit(aift->sbv, zdev->aisb); + airq_iv_release(zdev->aibv); + zdev->aisb = 0; + zdev->aibv = NULL; + } + spin_unlock_irq(&aift->gait_lock); + kvm_s390_gisc_unregister(kzdev->kvm, isc); + kzdev->fib.fmt0.isc = 0; + kzdev->fib.fmt0.aibv = 0; + + if (vpage) { + unpin_user_page(vpage); + pcount++; + } + if (spage) { + unpin_user_page(spage); + pcount++; + } + if (pcount > 0) + unaccount_mem(pcount); +out: + mutex_unlock(&aift->aift_lock); + + return rc; +} + +static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + if (!kzdev) + return -ENOMEM; + + kzdev->zdev = zdev; + zdev->kzdev = kzdev; + + return 0; +} + +static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = zdev->kzdev; + WARN_ON(kzdev->zdev != zdev); + zdev->kzdev = NULL; + kfree(kzdev); +} + + +/* + * Register device with the specified KVM. If interpretation facilities are + * available, enable them and let userspace indicate whether or not they will + * be used (specify SHM bit to disable). + */ +static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) +{ + struct zpci_dev *zdev = opaque; + u8 status; + int rc; + + if (!zdev) + return -EINVAL; + + mutex_lock(&zdev->kzdev_lock); + + if (zdev->kzdev || zdev->gisa != 0 || !kvm) { + mutex_unlock(&zdev->kzdev_lock); + return -EINVAL; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + + rc = kvm_s390_pci_dev_open(zdev); + if (rc) + goto err; + + /* + * If interpretation facilities aren't available, add the device to + * the kzdev list but don't enable for interpretation. 
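As an aside, the RLIMIT_MEMLOCK accounting loop in account_mem() further up can be modelled with plain C11 atomics; a self-contained toy analogue (illustrative user-space code, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>

/* Toy analogue of account_mem(): charge pages against a limit, lock-free. */
static bool example_charge_pages(atomic_long *locked, long limit, long nr)
{
	long cur = atomic_load(locked);

	do {
		if (cur + nr > limit)
			return false;	/* would exceed the memlock limit */
	} while (!atomic_compare_exchange_weak(locked, &cur, cur + nr));

	return true;
}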
+ */ + if (!kvm_s390_pci_interp_allowed()) + goto out; + + /* + * If this is the first request to use an interpreted device, make the + * necessary vcpu changes + */ + if (!kvm->arch.use_zpci_interp) + kvm_s390_vcpu_pci_enable_interp(kvm); + + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + goto err; + } + + /* + * Store information about the identity of the kvm guest allowed to + * access this device via interpretation to be used by host CLP + */ + zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + + rc = zpci_enable_device(zdev); + if (rc) + goto clear_gisa; + + /* Re-register the IOMMU that was already created */ + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (rc) + goto clear_gisa; + +out: + zdev->kzdev->kvm = kvm; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list); + spin_unlock(&kvm->arch.kzdev_list_lock); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return 0; + +clear_gisa: + zdev->gisa = 0; +err: + if (zdev->kzdev) + kvm_s390_pci_dev_release(zdev); + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + kvm_put_kvm(kvm); + return rc; +} + +static void kvm_s390_pci_unregister_kvm(void *opaque) +{ + struct zpci_dev *zdev = opaque; + struct kvm *kvm; + u8 status; + + if (!zdev) + return; + + mutex_lock(&zdev->kzdev_lock); + + if (WARN_ON(!zdev->kzdev)) { + mutex_unlock(&zdev->kzdev_lock); + return; + } + + kvm = zdev->kzdev->kvm; + mutex_lock(&kvm->lock); + + /* + * A 0 gisa means interpretation was never enabled, just remove the + * device from the list. + */ + if (zdev->gisa == 0) + goto out; + + /* Forwarding must be turned off before interpretation */ + if (zdev->kzdev->fib.fmt0.aibv != 0) + kvm_s390_pci_aif_disable(zdev, true); + + /* Remove the host CLP guest designation */ + zdev->gisa = 0; + + if (zdev_enabled(zdev)) { + if (zpci_disable_device(zdev)) + goto out; + } + + if (zpci_enable_device(zdev)) + goto out; + + /* Re-register the IOMMU that was already created */ + zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + +out: + spin_lock(&kvm->arch.kzdev_list_lock); + list_del(&zdev->kzdev->entry); + spin_unlock(&kvm->arch.kzdev_list_lock); + kvm_s390_pci_dev_release(zdev); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + + kvm_put_kvm(kvm); +} + +void kvm_s390_pci_init_list(struct kvm *kvm) +{ + spin_lock_init(&kvm->arch.kzdev_list_lock); + INIT_LIST_HEAD(&kvm->arch.kzdev_list); +} + +void kvm_s390_pci_clear_list(struct kvm *kvm) +{ + /* + * This list should already be empty, either via vfio device closures + * or kvm fd cleanup. 
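For orientation, a hypothetical caller-side sketch of the hook pair that kvm_s390_pci_init() installs below (the vfio-pci zdev driver is the intended user; the function names here are invented, and the hook members are assumed to mirror the signatures of kvm_s390_pci_register_kvm()/kvm_s390_pci_unregister_kvm() above):

/* Hypothetical caller: attach/detach a zPCI device via the KVM hook. */
static int example_attach_zdev(struct zpci_dev *zdev, struct kvm *kvm)
{
	if (!zpci_kvm_hook.kvm_register)
		return -ENOENT;		/* KVM module not loaded */
	return zpci_kvm_hook.kvm_register(zdev, kvm);
}

static void example_detach_zdev(struct zpci_dev *zdev)
{
	if (zpci_kvm_hook.kvm_unregister)
		zpci_kvm_hook.kvm_unregister(zdev);
}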
+ */ + spin_lock(&kvm->arch.kzdev_list_lock); + WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list)); + spin_unlock(&kvm->arch.kzdev_list_lock); +} + +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + +int __init kvm_s390_pci_init(void) +{ + zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; + zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; + + if (!kvm_s390_pci_interp_allowed()) + return 0; + + aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + if (!aift) + return -ENOMEM; + + spin_lock_init(&aift->gait_lock); + mutex_init(&aift->aift_lock); + + return 0; +} + +void kvm_s390_pci_exit(void) +{ + zpci_kvm_hook.kvm_register = NULL; + zpci_kvm_hook.kvm_unregister = NULL; + + if (!kvm_s390_pci_interp_allowed()) + return; + + mutex_destroy(&aift->aift_lock); + + kfree(aift); +} diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h new file mode 100644 index 000000000000..ff0972dd5e71 --- /dev/null +++ b/arch/s390/kvm/pci.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 
2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#ifndef __KVM_S390_PCI_H +#define __KVM_S390_PCI_H + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <asm/airq.h> +#include <asm/cpu.h> + +struct kvm_zdev { + struct zpci_dev *zdev; + struct kvm *kvm; + struct zpci_fib fib; + struct list_head entry; +}; + +struct zpci_gaite { + u32 gisa; + u8 gisc; + u8 count; + u8 reserved; + u8 aisbo; + u64 aisb; +}; + +struct zpci_aift { + struct zpci_gaite *gait; + struct airq_iv *sbv; + struct kvm_zdev **kzdev; + spinlock_t gait_lock; /* Protects the gait, used during AEN forward */ + struct mutex aift_lock; /* Protects the other structures in aift */ +}; + +extern struct zpci_aift *aift; + +static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift, + unsigned long si) +{ + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev || + !aift->kzdev[si]) + return NULL; + return aift->kzdev[si]->kvm; +}; + +int kvm_s390_pci_aen_init(u8 nisc); +void kvm_s390_pci_aen_exit(void); + +void kvm_s390_pci_init_list(struct kvm *kvm); +void kvm_s390_pci_clear_list(struct kvm *kvm); + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + +int __init kvm_s390_pci_init(void); +void kvm_s390_pci_exit(void); + +static inline bool kvm_s390_pci_interp_allowed(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2817: + case 0x2818: + case 0x2827: + case 0x2828: + case 0x2964: + case 0x2965: + /* No SHM on certain machines */ + return false; + default: + return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) && + sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi && + sclp.has_aisii); + } +} + +#endif /* __KVM_S390_PCI_H */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ed52ffa8d5d4..621a17fd1a1b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -2,7 +2,7 @@ /* * handling privileged instructions * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -11,20 +11,17 @@ #include <linux/kvm.h> #include <linux/gfp.h> #include <linux/errno.h> -#include <linux/compat.h> #include <linux/mm_types.h> - +#include <linux/pgtable.h> +#include <linux/io.h> #include <asm/asm-offsets.h> #include <asm/facility.h> #include <asm/current.h> #include <asm/debug.h> #include <asm/ebcdic.h> #include <asm/sysinfo.h> -#include <asm/pgtable.h> #include <asm/page-states.h> -#include <asm/pgalloc.h> #include <asm/gmap.h> -#include <asm/io.h> #include <asm/ptrace.h> #include <asm/sclp.h> #include <asm/ap.h> @@ -60,7 +57,7 @@ static int handle_gs(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 133)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb; restore_gs_cb(current->thread.gs_cb); preempt_enable(); @@ -103,7 +100,20 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_cond(vcpu, rc); VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod); - kvm_s390_set_tod_clock(vcpu->kvm, >od); + /* + * To set the TOD clock the kvm lock must be taken, but the vcpu lock + * is already held in handle_set_clock. The usual lock order is the + * opposite. 
As SCK is deprecated and should not be used in several + * cases, for example when the multiple epoch facility or TOD clock + * steering facility is installed (see Principles of Operation), a + * slow path can be used. If the lock can not be taken via try_lock, + * the instruction will be retried via -EAGAIN at a later point in + * time. + */ + if (!kvm_s390_try_set_tod_clock(vcpu->kvm, >od)) { + kvm_s390_retry_instr(vcpu); + return -EAGAIN; + } kvm_s390_set_psw_cc(vcpu, 0); return 0; @@ -270,18 +280,18 @@ static int handle_iske(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); retry: unlocked = false; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = get_guest_storage_key(current->mm, vmaddr, &key); if (rc) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); goto retry; } } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc < 0) @@ -317,17 +327,17 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); retry: unlocked = false; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = reset_guest_reference_bit(current->mm, vmaddr); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); goto retry; } } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc < 0) @@ -385,19 +395,21 @@ static int handle_sske(struct kvm_vcpu *vcpu) if (kvm_is_error_hva(vmaddr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc == -EAGAIN) + continue; if (rc < 0) return rc; start += PAGE_SIZE; @@ -429,7 +441,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu) vcpu->stat.instruction_ipte_interlock++; if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm)); kvm_s390_retry_instr(vcpu); VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); return 0; @@ -611,6 +623,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) static int handle_pqap(struct kvm_vcpu *vcpu) { struct ap_queue_status status = {}; + crypto_hook pqap_hook; unsigned long reg0; int ret; uint8_t fc; @@ -626,10 +639,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu) * available for the guest are AQIC and TAPQ with the t bit set * since we do not set IC.3 (FIII) we currently will only intercept * the AQIC function code. + * Note: running nested under z/VM can result in intercepts for other + * function codes, e.g. PQAP(QCI). We do not support this and bail out. 
*/ reg0 = vcpu->run->s.regs.gprs[0]; fc = (reg0 >> 24) & 0xff; - if (WARN_ON_ONCE(fc != 0x03)) + if (fc != 0x03) return -EOPNOTSUPP; /* PQAP instruction is allowed for guest kernel only */ @@ -653,18 +668,20 @@ static int handle_pqap(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); /* - * Verify that the hook callback is registered, lock the owner - * and call the hook. + * If the hook callback is registered, there will be a pointer to the + * hook function pointer in the kvm_s390_crypto structure. Lock the + * owner, retrieve the hook function pointer and call the hook. */ + down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); if (vcpu->kvm->arch.crypto.pqap_hook) { - if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner)) - return -EOPNOTSUPP; - ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu); - module_put(vcpu->kvm->arch.crypto.pqap_hook->owner); + pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook; + ret = pqap_hook(vcpu); if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) kvm_s390_set_psw_cc(vcpu, 3); + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); return ret; } + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); /* * A vfio_driver must register a hook. * No hook means no driver to enable the SIE CRYCB and no queues. @@ -855,10 +872,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - if (fc > 3) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } + /* Bailout forbidden function codes */ + if (fc > 3 && fc != 15) + goto out_no_data; + + /* + * fc 15 is provided only with + * - PTF/CPU topology support through facility 15 + * - KVM_CAP_S390_USER_STSI + */ + if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) || + !vcpu->kvm->arch.user_stsi)) + goto out_no_data; if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 || vcpu->run->s.regs.gprs[1] & 0xffff0000) @@ -872,13 +897,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); - if (operand2 & 0xfff) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); switch (fc) { case 1: /* same handling for 1 and 2 */ case 2: - mem = get_zeroed_page(GFP_KERNEL); + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!mem) goto out_no_data; if (stsi((void *) mem, fc, sel1, sel2)) @@ -887,14 +912,22 @@ static int handle_stsi(struct kvm_vcpu *vcpu) case 3: if (sel1 != 2 || sel2 != 2) goto out_no_data; - mem = get_zeroed_page(GFP_KERNEL); + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!mem) goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; + case 15: /* fc 15 is fully handled in userspace */ + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + return -EREMOTE; + } + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE); + rc = 0; + } else { + rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); } - - rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); if (rc) { rc = kvm_s390_inject_prog_cond(vcpu, rc); goto out; @@ -1084,15 +1117,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = cond_set_guest_storage_key(current->mm, vmaddr, key, NULL, nq, mr, mc); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, 
vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc == -EAGAIN) @@ -1115,7 +1148,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } /* - * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu) + * Must be called with relevant read locks held (kvm->mm->mmap_lock, kvm->srcu) */ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { @@ -1213,9 +1246,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) * already correct, we do nothing and avoid the lock. */ if (vcpu->kvm->mm->context.uses_cmm == 0) { - down_write(&vcpu->kvm->mm->mmap_sem); + mmap_write_lock(vcpu->kvm->mm); vcpu->kvm->mm->context.uses_cmm = 1; - up_write(&vcpu->kvm->mm->mmap_sem); + mmap_write_unlock(vcpu->kvm->mm); } /* * If we are here, we are supposed to have CMMA enabled in @@ -1232,11 +1265,11 @@ static int handle_essa(struct kvm_vcpu *vcpu) } else { int srcu_idx; - down_read(&vcpu->kvm->mm->mmap_sem); + mmap_read_lock(vcpu->kvm->mm); srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); i = __do_essa(vcpu, orc); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - up_read(&vcpu->kvm->mm->mmap_sem); + mmap_read_unlock(vcpu->kvm->mm); if (i < 0) return i; /* Account for the possible extra cbrl entry */ @@ -1244,10 +1277,10 @@ static int handle_essa(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); for (i = 0; i < entries; ++i) __gmap_zap(gmap, cbrlo[i]); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return 0; } @@ -1432,10 +1465,11 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) static int handle_tprot(struct kvm_vcpu *vcpu) { - u64 address1, address2; - unsigned long hva, gpa; - int ret = 0, cc = 0; + u64 address, operand2; + unsigned long gpa; + u8 access_key; bool writable; + int ret, cc; u8 ar; vcpu->stat.instruction_tprot++; @@ -1443,45 +1477,48 @@ static int handle_tprot(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - kvm_s390_get_base_disp_sse(vcpu, &address1, &address2, &ar, NULL); + kvm_s390_get_base_disp_sse(vcpu, &address, &operand2, &ar, NULL); + access_key = (operand2 & 0xf0) >> 4; - /* we only handle the Linux memory detection case: - * access key == 0 - * everything else goes to userspace. */ - if (address2 & 0xf0) - return -EOPNOTSUPP; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_lock(vcpu); - ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE); - if (ret == PGM_PROTECTION) { + ipte_lock(vcpu->kvm); + + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_STORE, access_key); + if (ret == 0) { + gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); + } else if (ret == PGM_PROTECTION) { + writable = false; /* Write protected? Try again with read-only... 
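The condition-code mapping the rewritten TPROT handler establishes just below can be summarized in one helper; this is an illustrative restatement of the hunk, not patch code, and assumes ret >= 0 as in the handler:

/* CC summary: 0=fetch+store, 1=fetch only, 2=no access, 3=unavailable. */
static int example_tprot_cc(int ret, bool writable)
{
	if (ret == 0)
		return writable ? 0 : 1;
	if (ret == PGM_PROTECTION)
		return 2;
	if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC)
		return 3;
	return -1;	/* inject a program interrupt instead */
}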
*/ - cc = 1; - ret = guest_translate_address(vcpu, address1, ar, &gpa, - GACC_FETCH); + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_FETCH, access_key); } - if (ret) { - if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) { - ret = kvm_s390_inject_program_int(vcpu, ret); - } else if (ret > 0) { - /* Translation not available */ - kvm_s390_set_psw_cc(vcpu, 3); + if (ret >= 0) { + cc = -1; + + /* Fetching permitted; storing permitted */ + if (ret == 0 && writable) + cc = 0; + /* Fetching permitted; storing not permitted */ + else if (ret == 0 && !writable) + cc = 1; + /* Fetching not permitted; storing not permitted */ + else if (ret == PGM_PROTECTION) + cc = 2; + /* Translation not available */ + else if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC) + cc = 3; + + if (cc != -1) { + kvm_s390_set_psw_cc(vcpu, cc); ret = 0; + } else { + ret = kvm_s390_inject_program_int(vcpu, ret); } - goto out_unlock; } - hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); - if (kvm_is_error_hva(hva)) { - ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - } else { - if (!writable) - cc = 1; /* Write not permitted ==> read-only */ - kvm_s390_set_psw_cc(vcpu, cc); - /* Note: CC2 only occurs for storage keys (not supported yet) */ - } -out_unlock: if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return ret; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c new file mode 100644 index 000000000000..75e81ba26d04 --- /dev/null +++ b/arch/s390/kvm/pv.c @@ -0,0 +1,894 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hosting Protected Virtual Machines + * + * Copyright IBM Corp. 2019, 2020 + * Author(s): Janosch Frank <frankja@linux.ibm.com> + */ +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/minmax.h> +#include <linux/pagemap.h> +#include <linux/sched/signal.h> +#include <asm/gmap.h> +#include <asm/uv.h> +#include <asm/mman.h> +#include <linux/pagewalk.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> +#include "kvm-s390.h" + +bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected); + +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); + +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. 
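A hypothetical user of the set-aside machinery described above, combining the two entry points under their documented locking rules (sketch only; in practice the second step runs from a separate userspace-driven thread):

/* Sketch: two-phase teardown; kvm->lock only for the set-aside step. */
static int example_async_teardown(struct kvm *kvm)
{
	u16 rc, rrc;
	int r;

	mutex_lock(&kvm->lock);
	r = kvm_s390_pv_set_aside(kvm, &rc, &rrc);
	mutex_unlock(&kvm->lock);
	if (r)
		return r;

	/* ideally from a separate thread, without kvm->lock held */
	return kvm_s390_pv_deinit_aside_vm(kvm, &rc, &rrc);
}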
+ */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + +static void kvm_s390_clear_pv_state(struct kvm *kvm) +{ + kvm->arch.pv.handle = 0; + kvm->arch.pv.guest_len = 0; + kvm->arch.pv.stor_base = 0; + kvm->arch.pv.stor_var = NULL; +} + +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + int cc; + + if (!kvm_s390_pv_cpu_get_handle(vcpu)) + return 0; + + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc); + + /* Intended memory leak for something that should never happen. */ + if (!cc) + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + + free_page((unsigned long)sida_addr(vcpu->arch.sie_block)); + vcpu->arch.sie_block->pv_handle_cpu = 0; + vcpu->arch.sie_block->pv_handle_config = 0; + memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); + vcpu->arch.sie_block->sdf = 0; + /* + * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0). + * Use the reset value of gbea to avoid leaking the kernel pointer of + * the just freed sida. + */ + vcpu->arch.sie_block->gbea = 1; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + return cc ? EIO : 0; +} + +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + struct uv_cb_csc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CPU, + .header.len = sizeof(uvcb), + }; + void *sida_addr; + int cc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) + return -EINVAL; + + vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, + get_order(uv_info.guest_cpu_stor_len)); + if (!vcpu->arch.pv.stor_base) + return -ENOMEM; + + /* Input */ + uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); + uvcb.num = vcpu->arch.sie_block->icpua; + uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block); + uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base); + + /* Alloc Secure Instruction Data Area Designation */ + sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sida_addr) { + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + return -ENOMEM; + } + vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr); + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x", + vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc, + uvcb.header.rrc); + + if (cc) { + u16 dummy; + + kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy); + return -EIO; + } + + /* Output */ + vcpu->arch.pv.handle = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm); + vcpu->arch.sie_block->sdf = 2; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +/* only free resources when the destroy was successful */ +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) +{ + vfree(kvm->arch.pv.stor_var); + free_pages(kvm->arch.pv.stor_base, + get_order(uv_info.guest_base_stor_len)); + kvm_s390_clear_pv_state(kvm); +} + +static int kvm_s390_pv_alloc_vm(struct kvm *kvm) +{ + unsigned long base = uv_info.guest_base_stor_len; + unsigned long virt = uv_info.guest_virt_var_stor_len; + unsigned long npages = 0, vlen = 0; + + kvm->arch.pv.stor_var = NULL; + kvm->arch.pv.stor_base 
= __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base)); + if (!kvm->arch.pv.stor_base) + return -ENOMEM; + + /* + * Calculate current guest storage for allocation of the + * variable storage, which is based on the length in MB. + * + * Slots are sorted by GFN + */ + mutex_lock(&kvm->slots_lock); + npages = kvm_s390_get_gfn_end(kvm_memslots(kvm)); + mutex_unlock(&kvm->slots_lock); + + kvm->arch.pv.guest_len = npages * PAGE_SIZE; + + /* Allocate variable storage */ + vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); + vlen += uv_info.guest_virt_base_stor_len; + kvm->arch.pv.stor_var = vzalloc(vlen); + if (!kvm->arch.pv.stor_var) + goto out_err; + return 0; + +out_err: + kvm_s390_pv_dealloc_vm(kvm); + return -ENOMEM; +} + +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) +{ + int cc; + + /* It used the destroy-fast UVC, nothing left to do here */ + if (!leftover->handle) + goto done_fast; + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; + /* + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. + */ + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); +done_fast: + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. 
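Stepping back to the variable-storage sizing in kvm_s390_pv_alloc_vm() above, a worked restatement (illustrative helper; assumes HPAGE_SIZE is the 1 MB granule the Ultravisor works on, so a 4 GiB guest yields 4096 blocks):

/* Illustrative: variable storage scales with the number of 1 MB blocks. */
static unsigned long example_pv_var_stor_len(unsigned long npages)
{
	unsigned long blocks_1m = (npages * PAGE_SIZE) / HPAGE_SIZE;

	return ALIGN(uv_info.guest_virt_var_stor_len * blocks_1m, PAGE_SIZE) +
	       uv_info.guest_virt_base_stor_len;
}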
+ */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_destroy_fast uvcb = { + .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST, + .header.len = sizeof(uvcb), + .handle = kvm_s390_pv_get_handle(kvm), + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", + uvcb.header.rc, uvcb.header.rrc); + WARN_ONCE(cc && uvcb.header.rc != 0x104, + "protvirt destroy vm fast failed handle %llx rc %x rrc %x", + kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc); + /* Intended memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +static inline bool is_destroy_fast_available(void) +{ + return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + int res = 0; + + lockdep_assert_held(&kvm->lock); + /* + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. 
+ */ + if (kvm->arch.pv.set_aside) + return -EINVAL; + + /* Guest with segment type ASCE, refuse to destroy asynchronously */ + if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (is_destroy_fast_available()) { + res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc); + } else { + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) + res = -ENOMEM; + } + + if (res) { + kfree(priv); + return res; + } + + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (!cc) { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } else { + /* Intended memory leak on "impossible" error */ + s390_replace_asce(kvm->arch.gmap); + } + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); + + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* + * Nothing to do if the counter was already 0. Otherwise make sure + * the counter does not reach 0 before calling s390_uv_destroy_range. 
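The guard above relies on atomic_inc_not_zero(); a stand-alone C11 model of that primitive (toy user-space code, not from the patch):

#include <stdatomic.h>
#include <stdbool.h>

/* Toy model of atomic_inc_not_zero(): pin only while the count is nonzero. */
static bool example_inc_not_zero(atomic_int *v)
{
	int cur = atomic_load(v);

	while (cur != 0) {
		if (atomic_compare_exchange_weak(v, &cur, cur + 1))
			return true;
	}
	return false;	/* already zero: do not resurrect */
}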
+ */ +	if (!atomic_inc_not_zero(&kvm->mm->context.protected_count)) + return 0; + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Clean up all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_aside_vm - Tear down a previously set-aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside(). Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardown, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + +static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); + u16 dummy; + int r; + + /* + * No locking is needed since this is the last thread of the last user of this + * struct mm. + * When the struct kvm gets deinitialized, this notifier is also + * unregistered. This means that if this notifier runs, then the + * struct kvm is still valid. 
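The release callback above recovers its struct kvm with container_of(); a minimal stand-alone illustration of that recovery step (toy types, invented names):

struct example_owner {
	int id;
	struct mmu_notifier mn;	/* embedded, like arch.pv.mmu_notifier */
};

/* Recover the enclosing object from a pointer to the embedded member. */
static struct example_owner *example_owner_of(struct mmu_notifier *mn)
{
	return container_of(mn, struct example_owner, mn);
}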
+ */ + r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); +} + +static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { + .release = kvm_s390_pv_mmu_notifier_release, +}; + +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_cgc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CONF, + .header.len = sizeof(uvcb) + }; + int cc, ret; + u16 dummy; + + ret = kvm_s390_pv_alloc_vm(kvm); + if (ret) + return ret; + + /* Inputs */ + uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ + uvcb.guest_stor_len = kvm->arch.pv.guest_len; + uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_sca = virt_to_phys(kvm->arch.sca); + uvcb.conf_base_stor_origin = + virt_to_phys((void *)kvm->arch.pv.stor_base); + uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; + uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw); + + /* Outputs */ + kvm->arch.pv.handle = uvcb.guest_handle; + + atomic_inc(&kvm->mm->context.protected_count); + if (cc) { + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) { + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + } else { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } + return -EIO; + } + kvm->arch.gmap->guest_handle = uvcb.guest_handle; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + } + return 0; +} + +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc) +{ + struct uv_cb_ssc uvcb = { + .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS, + .header.len = sizeof(uvcb), + .sec_header_origin = (u64)hdr, + .sec_header_len = length, + .guest_handle = kvm_s390_pv_get_handle(kvm), + }; + int cc = uv_call(0, (u64)&uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", + *rc, *rrc); + return cc ? 
-EINVAL : 0; +} + +static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, + u64 offset, u16 *rc, u16 *rrc) +{ + struct uv_cb_unp uvcb = { + .header.cmd = UVC_CMD_UNPACK_IMG, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = addr, + .tweak[0] = tweak, + .tweak[1] = offset, + }; + int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + + if (ret && ret != -EAGAIN) + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", + uvcb.gaddr, *rc, *rrc); + return ret; +} + +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc) +{ + u64 offset = 0; + int ret = 0; + + if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK) + return -EINVAL; + + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", + addr, size); + + while (offset < size) { + ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); + if (ret == -EAGAIN) { + cond_resched(); + if (fatal_signal_pending(current)) + break; + continue; + } + if (ret) + break; + addr += PAGE_SIZE; + offset += PAGE_SIZE; + } + if (!ret) + KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful"); + return ret; +} + +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) +{ + struct uv_cb_cpu_set_state uvcb = { + .header.cmd = UVC_CMD_CPU_SET_STATE, + .header.len = sizeof(uvcb), + .cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu), + .state = state, + }; + int cc; + + cc = uv_call(0, (u64)&uvcb); + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x", + vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc); + if (cc) + return -EINVAL; + return 0; +} + +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_cpu uvcb = { + .header.cmd = UVC_CMD_DUMP_CPU, + .header.len = sizeof(uvcb), + .cpu_handle = vcpu->arch.pv.handle, + .dump_area_origin = (u64)buff, + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc; +} + +/* Size of the cache for the storage state dump data. 1MB for now */ +#define DUMP_BUFF_LEN HPAGE_SIZE + +/** + * kvm_s390_pv_dump_stor_state + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @gaddr: Starting absolute guest address for which the storage state + * is requested. + * @buff_user_len: Length of the buff_user buffer + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Stores buff_user_len bytes of tweak component values to buff_user + * starting with the 1MB block specified by the absolute guest address + * (gaddr). The gaddr pointer will be updated with the last address + * for which data was written when returning to userspace. buff_user + * might be written to even if an error rc is returned, for instance + * if we encounter a fault after writing the first page of data. 
+ * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the cache fails + * -EINVAL if gaddr is not aligned to 1MB + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_stor_state uvcb = { + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, + .header.len = sizeof(uvcb), + .config_handle = kvm->arch.pv.handle, + .gaddr = *gaddr, + .dump_area_origin = 0, + }; + const u64 increment_len = uv_info.conf_dump_storage_state_len; + size_t buff_kvm_size; + size_t size_done = 0; + u8 *buff_kvm = NULL; + int cc, ret; + + ret = -EINVAL; + /* UV call processes 1MB guest storage chunks at a time */ + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) + goto out; + + /* + * We provide the storage state for 1MB chunks of guest + * storage. The buffer will need to be aligned to + * conf_dump_storage_state_len so we don't end on a partial + * chunk. + */ + if (!buff_user_len || + !IS_ALIGNED(buff_user_len, increment_len)) + goto out; + + /* + * Allocate a buffer from which we will later copy to the user + * process. We don't want userspace to dictate our buffer size + * so we limit it to DUMP_BUFF_LEN. + */ + ret = -ENOMEM; + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); + buff_kvm = vzalloc(buff_kvm_size); + if (!buff_kvm) + goto out; + + ret = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + /* We will loop until the user buffer is filled or an error occurs */ + do { + /* Get 1MB worth of guest storage state data */ + cc = uv_call_sched(0, (u64)&uvcb); + + /* All or nothing */ + if (cc) { + ret = -EINVAL; + break; + } + + size_done += increment_len; + uvcb.dump_area_origin += increment_len; + buff_user_len -= increment_len; + uvcb.gaddr += HPAGE_SIZE; + + /* KVM Buffer full, time to copy to the process */ + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { + if (copy_to_user(buff_user, buff_kvm, size_done)) { + ret = -EFAULT; + break; + } + + buff_user += size_done; + size_done = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + } + } while (buff_user_len); + + /* Report back where we ended dumping */ + *gaddr = uvcb.gaddr; + + /* Let's only log errors, we don't want to spam */ +out: + if (ret) + KVM_UV_EVENT(kvm, 3, + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + vfree(buff_kvm); + + return ret; +} + +/** + * kvm_s390_pv_dump_complete + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Completes the dumping operation and writes the completion data to + * user space.
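+ *
+ * A hedged usage sketch (illustrative only, not part of the patch):
+ * the caller, holding kvm->lock, supplies a userspace buffer of at
+ * least uv_info.conf_dump_finalize_len bytes:
+ *
+ *	u16 rc, rrc;
+ *	int ret;
+ *
+ *	ret = kvm_s390_pv_dump_complete(kvm, buff_user, &rc, &rrc);
+ *	if (ret)
+ *		pr_warn("finalize: ret %d rc %x rrc %x\n", ret, rc, rrc);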
+ * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the completion buffer fails + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_complete complete = { + .header.len = sizeof(complete), + .header.cmd = UVC_CMD_DUMP_COMPLETE, + .config_handle = kvm_s390_pv_get_handle(kvm), + }; + u64 *compl_data; + int ret; + + /* Allocate dump area */ + compl_data = vzalloc(uv_info.conf_dump_finalize_len); + if (!compl_data) + return -ENOMEM; + complete.dump_area_origin = (u64)compl_data; + + ret = uv_call_sched(0, (u64)&complete); + *rc = complete.header.rc; + *rrc = complete.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", + complete.header.rc, complete.header.rrc); + + if (!ret) { + /* + * kvm_s390_pv_dealloc_vm() will also (mem)set + * this to false on a reboot or other destroy + * operation for this vm. + */ + kvm->arch.pv.dumping = false; + kvm_s390_vcpu_unblock_all(kvm); + ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len); + if (ret) + ret = -EFAULT; + } + vfree(compl_data); + /* If the UVC returned an error, translate it to -EINVAL */ + if (ret > 0) + ret = -EINVAL; + return ret; +} diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 683036c1c92a..d9696b530064 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -151,22 +151,10 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter, u64 *status_reg) { - unsigned int i; - struct kvm_vcpu *v; - bool all_stopped = true; - - kvm_for_each_vcpu(i, v, vcpu->kvm) { - if (v == vcpu) - continue; - if (!is_vcpu_stopped(v)) - all_stopped = false; - } - *status_reg &= 0xffffffff00000000UL; /* Reject set arch order, with czam we're always in z/Arch mode. */ - *status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER : - SIGP_STATUS_INCORRECT_STATE); + *status_reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; } @@ -288,6 +276,34 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, if (!dst_vcpu) return SIGP_CC_NOT_OPERATIONAL; + /* + * SIGP RESTART, SIGP STOP, and SIGP STOP AND STORE STATUS orders + * are processed asynchronously. Until the affected VCPU finishes + * its work and calls back into KVM to clear the (RESTART or STOP) + * interrupt, we need to return any new non-reset orders "busy". + * + * This is important because a single VCPU could issue: + * 1) SIGP STOP $DESTINATION + * 2) SIGP SENSE $DESTINATION + * + * If the SIGP SENSE would not be rejected as "busy", it could + * return an incorrect answer as to whether the VCPU is STOPPED + * or OPERATING. + */ + if (order_code != SIGP_INITIAL_CPU_RESET && + order_code != SIGP_CPU_RESET) { + /* + * Lockless check. Both SIGP STOP and SIGP (RE)START + * properly synchronize everything while processing + * their orders, while the guest cannot observe a + * difference when issuing other orders from two + * different VCPUs. 
+ */ + if (kvm_s390_is_stop_irq_pending(dst_vcpu) || + kvm_s390_is_restart_irq_pending(dst_vcpu)) + return SIGP_CC_BUSY; + } + switch (order_code) { case SIGP_SENSE: vcpu->stat.instruction_sigp_sense++; @@ -453,7 +469,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) * * This interception will occur at the source cpu when a source cpu sends an * external call to a target cpu and the target cpu has the WAIT bit set in - * its cpuflags. Interception will occurr after the interrupt indicator bits at + * its cpuflags. Interception will occur after the interrupt indicator bits at * the target cpu have been set. All error cases will lead to instruction * interception, therefore nothing is to be checked or prepared. */ @@ -464,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) struct kvm_vcpu *dest_vcpu; u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); - if (order_code == SIGP_EXTERNAL_CALL) { + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); BUG_ON(dest_vcpu == NULL); diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 6f0209d45164..9ac92dbf680d 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -333,6 +333,29 @@ TRACE_EVENT(kvm_s390_airq_suppressed, __entry->id, __entry->isc) ); +/* + * Trace point for gmap notifier calls. + */ +TRACE_EVENT(kvm_s390_gmap_notifier, + TP_PROTO(unsigned long start, unsigned long end, unsigned int shadow), + TP_ARGS(start, end, shadow), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, shadow) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->shadow = shadow; + ), + + TP_printk("gmap notified (start:0x%lx end:0x%lx shadow:%d)", + __entry->start, __entry->end, __entry->shadow) + ); + #endif /* _TRACE_KVMS390_H */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 076090f9e666..fef42e2a80a2 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -18,6 +18,8 @@ #include <asm/sclp.h> #include <asm/nmi.h> #include <asm/dis.h> +#include <asm/fpu/api.h> +#include <asm/facility.h> #include "kvm-s390.h" #include "gaccess.h" @@ -137,11 +139,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* Copy to APCB FORMAT1 from APCB FORMAT0 */ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, - unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h) + unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h) { struct kvm_s390_apcb0 tmp; + unsigned long apcb_gpa; - if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0))) + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, &tmp, + sizeof(struct kvm_s390_apcb0))) return -EFAULT; apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; @@ -156,19 +162,24 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original apcb in the guest2 + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the guest1 * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, 
unsigned long *apcb_h) + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb0))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0)); return 0; } @@ -177,20 +188,25 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original guest apcb + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the host * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb1))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1)); return 0; } @@ -199,7 +215,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb - Create a shadow copy of the apcb. * @vcpu: pointer to the virtual CPU * @crycb_s: pointer to shadow crycb - * @crycb_o: pointer to original guest crycb + * @crycb_gpa: guest physical address of original guest crycb * @crycb_h: pointer to the host crycb * @fmt_o: format of the original guest crycb. * @fmt_h: format of the host crycb. @@ -210,50 +226,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * Return 0 or an error number if the guest and host crycb are incompatible. 
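 *
 * Shadow-copy routine selected per (guest fmt_o, host fmt_h) pair,
 * matching the switch below:
 *
 *	FORMAT2 guest, FORMAT2 host		-> setup_apcb11
 *	FORMAT1 guest, FORMAT2 host		-> setup_apcb10
 *	FORMAT1 guest, FORMAT1 host		-> setup_apcb00
 *	FORMAT0 guest, FORMAT2 host		-> setup_apcb10
 *	FORMAT0 guest, FORMAT1/FORMAT0 host	-> setup_apcb00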
*/ static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, - const u32 crycb_o, + const u32 crycb_gpa, struct kvm_s390_crypto_cb *crycb_h, int fmt_o, int fmt_h) { - struct kvm_s390_crypto_cb *crycb; - - crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o; - switch (fmt_o) { case CRYCB_FORMAT2: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK)) return -EACCES; if (fmt_h != CRYCB_FORMAT2) return -EINVAL; return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, - (unsigned long) &crycb->apcb1, + crycb_gpa, (unsigned long *)&crycb_h->apcb1); case CRYCB_FORMAT1: switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } break; case CRYCB_FORMAT0: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK)) return -EACCES; switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: case CRYCB_FORMAT0: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } } @@ -416,11 +428,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) memcpy((void *)((u64)scb_o + 0xc0), (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); break; - case ICPT_PARTEXEC: - /* MVPG only */ - memcpy((void *)((u64)scb_o + 0xc0), - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); - break; } if (scb_s->ihcpu != 0xffffU) @@ -498,7 +505,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->mso = new_mso; scb_s->prefix = new_prefix; - /* We have to definetly flush the tlb if this scb never ran */ + /* We have to definitely flush the tlb if this scb never ran */ if (scb_s->ihcpu != 0xffffU) scb_s->ihcpu = scb_o->ihcpu; @@ -507,6 +514,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; + /* + * CPU Topology + * This facility only uses the utility field of the SCA and none of + * the cpu entries that are problematic with the other interpretation + * facilities so we can pass it through + */ + if (test_kvm_facility(vcpu->kvm, 11)) + scb_s->ecb |= scb_o->ecb & ECB_PTF; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { /* remap the prefix is tx is toggled on */ @@ -514,6 +529,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= ECB_TE; } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; /* branch prediction */ if (test_kvm_facility(vcpu->kvm, 82)) scb_s->fpf |= scb_o->fpf & FPF_BPBC; @@ -540,14 +557,17 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) scb_s->eca |= scb_o->eca & ECA_CEI; /* Epoch Extension */ - if (test_kvm_facility(vcpu->kvm, 139)) + if (test_kvm_facility(vcpu->kvm, 139)) { scb_s->ecd |= scb_o->ecd & ECD_MEF; + scb_s->epdx = scb_o->epdx; + } /* etoken */ if 
(test_kvm_facility(vcpu->kvm, 156)) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -568,10 +588,6 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, if (!gmap_is_shadow(gmap)) return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. @@ -618,10 +634,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); if (!rc && (scb_s->ecb & ECB_TE)) rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE); + prefix + PAGE_SIZE, NULL); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -647,7 +663,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) page = gfn_to_page(kvm, gpa_to_gfn(gpa)); if (is_error_page(page)) return -EINVAL; - *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); return 0; } @@ -862,7 +878,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, WARN_ON_ONCE(rc); return 1; } - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + vsie_page->scb_o = phys_to_virt(hpa); return 0; } @@ -882,7 +898,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, (vaddr & 0xfffffffffffff000UL) | /* 52-53: store / fetch */ (((unsigned int) !write_flag) + 1) << 10, - /* 62-63: asce id (alway primary == 0) */ + /* 62-63: asce id (always primary == 0) */ .exc_access_id = 0, /* always primary */ .op_access_id = 0, /* not MVPG */ }; @@ -912,7 +928,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) current->thread.gmap_addr, 1); rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_addr); + current->thread.gmap_addr, NULL); if (rc > 0) { rc = inject_fault(vcpu, rc, current->thread.gmap_addr, @@ -934,7 +950,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu, { if (vsie_page->fault_addr) kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr); + vsie_page->fault_addr, NULL); vsie_page->fault_addr = 0; } @@ -969,12 +985,26 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page) static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U; + __u32 fac = READ_ONCE(vsie_page->scb_o->fac); + /* + * Alternate-STFLE-Interpretive-Execution facilities are not supported + * -> format-0 flcb + */ if (fac && test_kvm_facility(vcpu->kvm, 7)) { retry_vsie_icpt(vsie_page); + /* + * The facility list origin (FLO) is in bits 1 - 28 of the FLD + * so we need to mask here before reading. + */ + fac = fac & 0x7ffffff8U; + /* + * format-0 -> size of nested guest's facility list == guest's size + * guest's size == host's size, since STFLE is interpretatively executed + * using a format-0 for the guest, too. 
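+ *
+ * Worked example (illustrative): if stfle_size() reports 4
+ * doublewords, the read below copies 4 * 8 = 32 bytes of facility
+ * bits from the guest's facility list origin into vsie_page->fac.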
+ */ if (read_guest_real(vcpu, fac, &vsie_page->fac, - sizeof(vsie_page->fac))) + stfle_size() * sizeof(u64))) return set_validity_icpt(scb_s, 0x1090U); scb_s->fac = (__u32)(__u64) &vsie_page->fac; } @@ -982,6 +1012,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* + * Get a register for a nested guest. + * @vcpu the vcpu of the guest + * @vsie_page the vsie_page for the nested guest + * @reg the register number, the upper 4 bits are ignored. + * returns: the value of the register. + */ +static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg) +{ + /* no need to validate the parameter and/or perform error handling */ + reg &= 0xf; + switch (reg) { + case 15: + return vsie_page->scb_s.gg15; + case 14: + return vsie_page->scb_s.gg14; + default: + return vcpu->run->s.regs.gprs[reg]; + } +} + +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + unsigned long pei_dest, pei_src, src, dest, mask, prefix; + u64 *pei_block = &vsie_page->scb_o->mcic; + int edat, rc_dest, rc_src; + union ctlreg0 cr0; + + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK); + prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + + dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask; + dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso; + src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; + src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; + + rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); + rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + /* + * Either everything went well, or something non-critical went wrong + * e.g. because of a race. In either case, simply retry. + */ + if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) { + retry_vsie_icpt(vsie_page); + return -EAGAIN; + } + /* Something more serious went wrong, propagate the error */ + if (rc_dest < 0) + return rc_dest; + if (rc_src < 0) + return rc_src; + + /* The only possible suppressing exception: just deliver it */ + if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) { + clear_vsie_icpt(vsie_page); + rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC); + WARN_ON_ONCE(rc_dest); + return 1; + } + + /* + * Forward the PEI intercept to the guest if it was a page fault, or + * also for segment and region table faults if EDAT applies. + */ + if (edat) { + rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0; + rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0; + } else { + rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0; + rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; + } + if (!rc_dest && !rc_src) { + pei_block[0] = pei_dest; + pei_block[1] = pei_src; + return 1; + } + + retry_vsie_icpt(vsie_page); + + /* + * The host has edat, and the guest does not, or it was an ASCE type + * exception. The host needs to inject the appropriate DAT interrupts + * into the guest. + */ + if (rc_dest) + return inject_fault(vcpu, rc_dest, dest, 1); + return inject_fault(vcpu, rc_src, src, 0); +} + +/* * Run the vsie on a shadow scb and a shadow gmap, without any further * sanity checks, handling SIE faults. 
* @@ -1000,12 +1122,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) handle_last_fault(vcpu, vsie_page); - if (need_resched()) - schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* save current guest state of bp isolation override */ guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST); @@ -1032,6 +1149,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) */ vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; barrier(); + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); if (!kvm_s390_vcpu_sie_inhibited(vcpu)) rc = sie64a(scb_s, vcpu->run->s.regs.gprs); barrier(); @@ -1045,7 +1164,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (!guest_bp_isolation) clear_thread_flag(TIF_ISOLATE_BP_GUEST); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (rc == -EINTR) { VCPU_EVENT(vcpu, 3, "%s", "machine check"); @@ -1072,6 +1191,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if ((scb_s->ipa & 0xf000) != 0xf000) scb_s->ipa += 0x1000; break; + case ICPT_PARTEXEC: + if (scb_s->ipa == 0xb254) + rc = vsie_handle_mvpg(vcpu, vsie_page); + break; } return rc; } @@ -1102,8 +1225,10 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, * we're holding has been unshadowed. If the gmap is still valid, * we can safely reuse it. */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; return 0; + } /* release the old shadow - if any, and mark the prefix as unmapped */ release_gmap_shadow(vsie_page); @@ -1111,6 +1236,7 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, if (IS_ERR(gmap)) return PTR_ERR(gmap); gmap->private = vcpu->kvm; + vcpu->kvm->stat.gmap_shadow_create++; WRITE_ONCE(vsie_page->gmap, gmap); return 0; } @@ -1185,6 +1311,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) break; + cond_resched(); } if (rc == -EFAULT) { @@ -1202,6 +1329,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->iprcc = PGM_ADDRESSING; scb_s->pgmilc = 4; scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + rc = 1; } return rc; } @@ -1236,7 +1364,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_lock(&kvm->arch.vsie.mutex); if (kvm->arch.vsie.page_count < nr_vcpus) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); if (!page) { mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); @@ -1338,7 +1466,7 @@ out_put: void kvm_s390_vsie_init(struct kvm *kvm) { mutex_init(&kvm->arch.vsie.mutex); - INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT); } /* Destroy the vsie data structures. To be called when a vm is destroyed. */ diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 28fd66d558ff..7c50eca85ca4 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -3,10 +3,12 @@ # Makefile for s390-specific library files.. 
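#
# (kbuild note, for orientation: lib-y objects are archived into lib.a
#  and linked only when referenced, while obj-y objects are always
#  built into the kernel)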
# -lib-y += delay.o string.o uaccess.o find.o spinlock.o +lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o obj-y += mem.o xor.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o +obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o +test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o # Instrumenting memory accesses to __user data (in different address space) # produce false positives @@ -14,3 +16,10 @@ KASAN_SANITIZE_uaccess.o := n obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o CFLAGS_test_unwind.o += -fno-optimize-sibling-calls + +obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o +obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o + +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/ diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index d4aa10795605..be14c58cb989 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -4,126 +4,42 @@ * * Copyright IBM Corp. 1999, 2008 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ -#include <linux/sched.h> +#include <linux/processor.h> #include <linux/delay.h> -#include <linux/timex.h> -#include <linux/export.h> -#include <linux/irqflags.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <asm/vtimer.h> #include <asm/div64.h> -#include <asm/idle.h> +#include <asm/timex.h> void __delay(unsigned long loops) { - /* - * To end the bloody studid and useless discussion about the - * BogoMips number I took the liberty to define the __delay - * function in a way that that resulting BogoMips number will - * yield the megahertz number of the cpu. The important function - * is udelay and that is done using the tod clock. -- martin. - */ + /* + * Loop 'loops' times. Callers must not assume a specific + * amount of time passes before this function returns. + */ asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1)); } EXPORT_SYMBOL(__delay); -static void __udelay_disabled(unsigned long long usecs) +static void delay_loop(unsigned long delta) { - unsigned long cr0, cr0_new, psw_mask; - struct s390_idle_data idle; - u64 end; + unsigned long end; - end = get_tod_clock() + (usecs << 12); - __ctl_store(cr0, 0, 0); - cr0_new = cr0 & ~CR0_IRQ_SUBCLASS_MASK; - cr0_new |= (1UL << (63 - 52)); /* enable clock comparator irq */ - __ctl_load(cr0_new, 0, 0); - psw_mask = __extract_psw() | PSW_MASK_EXT | PSW_MASK_WAIT; - set_clock_comparator(end); - set_cpu_flag(CIF_IGNORE_IRQ); - psw_idle(&idle, psw_mask); - clear_cpu_flag(CIF_IGNORE_IRQ); - set_clock_comparator(S390_lowcore.clock_comparator); - __ctl_load(cr0, 0, 0); -} - -static void __udelay_enabled(unsigned long long usecs) -{ - u64 clock_saved, end; - - end = get_tod_clock_fast() + (usecs << 12); - do { - clock_saved = 0; - if (tod_after(S390_lowcore.clock_comparator, end)) { - clock_saved = local_tick_disable(); - set_clock_comparator(end); - } - enabled_wait(); - if (clock_saved) - local_tick_enable(clock_saved); - } while (get_tod_clock_fast() < end); + end = get_tod_clock_monotonic() + delta; + while (!tod_after(get_tod_clock_monotonic(), end)) + cpu_relax(); } -/* - * Waits for 'usecs' microseconds using the TOD clock comparator. 
- */ -void __udelay(unsigned long long usecs) +void __udelay(unsigned long usecs) { - unsigned long flags; - - preempt_disable(); - local_irq_save(flags); - if (in_irq()) { - __udelay_disabled(usecs); - goto out; - } - if (in_softirq()) { - if (raw_irqs_disabled_flags(flags)) - __udelay_disabled(usecs); - else - __udelay_enabled(usecs); - goto out; - } - if (raw_irqs_disabled_flags(flags)) { - local_bh_disable(); - __udelay_disabled(usecs); - _local_bh_enable(); - goto out; - } - __udelay_enabled(usecs); -out: - local_irq_restore(flags); - preempt_enable(); + delay_loop(usecs << 12); } EXPORT_SYMBOL(__udelay); -/* - * Simple udelay variant. To be used on startup and reboot - * when the interrupt handler isn't working. - */ -void udelay_simple(unsigned long long usecs) -{ - u64 end; - - end = get_tod_clock_fast() + (usecs << 12); - while (get_tod_clock_fast() < end) - cpu_relax(); -} - -void __ndelay(unsigned long long nsecs) +void __ndelay(unsigned long nsecs) { - u64 end; - nsecs <<= 9; do_div(nsecs, 125); - end = get_tod_clock_fast() + nsecs; - if (nsecs & ~0xfffUL) - __udelay(nsecs >> 12); - while (get_tod_clock_fast() < end) - barrier(); + delay_loop(nsecs); } EXPORT_SYMBOL(__ndelay); diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c new file mode 100644 index 000000000000..8c9d4da87eef --- /dev/null +++ b/arch/s390/lib/error-inject.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include <asm/ptrace.h> +#include <linux/error-injection.h> +#include <linux/kprobes.h> + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some + * kernel function. + */ + regs->psw.addr = regs->gprs[14]; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile new file mode 100644 index 000000000000..854631d9cb03 --- /dev/null +++ b/arch/s390/lib/expoline/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += expoline.o diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline/expoline.S new file mode 100644 index 000000000000..92ed8409a7a4 --- /dev/null +++ b/arch/s390/lib/expoline/expoline.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/nospec-insn.h> +#include <linux/linkage.h> + +.macro GEN_ALL_BR_THUNK_EXTERN + .irp r1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + GEN_BR_THUNK_EXTERN %r\r1 + .endr +.endm + +GEN_ALL_BR_THUNK_EXTERN diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index dc0874f2e203..08f60a42b9a6 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -5,8 +5,8 @@ * Copyright IBM Corp. 
2012 */ +#include <linux/export.h> #include <linux/linkage.h> -#include <asm/export.h> #include <asm/nospec-insn.h> GEN_BR_THUNK %r14 @@ -14,8 +14,7 @@ /* * void *memmove(void *dest, const void *src, size_t n) */ -WEAK(memmove) -ENTRY(__memmove) +SYM_FUNC_START(__memmove) ltgr %r4,%r4 lgr %r1,%r2 jz .Lmemmove_exit @@ -48,7 +47,10 @@ ENTRY(__memmove) BR_EX %r14 .Lmemmove_mvc: mvc 0(1,%r1),0(%r3) -ENDPROC(__memmove) +SYM_FUNC_END(__memmove) +EXPORT_SYMBOL(__memmove) + +SYM_FUNC_ALIAS(memmove, __memmove) EXPORT_SYMBOL(memmove) /* @@ -66,8 +68,7 @@ EXPORT_SYMBOL(memmove) * return __builtin_memset(s, c, n); * } */ -WEAK(memset) -ENTRY(__memset) +SYM_FUNC_START(__memset) ltgr %r4,%r4 jz .Lmemset_exit ltgr %r3,%r3 @@ -111,7 +112,10 @@ ENTRY(__memset) xc 0(1,%r1),0(%r1) .Lmemset_mvc: mvc 1(1,%r1),0(%r1) -ENDPROC(__memset) +SYM_FUNC_END(__memset) +EXPORT_SYMBOL(__memset) + +SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset) /* @@ -119,8 +123,7 @@ EXPORT_SYMBOL(memset) * * void *memcpy(void *dest, const void *src, size_t n) */ -WEAK(memcpy) -ENTRY(__memcpy) +SYM_FUNC_START(__memcpy) ltgr %r4,%r4 jz .Lmemcpy_exit aghi %r4,-1 @@ -141,7 +144,10 @@ ENTRY(__memcpy) j .Lmemcpy_remainder .Lmemcpy_mvc: mvc 0(1,%r1),0(%r3) -ENDPROC(__memcpy) +SYM_FUNC_END(__memcpy) +EXPORT_SYMBOL(__memcpy) + +SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) /* @@ -152,7 +158,7 @@ EXPORT_SYMBOL(memcpy) * void *__memset64(uint64_t *s, uint64_t v, size_t count) */ .macro __MEMSET bits,bytes,insn -ENTRY(__memset\bits) +SYM_FUNC_START(__memset\bits) ltgr %r4,%r4 jz .L__memset_exit\bits cghi %r4,\bytes @@ -178,7 +184,7 @@ ENTRY(__memset\bits) BR_EX %r14 .L__memset_mvc\bits: mvc \bytes(1,%r1),0(%r1) -ENDPROC(__memset\bits) +SYM_FUNC_END(__memset\bits) .endm __MEMSET 16,2,sth diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ce1e4bbe53aa..81c53440b3e6 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -13,8 +13,8 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/percpu.h> +#include <linux/io.h> #include <asm/alternative.h> -#include <asm/io.h> int spin_retry = -1; @@ -26,7 +26,7 @@ static int __init spin_retry_init(void) } early_initcall(spin_retry_init); -/** +/* * spin_retry= parameter */ static int __init spin_retry_setup(char *str) @@ -75,7 +75,7 @@ static inline int arch_load_niai4(int *lock) int owner; asm_inline volatile( - ALTERNATIVE("", ".long 0xb2fa0040", 49) /* NIAI 4 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */ " l %0,%1\n" : "=d" (owner) : "Q" (*lock) : "memory"); return owner; @@ -86,7 +86,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new) int expected = old; asm_inline volatile( - ALTERNATIVE("", ".long 0xb2fa0080", 49) /* NIAI 8 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */ " cs %0,%3,%1\n" : "=d" (old), "=Q" (*lock) : "0" (old), "d" (new), "Q" (*lock) @@ -242,7 +242,6 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) void arch_spin_lock_wait(arch_spinlock_t *lp) { - /* Use classic spinlocks + niai if the steal time is >= 10% */ if (test_cpu_flag(CIF_DEDICATED_CPU)) arch_spin_lock_queued(lp); else diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 0e30e6e43b0c..7d8741818239 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -8,6 +8,9 @@ */ #define IN_ARCH_STRING_C 1 +#ifndef __NO_FORTIFY +# define __NO_FORTIFY +#endif #include <linux/types.h> #include <linux/string.h> @@ -18,23 +21,30 @@ */ static inline char *__strend(const char *s) { 
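+	/*
+	 * A descriptive note on the idiom below: SRST searches the
+	 * string for the character in general register 0 (loaded with
+	 * 0 here, i.e. the terminating NUL). CC 3 means the scan was
+	 * interrupted, hence the "jo" retry loop; on completion %[e]
+	 * holds the address of the NUL byte.
+	 */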
- register unsigned long r0 asm("0") = 0; - - asm volatile ("0: srst %0,%1\n" - " jo 0b" - : "+d" (r0), "+a" (s) : : "cc", "memory"); - return (char *) r0; + unsigned long e = 0; + + asm volatile( + " lghi 0,0\n" + "0: srst %[e],%[s]\n" + " jo 0b\n" + : [e] "+&a" (e), [s] "+&a" (s) + : + : "cc", "memory", "0"); + return (char *)e; } static inline char *__strnend(const char *s, size_t n) { - register unsigned long r0 asm("0") = 0; const char *p = s + n; - asm volatile ("0: srst %0,%1\n" - " jo 0b" - : "+d" (p), "+a" (s) : "d" (r0) : "cc", "memory"); - return (char *) p; + asm volatile( + " lghi 0,0\n" + "0: srst %[p],%[s]\n" + " jo 0b\n" + : [p] "+&d" (p), [s] "+&a" (s) + : + : "cc", "memory", "0"); + return (char *)p; } /** @@ -76,45 +86,21 @@ EXPORT_SYMBOL(strnlen); #ifdef __HAVE_ARCH_STRCPY char *strcpy(char *dest, const char *src) { - register int r0 asm("0") = 0; char *ret = dest; - asm volatile ("0: mvst %0,%1\n" - " jo 0b" - : "+&a" (dest), "+&a" (src) : "d" (r0) - : "cc", "memory" ); + asm volatile( + " lghi 0,0\n" + "0: mvst %[dest],%[src]\n" + " jo 0b\n" + : [dest] "+&a" (dest), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } EXPORT_SYMBOL(strcpy); #endif /** - * strlcpy - Copy a %NUL terminated string into a sized buffer - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @size: size of destination buffer - * - * Compatible with *BSD: the result is always a valid - * NUL-terminated string that fits in the buffer (unless, - * of course, the buffer size is zero). It does not pad - * out the result like strncpy() does. - */ -#ifdef __HAVE_ARCH_STRLCPY -size_t strlcpy(char *dest, const char *src, size_t size) -{ - size_t ret = __strend(src) - src; - - if (size) { - size_t len = (ret >= size) ? size-1 : ret; - dest[len] = '\0'; - memcpy(dest, src, len); - } - return ret; -} -EXPORT_SYMBOL(strlcpy); -#endif - -/** * strncpy - Copy a length-limited, %NUL-terminated string * @dest: Where to copy the string to * @src: Where to copy the string from @@ -144,16 +130,18 @@ EXPORT_SYMBOL(strncpy); #ifdef __HAVE_ARCH_STRCAT char *strcat(char *dest, const char *src) { - register int r0 asm("0") = 0; - unsigned long dummy; + unsigned long dummy = 0; char *ret = dest; - asm volatile ("0: srst %0,%1\n" - " jo 0b\n" - "1: mvst %0,%2\n" - " jo 1b" - : "=&a" (dummy), "+a" (dest), "+a" (src) - : "d" (r0), "0" (0UL) : "cc", "memory" ); + asm volatile( + " lghi 0,0\n" + "0: srst %[dummy],%[dest]\n" + " jo 0b\n" + "1: mvst %[dummy],%[src]\n" + " jo 1b\n" + : [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src) + : + : "cc", "memory", "0"); return ret; } EXPORT_SYMBOL(strcat); @@ -221,58 +209,40 @@ EXPORT_SYMBOL(strncat); #ifdef __HAVE_ARCH_STRCMP int strcmp(const char *s1, const char *s2) { - register int r0 asm("0") = 0; int ret = 0; - asm volatile ("0: clst %2,%3\n" - " jo 0b\n" - " je 1f\n" - " ic %0,0(%2)\n" - " ic %1,0(%3)\n" - " sr %0,%1\n" - "1:" - : "+d" (ret), "+d" (r0), "+a" (s1), "+a" (s2) - : : "cc", "memory"); + asm volatile( + " lghi 0,0\n" + "0: clst %[s1],%[s2]\n" + " jo 0b\n" + " je 1f\n" + " ic %[ret],0(%[s1])\n" + " ic 0,0(%[s2])\n" + " sr %[ret],0\n" + "1:" + : [ret] "+&d" (ret), [s1] "+&a" (s1), [s2] "+&a" (s2) + : + : "cc", "memory", "0"); return ret; } EXPORT_SYMBOL(strcmp); #endif -/** - * strrchr - Find the last occurrence of a character in a string - * @s: The string to be searched - * @c: The character to search for - */ -#ifdef __HAVE_ARCH_STRRCHR -char *strrchr(const char *s, int c) -{ - size_t len = __strend(s) - s; - - if 
(len) - do { - if (s[len] == (char) c) - return (char *) s + len; - } while (--len > 0); - return NULL; -} -EXPORT_SYMBOL(strrchr); -#endif - static inline int clcle(const char *s1, unsigned long l1, const char *s2, unsigned long l2) { - register unsigned long r2 asm("2") = (unsigned long) s1; - register unsigned long r3 asm("3") = (unsigned long) l1; - register unsigned long r4 asm("4") = (unsigned long) s2; - register unsigned long r5 asm("5") = (unsigned long) l2; + union register_pair r1 = { .even = (unsigned long)s1, .odd = l1, }; + union register_pair r3 = { .even = (unsigned long)s2, .odd = l2, }; int cc; - asm volatile ("0: clcle %1,%3,0\n" - " jo 0b\n" - " ipm %0\n" - " srl %0,28" - : "=&d" (cc), "+a" (r2), "+a" (r3), - "+a" (r4), "+a" (r5) : : "cc", "memory"); + asm volatile( + "0: clcle %[r1],%[r3],0\n" + " jo 0b\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [r1] "+&d" (r1.pair), [r3] "+&d" (r3.pair) + : + : "cc", "memory"); return cc; } @@ -315,15 +285,18 @@ EXPORT_SYMBOL(strstr); #ifdef __HAVE_ARCH_MEMCHR void *memchr(const void *s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; - asm volatile ("0: srst %0,%1\n" - " jo 0b\n" - " jl 1f\n" - " la %0,0\n" - "1:" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + " jl 1f\n" + " la %[ret],0\n" + "1:" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); return (void *) ret; } EXPORT_SYMBOL(memchr); @@ -333,7 +306,7 @@ EXPORT_SYMBOL(memchr); * memcmp - Compare two areas of memory * @s1: One area of memory * @s2: Another area of memory - * @count: The size of the area. + * @n: The size of the area. */ #ifdef __HAVE_ARCH_MEMCMP int memcmp(const void *s1, const void *s2, size_t n) @@ -360,13 +333,16 @@ EXPORT_SYMBOL(memcmp); #ifdef __HAVE_ARCH_MEMSCAN void *memscan(void *s, int c, size_t n) { - register int r0 asm("0") = (char) c; const void *ret = s + n; - asm volatile ("0: srst %0,%1\n" - " jo 0b\n" - : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory"); - return (void *) ret; + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); + return (void *)ret; } EXPORT_SYMBOL(memscan); #endif diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c new file mode 100644 index 000000000000..9e62d62812e5 --- /dev/null +++ b/arch/s390/lib/test_kprobes.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/kernel.h> +#include <linux/kprobes.h> +#include <linux/random.h> +#include <kunit/test.h> +#include "test_kprobes.h" + +static struct kprobe kp; + +static void setup_kprobe(struct kunit *test, struct kprobe *kp, + const char *symbol, int offset) +{ + kp->offset = offset; + kp->addr = NULL; + kp->symbol_name = symbol; +} + +static void test_kprobe_offset(struct kunit *test, struct kprobe *kp, + const char *target, int offset) +{ + int ret; + + setup_kprobe(test, kp, target, 0); + ret = register_kprobe(kp); + if (!ret) + unregister_kprobe(kp); + KUNIT_EXPECT_EQ(test, 0, ret); + setup_kprobe(test, kp, target, offset); + ret = register_kprobe(kp); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + if (!ret) + unregister_kprobe(kp); +} + +static void test_kprobe_odd(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_odd", + kprobes_target_odd_offs); +} + +static void test_kprobe_in_insn4(struct kunit *test) +{ + test_kprobe_offset(test, &kp, 
"kprobes_target_in_insn4", + kprobes_target_in_insn4_offs); +} + +static void test_kprobe_in_insn6_lo(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo", + kprobes_target_in_insn6_lo_offs); +} + +static void test_kprobe_in_insn6_hi(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi", + kprobes_target_in_insn6_hi_offs); +} + +static struct kunit_case kprobes_testcases[] = { + KUNIT_CASE(test_kprobe_odd), + KUNIT_CASE(test_kprobe_in_insn4), + KUNIT_CASE(test_kprobe_in_insn6_lo), + KUNIT_CASE(test_kprobe_in_insn6_hi), + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test_s390", + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h new file mode 100644 index 000000000000..2b4c9bc337f1 --- /dev/null +++ b/arch/s390/lib/test_kprobes.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_KPROBES_H +#define TEST_KPROBES_H + +extern unsigned long kprobes_target_odd_offs; +extern unsigned long kprobes_target_in_insn4_offs; +extern unsigned long kprobes_target_in_insn6_lo_offs; +extern unsigned long kprobes_target_in_insn6_hi_offs; + +#endif diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S new file mode 100644 index 000000000000..ade7a3042334 --- /dev/null +++ b/arch/s390/lib/test_kprobes_asm.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include <linux/linkage.h> +#include <asm/ftrace.h> + +#define KPROBES_TARGET_START(name) \ + SYM_FUNC_START(name); \ + FTRACE_GEN_NOP_ASM(name) + +#define KPROBES_TARGET_END(name) \ + SYM_FUNC_END(name); \ + SYM_DATA(name##_offs, .quad 1b - name) + +KPROBES_TARGET_START(kprobes_target_in_insn4) + .word 0x4700 // bc 0,0 +1: .word 0x0000 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn4) + +KPROBES_TARGET_START(kprobes_target_in_insn6_lo) + .word 0xe310 // ly 1,0 +1: .word 0x0000 + .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_lo) + +KPROBES_TARGET_START(kprobes_target_in_insn6_hi) + .word 0xe310 // ly 1,0 + .word 0x0000 +1: .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_hi) + +KPROBES_TARGET_START(kprobes_target_bp) + nop + .word 0x0000 + nop +1: br %r14 +KPROBES_TARGET_END(kprobes_target_bp) + +KPROBES_TARGET_START(kprobes_target_odd) + .byte 0x07 +1: .byte 0x07 + br %r14 +KPROBES_TARGET_END(kprobes_target_odd) diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c new file mode 100644 index 000000000000..9894009fc1f2 --- /dev/null +++ b/arch/s390/lib/test_modules.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <kunit/test.h> +#include <linux/module.h> + +#include "test_modules.h" + +/* + * Test that modules with many relocations are loaded properly. 
+ */ +static void test_modules_many_vmlinux_relocs(struct kunit *test) +{ + int result = 0; + +#define CALL_RETURN(i) result += test_modules_return_ ## i() + REPEAT_10000(CALL_RETURN); + KUNIT_ASSERT_EQ(test, result, 49995000); +} + +static struct kunit_case modules_testcases[] = { + KUNIT_CASE(test_modules_many_vmlinux_relocs), + {} +}; + +static struct kunit_suite modules_test_suite = { + .name = "modules_test_s390", + .test_cases = modules_testcases, +}; + +kunit_test_suites(&modules_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h new file mode 100644 index 000000000000..6371fcf17684 --- /dev/null +++ b/arch/s390/lib/test_modules.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_MODULES_H +#define TEST_MODULES_H + +#define __REPEAT_10000_3(f, x) \ + f(x ## 0); \ + f(x ## 1); \ + f(x ## 2); \ + f(x ## 3); \ + f(x ## 4); \ + f(x ## 5); \ + f(x ## 6); \ + f(x ## 7); \ + f(x ## 8); \ + f(x ## 9) +#define __REPEAT_10000_2(f, x) \ + __REPEAT_10000_3(f, x ## 0); \ + __REPEAT_10000_3(f, x ## 1); \ + __REPEAT_10000_3(f, x ## 2); \ + __REPEAT_10000_3(f, x ## 3); \ + __REPEAT_10000_3(f, x ## 4); \ + __REPEAT_10000_3(f, x ## 5); \ + __REPEAT_10000_3(f, x ## 6); \ + __REPEAT_10000_3(f, x ## 7); \ + __REPEAT_10000_3(f, x ## 8); \ + __REPEAT_10000_3(f, x ## 9) +#define __REPEAT_10000_1(f, x) \ + __REPEAT_10000_2(f, x ## 0); \ + __REPEAT_10000_2(f, x ## 1); \ + __REPEAT_10000_2(f, x ## 2); \ + __REPEAT_10000_2(f, x ## 3); \ + __REPEAT_10000_2(f, x ## 4); \ + __REPEAT_10000_2(f, x ## 5); \ + __REPEAT_10000_2(f, x ## 6); \ + __REPEAT_10000_2(f, x ## 7); \ + __REPEAT_10000_2(f, x ## 8); \ + __REPEAT_10000_2(f, x ## 9) +#define REPEAT_10000(f) \ + __REPEAT_10000_1(f, 0); \ + __REPEAT_10000_1(f, 1); \ + __REPEAT_10000_1(f, 2); \ + __REPEAT_10000_1(f, 3); \ + __REPEAT_10000_1(f, 4); \ + __REPEAT_10000_1(f, 5); \ + __REPEAT_10000_1(f, 6); \ + __REPEAT_10000_1(f, 7); \ + __REPEAT_10000_1(f, 8); \ + __REPEAT_10000_1(f, 9) + +#define DECLARE_RETURN(i) int test_modules_return_ ## i(void) +REPEAT_10000(DECLARE_RETURN); + +#endif diff --git a/arch/s390/lib/test_modules_helpers.c b/arch/s390/lib/test_modules_helpers.c new file mode 100644 index 000000000000..1670349a03eb --- /dev/null +++ b/arch/s390/lib/test_modules_helpers.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/export.h> + +#include "test_modules.h" + +#define DEFINE_RETURN(i) \ + int test_modules_return_ ## i(void) \ + { \ + return 1 ## i - 10000; \ + } \ + EXPORT_SYMBOL_GPL(test_modules_return_ ## i) +REPEAT_10000(DEFINE_RETURN); diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index bda7ac0ddd29..2848e3fb2ff5 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -3,20 +3,28 @@ * Test module for unwind_for_each_frame */ -#define pr_fmt(fmt) "test_unwind: " fmt +#include <kunit/test.h> #include <asm/unwind.h> #include <linux/completion.h> #include <linux/kallsyms.h> #include <linux/kthread.h> +#include <linux/ftrace.h> #include <linux/module.h> +#include <linux/timer.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/kprobes.h> #include <linux/wait.h> #include <asm/irq.h> -#include <asm/delay.h> + +static struct kunit *current_test; #define BT_BUF_SIZE (PAGE_SIZE * 4) +static bool force_bt; +module_param_named(backtrace, force_bt, bool, 0444); +MODULE_PARM_DESC(backtrace, "print backtraces for all tests"); + /* * To avoid printk line limit split backtrace by lines */ @@ -28,7 
+36,7 @@ static void print_backtrace(char *bt) p = strsep(&bt, "\n"); if (!p) break; - pr_err("%s\n", p); + kunit_err(current_test, "%s\n", p); } } @@ -39,7 +47,7 @@ static void print_backtrace(char *bt) static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, unsigned long sp) { - int frame_count, prev_is_func2, seen_func2_func1; + int frame_count, prev_is_func2, seen_func2_func1, seen_arch_rethook_trampoline; const int max_frames = 128; struct unwind_state state; size_t bt_pos = 0; @@ -48,13 +56,14 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC); if (!bt) { - pr_err("failed to allocate backtrace buffer\n"); + kunit_err(current_test, "failed to allocate backtrace buffer\n"); return -ENOMEM; } /* Unwind. */ frame_count = 0; prev_is_func2 = 0; seen_func2_func1 = 0; + seen_arch_rethook_trampoline = 0; unwind_for_each_frame(&state, task, regs, sp) { unsigned long addr = unwind_get_return_address(&state); char sym[KSYM_SYMBOL_LEN]; @@ -62,8 +71,9 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, if (frame_count++ == max_frames) break; if (state.reliable && !addr) { - pr_err("unwind state reliable but addr is 0\n"); - return -EINVAL; + kunit_err(current_test, "unwind state reliable but addr is 0\n"); + ret = -EINVAL; + break; } sprint_symbol(sym, addr); if (bt_pos < BT_BUF_SIZE) { @@ -73,28 +83,34 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, stack_type_name(state.stack_info.type), (void *)state.sp, (void *)state.ip); if (bt_pos >= BT_BUF_SIZE) - pr_err("backtrace buffer is too small\n"); + kunit_err(current_test, "backtrace buffer is too small\n"); } frame_count += 1; if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1")) seen_func2_func1 = 1; prev_is_func2 = str_has_prefix(sym, "unwindme_func2"); + if (str_has_prefix(sym, "arch_rethook_trampoline+0x0/")) + seen_arch_rethook_trampoline = 1; } /* Check the results. */ if (unwind_error(&state)) { - pr_err("unwind error\n"); + kunit_err(current_test, "unwind error\n"); ret = -EINVAL; } if (!seen_func2_func1) { - pr_err("unwindme_func2 and unwindme_func1 not found\n"); + kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n"); ret = -EINVAL; } if (frame_count == max_frames) { - pr_err("Maximum number of frames exceeded\n"); + kunit_err(current_test, "Maximum number of frames exceeded\n"); ret = -EINVAL; } - if (ret) + if (seen_arch_rethook_trampoline) { + kunit_err(current_test, "arch_rethook_trampoline+0x0 in unwinding results\n"); + ret = -EINVAL; + } + if (ret || force_bt) print_backtrace(bt); kfree(bt); return ret; @@ -118,31 +134,187 @@ static struct unwindme *unwindme; #define UWM_REGS 0x2 /* Pass regs to test_unwind(). */ #define UWM_SP 0x4 /* Pass sp to test_unwind(). */ #define UWM_CALLER 0x8 /* Unwind starting from caller. */ -#define UWM_SWITCH_STACK 0x10 /* Use CALL_ON_STACK. */ +#define UWM_SWITCH_STACK 0x10 /* Use call_on_stack. */ #define UWM_IRQ 0x20 /* Unwind from irq context. */ -#define UWM_PGM 0x40 /* Unwind from program check handler. */ +#define UWM_PGM 0x40 /* Unwind from program check handler */ +#define UWM_KPROBE_ON_FTRACE 0x80 /* Unwind from kprobe handler called via ftrace. */ +#define UWM_FTRACE 0x100 /* Unwind from ftrace handler. */ +#define UWM_KRETPROBE 0x200 /* Unwind through kretprobed function. */ +#define UWM_KRETPROBE_HANDLER 0x400 /* Unwind from kretprobe handler. 
*/ -static __always_inline unsigned long get_psw_addr(void) +static __always_inline struct pt_regs fake_pt_regs(void) { - unsigned long psw_addr; + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + regs.gprs[15] = current_stack_pointer; asm volatile( "basr %[psw_addr],0\n" - : [psw_addr] "=d" (psw_addr)); - return psw_addr; + : [psw_addr] "=d" (regs.psw.addr)); + return regs; } -#ifdef CONFIG_KPROBES -static int pgm_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int kretprobe_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { struct unwindme *u = unwindme; + if (!(u->flags & UWM_KRETPROBE_HANDLER)) + return 0; + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL, (u->flags & UWM_SP) ? u->sp : 0); + return 0; } + +static noinline notrace int test_unwind_kretprobed_func(struct unwindme *u) +{ + struct pt_regs regs; + + if (!(u->flags & UWM_KRETPROBE)) + return 0; + + regs = fake_pt_regs(); + return test_unwind(NULL, (u->flags & UWM_REGS) ? ®s : NULL, + (u->flags & UWM_SP) ? u->sp : 0); +} + +static noinline int test_unwind_kretprobed_func_caller(struct unwindme *u) +{ + return test_unwind_kretprobed_func(u); +} + +static int test_unwind_kretprobe(struct unwindme *u) +{ + int ret; + struct kretprobe my_kretprobe; + + if (!IS_ENABLED(CONFIG_KPROBES)) + kunit_skip(current_test, "requires CONFIG_KPROBES"); + + u->ret = -1; /* make sure kprobe is called */ + unwindme = u; + + memset(&my_kretprobe, 0, sizeof(my_kretprobe)); + my_kretprobe.handler = kretprobe_ret_handler; + my_kretprobe.maxactive = 1; + my_kretprobe.kp.addr = (kprobe_opcode_t *)test_unwind_kretprobed_func; + + ret = register_kretprobe(&my_kretprobe); + + if (ret < 0) { + kunit_err(current_test, "register_kretprobe failed %d\n", ret); + return -EINVAL; + } + + ret = test_unwind_kretprobed_func_caller(u); + unregister_kretprobe(&my_kretprobe); + unwindme = NULL; + if (u->flags & UWM_KRETPROBE_HANDLER) + ret = u->ret; + return ret; +} + +static int kprobe_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct unwindme *u = unwindme; + + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL, + (u->flags & UWM_SP) ? u->sp : 0); + return 0; +} + +extern const char test_unwind_kprobed_insn[]; + +static noinline void test_unwind_kprobed_func(void) +{ + asm volatile( + " nopr %%r7\n" + "test_unwind_kprobed_insn:\n" + " nopr %%r7\n" + :); +} + +static int test_unwind_kprobe(struct unwindme *u) +{ + struct kprobe kp; + int ret; + + if (!IS_ENABLED(CONFIG_KPROBES)) + kunit_skip(current_test, "requires CONFIG_KPROBES"); + if (!IS_ENABLED(CONFIG_KPROBES_ON_FTRACE) && u->flags & UWM_KPROBE_ON_FTRACE) + kunit_skip(current_test, "requires CONFIG_KPROBES_ON_FTRACE"); + + u->ret = -1; /* make sure kprobe is called */ + unwindme = u; + memset(&kp, 0, sizeof(kp)); + kp.pre_handler = kprobe_pre_handler; + kp.addr = u->flags & UWM_KPROBE_ON_FTRACE ? + (kprobe_opcode_t *)test_unwind_kprobed_func : + (kprobe_opcode_t *)test_unwind_kprobed_insn; + ret = register_kprobe(&kp); + if (ret < 0) { + kunit_err(current_test, "register_kprobe failed %d\n", ret); + return -EINVAL; + } + + test_unwind_kprobed_func(); + unregister_kprobe(&kp); + unwindme = NULL; + return u->ret; +} + +static void notrace __used test_unwind_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *fops, + struct ftrace_regs *fregs) +{ + struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2]; + + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL, + (u->flags & UWM_SP) ? 
u->sp : 0); +} + +static noinline int test_unwind_ftraced_func(struct unwindme *u) +{ + return READ_ONCE(u)->ret; +} + +static int test_unwind_ftrace(struct unwindme *u) +{ + int ret; +#ifdef CONFIG_DYNAMIC_FTRACE + struct ftrace_ops *fops; + + fops = kunit_kzalloc(current_test, sizeof(*fops), GFP_KERNEL); + fops->func = test_unwind_ftrace_handler; + fops->flags = FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_PERMANENT; +#else + kunit_skip(current_test, "requires CONFIG_DYNAMIC_FTRACE"); #endif + ret = ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 0, 0); + if (ret) { + kunit_err(current_test, "failed to set ftrace filter (%d)\n", ret); + return -1; + } + + ret = register_ftrace_function(fops); + if (!ret) { + ret = test_unwind_ftraced_func(u); + unregister_ftrace_function(fops); + } else { + kunit_err(current_test, "failed to register ftrace handler (%d)\n", ret); + } + + ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 1, 0); + return ret; +} + /* This function may or may not appear in the backtrace. */ static noinline int unwindme_func4(struct unwindme *u) { @@ -153,40 +325,15 @@ static noinline int unwindme_func4(struct unwindme *u) wait_event(u->task_wq, kthread_should_park()); kthread_parkme(); return 0; -#ifdef CONFIG_KPROBES - } else if (u->flags & UWM_PGM) { - struct kprobe kp; - int ret; - - unwindme = u; - memset(&kp, 0, sizeof(kp)); - kp.symbol_name = "do_report_trap"; - kp.pre_handler = pgm_pre_handler; - ret = register_kprobe(&kp); - if (ret < 0) { - pr_err("register_kprobe failed %d\n", ret); - return -EINVAL; - } - - /* - * trigger specification exception - */ - asm volatile( - " mvcl %%r1,%%r1\n" - "0: nopr %%r7\n" - EX_TABLE(0b, 0b) - :); - - unregister_kprobe(&kp); - unwindme = NULL; - return u->ret; -#endif + } else if (u->flags & (UWM_PGM | UWM_KPROBE_ON_FTRACE)) { + return test_unwind_kprobe(u); + } else if (u->flags & (UWM_KRETPROBE | UWM_KRETPROBE_HANDLER)) { + return test_unwind_kretprobe(u); + } else if (u->flags & UWM_FTRACE) { + return test_unwind_ftrace(u); } else { - struct pt_regs regs; + struct pt_regs regs = fake_pt_regs(); - memset(®s, 0, sizeof(regs)); - regs.psw.addr = get_psw_addr(); - regs.gprs[15] = current_stack_pointer(); return test_unwind(NULL, (u->flags & UWM_REGS) ? ®s : NULL, (u->flags & UWM_SP) ? u->sp : 0); @@ -203,12 +350,16 @@ static noinline int unwindme_func3(struct unwindme *u) /* This function must appear in the backtrace. 
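 * (test_unwind() verifies this via its seen_func2_func1 check:
 * unwindme_func2 must appear directly followed by its caller
 * unwindme_func1 in the unwound frames)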
*/ static noinline int unwindme_func2(struct unwindme *u) { + unsigned long flags, mflags; int rc; if (u->flags & UWM_SWITCH_STACK) { - preempt_disable(); - rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u); - preempt_enable(); + local_irq_save(flags); + local_mcck_save(mflags); + rc = call_on_stack(1, S390_lowcore.nodat_stack, + int, unwindme_func3, struct unwindme *, u); + local_mcck_restore(mflags); + local_irq_restore(flags); return rc; } else { return unwindme_func3(u); @@ -221,31 +372,27 @@ static noinline int unwindme_func1(void *u) return unwindme_func2((struct unwindme *)u); } -static void unwindme_irq_handler(struct ext_code ext_code, - unsigned int param32, - unsigned long param64) +static void unwindme_timer_fn(struct timer_list *unused) { struct unwindme *u = READ_ONCE(unwindme); - if (u && u->task == current) { + if (u) { unwindme = NULL; u->task = NULL; u->ret = unwindme_func1(u); + complete(&u->task_ready); } } +static struct timer_list unwind_timer; + static int test_unwind_irq(struct unwindme *u) { - preempt_disable(); - if (register_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler)) { - pr_info("Couldn't reqister external interrupt handler"); - return -1; - } - u->task = current; unwindme = u; - udelay(1); - unregister_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler); - preempt_enable(); + init_completion(&u->task_ready); + timer_setup(&unwind_timer, unwindme_timer_fn, 0); + mod_timer(&unwind_timer, jiffies + 1); + wait_for_completion(&u->task_ready); return u->ret; } @@ -265,7 +412,7 @@ static int test_unwind_task(struct unwindme *u) */ task = kthread_run(unwindme_func1, u, "%s", __func__); if (IS_ERR(task)) { - pr_err("kthread_run() failed\n"); + kunit_err(current_test, "kthread_run() failed\n"); return PTR_ERR(task); } /* @@ -280,68 +427,96 @@ static int test_unwind_task(struct unwindme *u) return ret; } -static int test_unwind_flags(int flags) +struct test_params { + int flags; + char *name; +}; + +/* + * Create required parameter list for tests + */ +#define TEST_WITH_FLAGS(f) { .flags = f, .name = #f } +static const struct test_params param_list[] = { + TEST_WITH_FLAGS(UWM_DEFAULT), + TEST_WITH_FLAGS(UWM_SP), + TEST_WITH_FLAGS(UWM_REGS), + TEST_WITH_FLAGS(UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_THREAD), + TEST_WITH_FLAGS(UWM_THREAD | UWM_SP), + TEST_WITH_FLAGS(UWM_THREAD | UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_PGM), + TEST_WITH_FLAGS(UWM_PGM | UWM_SP), + TEST_WITH_FLAGS(UWM_PGM | UWM_REGS), + TEST_WITH_FLAGS(UWM_PGM | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_REGS), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_FTRACE), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_REGS), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE), + 
TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP), + TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP | UWM_REGS), +}; + +/* + * Parameter description generator: required for KUNIT_ARRAY_PARAM() + */ +static void get_desc(const struct test_params *params, char *desc) +{ + strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE); +} + +/* + * Create test_unwind_gen_params + */ +KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc); + +static void test_unwind_flags(struct kunit *test) { struct unwindme u; + const struct test_params *params; - u.flags = flags; + current_test = test; + params = (const struct test_params *)test->param_value; + u.flags = params->flags; if (u.flags & UWM_THREAD) - return test_unwind_task(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_task(&u)); else if (u.flags & UWM_IRQ) - return test_unwind_irq(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u)); else - return unwindme_func1(&u); + KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u)); } -static int test_unwind_init(void) -{ - int ret = 0; - -#define TEST(flags) \ -do { \ - pr_info("[ RUN ] " #flags "\n"); \ - if (!test_unwind_flags((flags))) { \ - pr_info("[ OK ] " #flags "\n"); \ - } else { \ - pr_err("[ FAILED ] " #flags "\n"); \ - ret = -EINVAL; \ - } \ -} while (0) - - TEST(UWM_DEFAULT); - TEST(UWM_SP); - TEST(UWM_REGS); - TEST(UWM_SWITCH_STACK); - TEST(UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP); - TEST(UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); - TEST(UWM_THREAD); - TEST(UWM_THREAD | UWM_SP); - TEST(UWM_THREAD | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ); - TEST(UWM_IRQ | UWM_SWITCH_STACK); - TEST(UWM_IRQ | UWM_SP); - TEST(UWM_IRQ | UWM_REGS); - TEST(UWM_IRQ | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); -#ifdef CONFIG_KPROBES - TEST(UWM_PGM); - TEST(UWM_PGM | UWM_SP); - TEST(UWM_PGM | UWM_REGS); - TEST(UWM_PGM | UWM_SP | UWM_REGS); -#endif -#undef TEST +static struct kunit_case unwind_test_cases[] = { + KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params), + {} +}; - return ret; -} +static struct kunit_suite test_unwind_suite = { + .name = "test_unwind", + .test_cases = unwind_test_cases, +}; -static void test_unwind_exit(void) -{ -} +kunit_test_suites(&test_unwind_suite); -module_init(test_unwind_init); -module_exit(test_unwind_exit); MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/tishift.S b/arch/s390/lib/tishift.S new file mode 100644 index 000000000000..96214f51f49b --- /dev/null +++ b/arch/s390/lib/tishift.S @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/export.h> +#include <linux/linkage.h> +#include <asm/nospec-insn.h> + + .section .noinstr.text, "ax" + + GEN_BR_THUNK %r14 + +SYM_FUNC_START(__ashlti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + srlg %r3,%r1,0(%r3) + sllg %r0,%r0,0(%r4) + sllg %r1,%r1,0(%r4) + ogr %r0,%r3 + j 1f +0: sllg %r0,%r1,-64(%r4) + lghi %r1,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashlti3) +EXPORT_SYMBOL(__ashlti3) + +SYM_FUNC_START(__ashrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srag %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: 
srag %r1,%r0,-64(%r4) + srag %r0,%r0,63 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashrti3) +EXPORT_SYMBOL(__ashrti3) + +SYM_FUNC_START(__lshrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srlg %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: srlg %r1,%r0,-64(%r4) + lghi %r0,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__lshrti3) +EXPORT_SYMBOL(__lshrti3) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index c4f8039a35e8..61d8dcd95bbc 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -8,437 +8,179 @@ * Gerald Schaefer (gerald.schaefer@de.ibm.com) */ -#include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> -#include <linux/errno.h> #include <linux/mm.h> -#include <asm/mmu_context.h> -#include <asm/facility.h> +#include <asm/asm-extable.h> +#include <asm/ctlreg.h> -#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES -static DEFINE_STATIC_KEY_FALSE(have_mvcos); - -static int __init uaccess_init(void) +#ifdef CONFIG_DEBUG_ENTRY +void debug_user_asce(int exit) { - if (test_facility(27)) - static_branch_enable(&have_mvcos); - return 0; -} -early_initcall(uaccess_init); + struct ctlreg cr1, cr7; -static inline int copy_with_mvcos(void) -{ - if (static_branch_likely(&have_mvcos)) - return 1; - return 0; -} -#else -static inline int copy_with_mvcos(void) -{ - return 1; + local_ctl_store(1, &cr1); + local_ctl_store(7, &cr7); + if (cr1.val == S390_lowcore.kernel_asce.val && cr7.val == S390_lowcore.user_asce.val) + return; + panic("incorrect ASCE on kernel %s\n" + "cr1: %016lx cr7: %016lx\n" + "kernel: %016lx user: %016lx\n", + exit ? "exit" : "entry", cr1.val, cr7.val, + S390_lowcore.kernel_asce.val, S390_lowcore.user_asce.val); } -#endif +#endif /*CONFIG_DEBUG_ENTRY */ -void set_fs(mm_segment_t fs) +static unsigned long raw_copy_from_user_key(void *to, const void __user *from, + unsigned long size, unsigned long key) { - current->thread.mm_segment = fs; - if (fs == USER_DS) { - __ctl_load(S390_lowcore.user_asce, 1, 1); - clear_cpu_flag(CIF_ASCE_PRIMARY); - } else { - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); - } - if (fs & 1) { - if (fs == USER_DS_SACF) - __ctl_load(S390_lowcore.user_asce, 7, 7); - else - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - set_cpu_flag(CIF_ASCE_SECONDARY); - } -} -EXPORT_SYMBOL(set_fs); + unsigned long rem; + union oac spec = { + .oac2.key = key, + .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.k = 1, + .oac2.a = 1, + }; -mm_segment_t enable_sacf_uaccess(void) -{ - mm_segment_t old_fs; - unsigned long asce, cr; - - old_fs = current->thread.mm_segment; - if (old_fs & 1) - return old_fs; - current->thread.mm_segment |= 1; - asce = S390_lowcore.kernel_asce; - if (likely(old_fs == USER_DS)) { - __ctl_store(cr, 1, 1); - if (cr != S390_lowcore.kernel_asce) { - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - set_cpu_flag(CIF_ASCE_PRIMARY); - } - asce = S390_lowcore.user_asce; - } - __ctl_store(cr, 7, 7); - if (cr != asce) { - __ctl_load(asce, 7, 7); - set_cpu_flag(CIF_ASCE_SECONDARY); - } - return old_fs; -} -EXPORT_SYMBOL(enable_sacf_uaccess); - -void disable_sacf_uaccess(mm_segment_t old_fs) -{ - current->thread.mm_segment = old_fs; - if (old_fs == USER_DS && test_facility(27)) { - __ctl_load(S390_lowcore.user_asce, 1, 1); - clear_cpu_flag(CIF_ASCE_PRIMARY); - } -} -EXPORT_SYMBOL(disable_sacf_uaccess); - -static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, - unsigned long size) -{ - 
register unsigned long reg0 asm("0") = 0x01UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" - "6: jz 4f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n" - "7: slgr %0,%4\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; -} - -static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, - unsigned long size) -{ - unsigned long tmp1, tmp2; - mm_segment_t old_fs; - - old_fs = enable_sacf_uaccess(); - tmp1 = -256UL; asm volatile( - " sacf 0\n" - "0: mvcp 0(%0,%2),0(%1),%3\n" - "7: jz 5f\n" - "1: algr %0,%3\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "2: mvcp 0(%0,%2),0(%1),%3\n" - "8: jnz 1b\n" - " j 5f\n" - "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ - " lghi %3,-4096\n" - " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "4: mvcp 0(%4,%2),0(%1),%3\n" - "9: slgr %0,%4\n" - " j 6f\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b) - EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[from],%[val]\n" + " slgr %[to],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[from])\n" /* rem = from + 4095 */ + " nr %[rem],%[val]\n" /* rem = (from + 4095) & -4096 */ + " slgr %[rem],%[from]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [from] "+&a" (from), [to] "+&a" (to), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); return size; } unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (copy_with_mvcos()) - return copy_from_user_mvcos(to, from, n); - return copy_from_user_mvcp(to, from, n); + return raw_copy_from_user_key(to, from, n, 0); } EXPORT_SYMBOL(raw_copy_from_user); -static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, - unsigned long size) +unsigned long _copy_from_user_key(void *to, const void __user *from, + unsigned long n, unsigned long key) { - register unsigned long reg0 asm("0") = 0x010000UL; - unsigned long tmp1, tmp2; + unsigned long res = n; - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - "6: jz 4f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? 
*/ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%4,%1),0(%2),0\n" - "7: slgr %0,%4\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; + might_fault(); + if (!should_fail_usercopy()) { + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user_key(to, from, n, key); + instrument_copy_from_user_after(to, from, n, res); + } + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; } +EXPORT_SYMBOL(_copy_from_user_key); -static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, - unsigned long size) +static unsigned long raw_copy_to_user_key(void __user *to, const void *from, + unsigned long size, unsigned long key) { - unsigned long tmp1, tmp2; - mm_segment_t old_fs; + unsigned long rem; + union oac spec = { + .oac1.key = key, + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.k = 1, + .oac1.a = 1, + }; - old_fs = enable_sacf_uaccess(); - tmp1 = -256UL; asm volatile( - " sacf 0\n" - "0: mvcs 0(%0,%1),0(%2),%3\n" - "7: jz 5f\n" - "1: algr %0,%3\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "2: mvcs 0(%0,%1),0(%2),%3\n" - "8: jnz 1b\n" - " j 5f\n" - "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ - " lghi %3,-4096\n" - " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */ - " slgr %4,%1\n" - " clgr %0,%4\n" /* copy crosses next page boundary? */ - " jnh 6f\n" - "4: mvcs 0(%4,%1),0(%2),%3\n" - "9: slgr %0,%4\n" - " j 6f\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b) - EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[to],%[val]\n" + " slgr %[from],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ + " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ + " slgr %[rem],%[to]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [to] "+&a" (to), [from] "+&a" (from), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); return size; } unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (copy_with_mvcos()) - return copy_to_user_mvcos(to, from, n); - return copy_to_user_mvcs(to, from, n); + return raw_copy_to_user_key(to, from, n, 0); } EXPORT_SYMBOL(raw_copy_to_user); -static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, - unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x010001UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - /* FIXME: copy with reduced length. 
*/ - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 2f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2:slgr %0,%0\n" - "3: \n" - EX_TABLE(0b,3b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; -} - -static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, - unsigned long size) +unsigned long _copy_to_user_key(void __user *to, const void *from, + unsigned long n, unsigned long key) { - mm_segment_t old_fs; - unsigned long tmp1; - - old_fs = enable_sacf_uaccess(); - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - "0: aghi %0,257\n" - "1: mvc 0(1,%1),0(%2)\n" - " la %1,1(%1)\n" - " la %2,1(%2)\n" - " aghi %0,-1\n" - " jnz 1b\n" - " j 5f\n" - "2: mvc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,1b-0b(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); - return size; -} - -unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - if (copy_with_mvcos()) - return copy_in_user_mvcos(to, from, n); - return copy_in_user_mvc(to, from, n); -} -EXPORT_SYMBOL(raw_copy_in_user); - -static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x010000UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n" - " jz 4f\n" - "1: algr %0,%2\n" - " slgr %1,%2\n" - " j 0b\n" - "2: la %3,4095(%1)\n"/* %4 = to + 4095 */ - " nr %3,%2\n" /* %4 = (to + 4095) & -4096 */ - " slgr %3,%1\n" - " clgr %0,%3\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n" - " slgr %0,%3\n" - " j 5f\n" - "4: slgr %0,%0\n" - "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) - : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : "a" (empty_zero_page), "d" (reg0) : "cc", "memory"); - return size; -} - -static inline unsigned long clear_user_xc(void __user *to, unsigned long size) -{ - mm_segment_t old_fs; - unsigned long tmp1, tmp2; - - old_fs = enable_sacf_uaccess(); - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - " xc 0(1,%1),0(%1)\n" - "0: aghi %0,257\n" - " la %2,255(%1)\n" /* %2 = ptr + 255 */ - " srl %2,12\n" - " sll %2,12\n" /* %2 = (ptr + 255) & -4096 */ - " slgr %2,%1\n" - " clgr %0,%2\n" /* clear crosses next page boundary? 
*/ - " jnh 5f\n" - " aghi %2,-1\n" - "1: ex %2,0(%3)\n" - " aghi %2,1\n" - " slgr %0,%2\n" - " j 5f\n" - "2: xc 0(256,%1),0(%1)\n" - " la %1,256(%1)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,0(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2) - : : "cc", "memory"); - disable_sacf_uaccess(old_fs); - return size; + might_fault(); + if (should_fail_usercopy()) + return n; + instrument_copy_to_user(to, from, n); + return raw_copy_to_user_key(to, from, n, key); } +EXPORT_SYMBOL(_copy_to_user_key); unsigned long __clear_user(void __user *to, unsigned long size) { - if (copy_with_mvcos()) - return clear_user_mvcos(to, size); - return clear_user_xc(to, size); -} -EXPORT_SYMBOL(__clear_user); - -static inline unsigned long strnlen_user_srst(const char __user *src, - unsigned long size) -{ - register unsigned long reg0 asm("0") = 0; - unsigned long tmp1, tmp2; + unsigned long rem; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; asm volatile( - " la %2,0(%1)\n" - " la %3,0(%0,%1)\n" - " slgr %0,%0\n" - " sacf 256\n" - "0: srst %3,%2\n" - " jo 0b\n" - " la %0,1(%3)\n" /* strnlen_user results includes \0 */ - " slgr %0,%1\n" - "1: sacf 768\n" - EX_TABLE(0b,1b) - : "+a" (size), "+a" (src), "=a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[zeropg]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[to],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ + " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ + " slgr %[rem],%[to]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[zeropg]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [to] "+&a" (to), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [zeropg] "a" (empty_zero_page), [spec] "d" (spec.val) + : "cc", "memory", "0"); return size; } - -unsigned long __strnlen_user(const char __user *src, unsigned long size) -{ - mm_segment_t old_fs; - unsigned long len; - - if (unlikely(!size)) - return 0; - old_fs = enable_sacf_uaccess(); - len = strnlen_user_srst(src, size); - disable_sacf_uaccess(old_fs); - return len; -} -EXPORT_SYMBOL(__strnlen_user); - -long __strncpy_from_user(char *dst, const char __user *src, long size) -{ - size_t done, len, offset, len_str; - - if (unlikely(size <= 0)) - return 0; - done = 0; - do { - offset = (size_t)src & (L1_CACHE_BYTES - 1); - len = min(size - done, L1_CACHE_BYTES - offset); - if (copy_from_user(dst, src, len)) - return -EFAULT; - len_str = strnlen(dst, len); - done += len_str; - src += len_str; - dst += len_str; - } while ((len_str == len) && (done < size)); - return done; -} -EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__clear_user); diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c index 29d9470dbceb..fb924a8041dc 100644 --- a/arch/s390/lib/xor.c +++ b/arch/s390/lib/xor.c @@ -11,7 +11,8 @@ #include <linux/raid/xor.h> #include <asm/xor.h> -static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { asm volatile( " larl 1,2f\n" @@ -32,8 +33,9 @@ static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) : "0", "1", "cc", "memory"); } 
-static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) +static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { asm volatile( " larl 1,2f\n" @@ -58,8 +60,10 @@ static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "0", "1", "cc", "memory"); } -static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) +static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { asm volatile( " larl 1,2f\n" @@ -88,12 +92,12 @@ static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "0", "1", "cc", "memory"); } -static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) +static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { - /* Get around a gcc oddity */ - register unsigned long *reg7 asm ("7") = p5; - asm volatile( " larl 1,2f\n" " aghi %0,-1\n" @@ -122,7 +126,7 @@ static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, " xc 0(1,%1),0(%5)\n" "3:\n" : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4), - "+a" (reg7) + "+a" (p5) : : "0", "1", "cc", "memory"); } diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 3175413186b9..352ff520fd94 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,12 +4,10 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o pageattr.o pgtable.o pgalloc.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o - -KASAN_SANITIZE_kasan_init.o := n -obj-$(CONFIG_KASAN) += kasan_init.o +obj-$(CONFIG_PFAULT) += pfault.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index a51c892f14f3..f8b13f247646 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -14,15 +14,13 @@ #include <linux/moduleparam.h> #include <linux/gfp.h> #include <linux/sched.h> +#include <linux/string_helpers.h> #include <linux/sysctl.h> -#include <linux/ctype.h> #include <linux/swap.h> #include <linux/kthread.h> #include <linux/oom.h> -#include <linux/suspend.h> #include <linux/uaccess.h> -#include <asm/pgalloc.h> #include <asm/diag.h> #ifdef CONFIG_CMM_IUCV @@ -49,7 +47,6 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; -static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -93,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(addr >> PAGE_SHIFT, 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); @@ -151,9 +148,9 @@ static int cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (!cmm_suspended && (cmm_pages != cmm_pages_target || - 
cmm_timed_pages != cmm_timed_pages_target)) || - kthread_should_stop()); + cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -191,7 +188,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); + mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); } static void cmm_timer_fn(struct timer_list *unused) @@ -247,7 +244,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -266,7 +263,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -286,7 +283,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -299,8 +296,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -313,8 +309,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } @@ -337,17 +332,6 @@ static struct ctl_table cmm_table[] = { .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -390,54 +374,19 @@ static void cmm_smsg_target(const char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; -static int cmm_suspend(void) -{ - cmm_suspended = 1; - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); - return 0; -} - -static int cmm_resume(void) -{ - cmm_suspended = 0; - cmm_kick_thread(); - return 0; -} - -static int cmm_power_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - switch (event) { - case PM_POST_HIBERNATION: - return cmm_resume(); - case PM_HIBERNATION_PREPARE: - return cmm_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block cmm_power_notifier = { - .notifier_call = cmm_power_event, -}; - static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV /* convert sender to uppercase characters */ - if (sender) { - int len = strlen(sender); - while (len--) - sender[len] = toupper(sender[len]); - } else { + if (sender) + string_upper(sender, sender); + else sender = cmm_default_sender; - } rc = 
smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) @@ -446,16 +395,11 @@ static int __init cmm_init(void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - rc = register_pm_notifier(&cmm_power_notifier); - if (rc) - goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (!IS_ERR(cmm_thread_ptr)) return 0; rc = PTR_ERR(cmm_thread_ptr); - unregister_pm_notifier(&cmm_power_notifier); -out_pm: unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV @@ -475,7 +419,6 @@ static void __exit cmm_exit(void) #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif - unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); del_timer_sync(&cmm_timer); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 5d67b81c704a..d37a8f607b71 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,12 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/set_memory.h> +#include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> -#include <linux/sched.h> #include <linux/mm.h> +#include <linux/kfence.h> #include <linux/kasan.h> +#include <asm/ptdump.h> #include <asm/kasan.h> +#include <asm/abs_lowcore.h> +#include <asm/nospec-branch.h> #include <asm/sections.h> -#include <asm/pgtable.h> +#include <asm/maccess.h> static unsigned long max_addr; @@ -16,277 +21,295 @@ struct addr_marker { }; enum address_markers_idx { - IDENTITY_NR = 0, + IDENTITY_BEFORE_NR = 0, + IDENTITY_BEFORE_END_NR, + AMODE31_START_NR, + AMODE31_END_NR, KERNEL_START_NR, KERNEL_END_NR, -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, +#ifdef CONFIG_KFENCE + KFENCE_START_NR, + KFENCE_END_NR, #endif + IDENTITY_AFTER_NR, + IDENTITY_AFTER_END_NR, VMEMMAP_NR, + VMEMMAP_END_NR, VMALLOC_NR, + VMALLOC_END_NR, MODULES_NR, + MODULES_END_NR, + ABS_LOWCORE_NR, + ABS_LOWCORE_END_NR, + MEMCPY_REAL_NR, + MEMCPY_REAL_END_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif }; static struct addr_marker address_markers[] = { - [IDENTITY_NR] = {0, "Identity Mapping"}, + [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, + [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [AMODE31_START_NR] = {0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, "Amode31 Area End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, +#ifdef CONFIG_KFENCE + [KFENCE_START_NR] = {0, "KFence Pool Start"}, + [KFENCE_END_NR] = {0, "KFence Pool End"}, +#endif + [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, + [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, + [VMEMMAP_NR] = {0, "vmemmap Area Start"}, + [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, + [VMALLOC_NR] = {0, "vmalloc Area Start"}, + [VMALLOC_END_NR] = {0, "vmalloc Area End"}, + [MODULES_NR] = {0, "Modules Area Start"}, + [MODULES_END_NR] = {0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, #ifdef CONFIG_KASAN [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, #endif - [VMEMMAP_NR] = {0, "vmemmap Area"}, - [VMALLOC_NR] = {0, "vmalloc 
Area"}, - [MODULES_NR] = {0, "Modules Area"}, { -1, NULL } }; struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; int level; unsigned int current_prot; + bool check_wx; + unsigned long wx_pages; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt); \ +}) + static void print_prot(struct seq_file *m, unsigned int pr, int level) { static const char * const level_name[] = { "ASCE", "PGD", "PUD", "PMD", "PTE" }; - seq_printf(m, "%s ", level_name[level]); + pt_dump_seq_printf(m, "%s ", level_name[level]); if (pr & _PAGE_INVALID) { - seq_printf(m, "I\n"); + pt_dump_seq_printf(m, "I\n"); return; } - seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); - seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); + pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); } -static void note_page(struct seq_file *m, struct pg_state *st, - unsigned int new_prot, int level) +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ +#ifdef CONFIG_DEBUG_WX + if (!st->check_wx) + return; + if (st->current_prot & _PAGE_INVALID) + return; + if (st->current_prot & _PAGE_PROTECT) + return; + if (st->current_prot & _PAGE_NOEXEC) + return; + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. + */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + return; + WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - static const char units[] = "KMGTPE"; int width = sizeof(unsigned long) * 2; + static const char units[] = "KMGTPE"; const char *unit = units; - unsigned int prot, cur; unsigned long delta; + struct pg_state *st; + struct seq_file *m; + unsigned int prot; - /* - * If we have a "break" in the series, we need to flush the state - * that we have now. "break" is either changing perms, levels or - * address space marker. - */ - prot = new_prot; - cur = st->current_prot; - - if (!st->level) { - /* First entry */ - st->current_prot = new_prot; + st = container_of(pt_st, struct pg_state, ptdump); + m = st->seq; + prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC); + if (level == 4 && (val & _PAGE_INVALID)) + prot = _PAGE_INVALID; + /* For pmd_none() & friends val gets passed as zero. */ + if (level != 4 && !val) + prot = _PAGE_INVALID; + /* Final flush from generic code. 
*/ + if (level == -1) + addr = max_addr; + if (st->level == -1) { + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); + st->start_address = addr; + st->current_prot = prot; st->level = level; - st->marker = address_markers; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || - st->current_address >= st->marker[1].start_address) { - /* Print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - delta = (st->current_address - st->start_address) >> 10; + } else if (prot != st->current_prot || level != st->level || + addr >= st->marker[1].start_address) { + note_prot_wx(st, addr); + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + delta = (addr - st->start_address) >> 10; while (!(delta & 0x3ff) && unit[1]) { delta >>= 10; unit++; } - seq_printf(m, "%9lu%c ", delta, *unit); + pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (st->current_address >= st->marker[1].start_address) { + while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(m, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); } - st->start_address = st->current_address; - st->current_prot = new_prot; + st->start_address = addr; + st->current_prot = prot; st->level = level; } } -#ifdef CONFIG_KASAN -static void note_kasan_early_shadow_page(struct seq_file *m, - struct pg_state *st) -{ - unsigned int prot; - - prot = pte_val(*kasan_early_shadow_pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); -} -#endif - -/* - * The actual page table walker functions. In order to keep the - * implementation of print_prot() short, we only check and pass - * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region, - * segment or page table entry is invalid or read-only. - * After all it's just a hint that the current level being walked - * contains an invalid or read-only entry. 
- */ -static void walk_pte_level(struct seq_file *m, struct pg_state *st, - pmd_t *pmd, unsigned long addr) -{ - unsigned int prot; - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { - st->current_address = addr; - pte = pte_offset_kernel(pmd, addr); - prot = pte_val(*pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); - addr += PAGE_SIZE; - } -} - -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, - pud_t *pud, unsigned long addr) -{ - unsigned int prot; - pmd_t *pmd; - int i; - -#ifdef CONFIG_KASAN - if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif - - pmd = pmd_offset(pud, addr); - for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) { - st->current_address = addr; - if (!pmd_none(*pmd)) { - if (pmd_large(*pmd)) { - prot = pmd_val(*pmd) & - (_SEGMENT_ENTRY_PROTECT | - _SEGMENT_ENTRY_NOEXEC); - note_page(m, st, prot, 3); - } else - walk_pte_level(m, st, pmd, addr); - } else - note_page(m, st, _PAGE_INVALID, 3); - addr += PMD_SIZE; - } -} - -static void walk_pud_level(struct seq_file *m, struct pg_state *st, - p4d_t *p4d, unsigned long addr) +#ifdef CONFIG_DEBUG_WX +void ptdump_check_wx(void) { - unsigned int prot; - pud_t *pud; - int i; + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = NULL, + .level = -1, + .current_prot = 0, + .check_wx = true, + .wx_pages = 0, + .start_address = 0, + .marker = (struct addr_marker[]) { + { .start_address = 0, .name = NULL}, + { .start_address = -1, .name = NULL}, + }, + }; -#ifdef CONFIG_KASAN - if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) { - note_kasan_early_shadow_page(m, st); + if (!MACHINE_HAS_NX) return; - } -#endif - - pud = pud_offset(p4d, addr); - for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) { - st->current_address = addr; - if (!pud_none(*pud)) - if (pud_large(*pud)) { - prot = pud_val(*pud) & - (_REGION_ENTRY_PROTECT | - _REGION_ENTRY_NOEXEC); - note_page(m, st, prot, 2); - } else - walk_pmd_level(m, st, pud, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += PUD_SIZE; - } + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? 
+ "unexpected " : ""); } +#endif /* CONFIG_DEBUG_WX */ -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, - pgd_t *pgd, unsigned long addr) +#ifdef CONFIG_PTDUMP_DEBUGFS +static int ptdump_show(struct seq_file *m, void *v) { - p4d_t *p4d; - int i; - -#ifdef CONFIG_KASAN - if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = m, + .level = -1, + .current_prot = 0, + .check_wx = false, + .wx_pages = 0, + .start_address = 0, + .marker = address_markers, + }; - p4d = p4d_offset(pgd, addr); - for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) { - st->current_address = addr; - if (!p4d_none(*p4d)) - walk_pud_level(m, st, p4d, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += P4D_SIZE; - } + get_online_mems(); + mutex_lock(&cpa_mutex); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + mutex_unlock(&cpa_mutex); + put_online_mems(); + return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); +#endif /* CONFIG_PTDUMP_DEBUGFS */ -static void walk_pgd_level(struct seq_file *m) +/* + * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple + * insertion sort to preserve the original order of markers with the same + * start address. + */ +static void sort_address_markers(void) { - unsigned long addr = 0; - struct pg_state st; - pgd_t *pgd; - int i; + struct addr_marker tmp; + int i, j; - memset(&st, 0, sizeof(st)); - for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { - st.current_address = addr; - pgd = pgd_offset_k(addr); - if (!pgd_none(*pgd)) - walk_p4d_level(m, &st, pgd, addr); - else - note_page(m, &st, _PAGE_INVALID, 1); - addr += PGDIR_SIZE; - cond_resched(); + for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { + tmp = address_markers[i]; + for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) + address_markers[j + 1] = address_markers[j]; + address_markers[j + 1] = tmp; } - /* Flush out the last page */ - st.current_address = max_addr; - note_page(m, &st, 0, 0); -} - -static int ptdump_show(struct seq_file *m, void *v) -{ - walk_pgd_level(m); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); } -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int pt_dump_init(void) { +#ifdef CONFIG_KFENCE + unsigned long kfence_start = (unsigned long)__kfence_pool; +#endif /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. 
*/ - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (S390_lowcore.kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); + address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31; + address_markers[AMODE31_END_NR].start_address = (unsigned long)__eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; + address_markers[MODULES_END_NR].start_address = MODULES_END; + address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; + address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; + address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; + address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; +#ifdef CONFIG_KFENCE + address_markers[KFENCE_START_NR].start_address = kfence_start; + address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; +#endif + sort_address_markers(); +#ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); +#endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c new file mode 100644 index 000000000000..0a0738a473af --- /dev/null +++ b/arch/s390/mm/extable.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitfield.h> +#include <linux/extable.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/panic.h> +#include <asm/asm-extable.h> +#include <asm/extable.h> + +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + size_t num; + + fixup = search_exception_tables(addr); + if (fixup) + return fixup; + num = __stop_amode31_ex_table - __start_amode31_ex_table; + return search_extable(__start_amode31_ex_table, num, addr); +} + +static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + size_t len = FIELD_GET(EX_DATA_LEN, ex->data); + + regs->gprs[reg_err] = -EFAULT; + memset((void *)regs->gprs[reg_addr], 0, len); + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) +{ + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_zeropad(const struct 
exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; + + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; + regs->psw.addr = extable_fixup(ex); + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = s390_search_extables(instruction_pointer(regs)); + if (!ex) + return false; + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UA_STORE: + return ex_handler_ua_store(ex, regs); + case EX_TYPE_UA_LOAD_MEM: + return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_LOAD_REG: + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); + } + panic("invalid exception table entry"); +} diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index fd0dae9d10f4..e41869f5cc95 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -20,9 +20,9 @@ #include <linux/ctype.h> #include <linux/ioport.h> #include <linux/refcount.h> +#include <linux/pgtable.h> #include <asm/diag.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/ebcdic.h> #include <asm/errno.h> #include <asm/extmem.h> @@ -289,15 +289,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ -313,15 +315,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long goto out_free; } - rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); - - if (rc) - goto out_free; - seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (seg->res == NULL) { rc = -ENOMEM; - goto out_shared; + goto out_free; } seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; seg->res->start = seg->start_addr; @@ -331,16 +328,21 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long seg->res_name[8] = '\0'; strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. 
*/ if (request_resource(&iomem_resource, seg->res)) { rc = -EBUSY; - kfree(seg->res); - goto out_shared; + goto out_free_resource; } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + if (do_nonshared) diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, &start_addr, &end_addr); @@ -351,14 +353,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); rc = diag_cc; - goto out_resource; + goto out_mapping; } if (diag_cc > 1) { pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); rc = dcss_diag_translate_rc(end_addr); dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); - goto out_resource; + goto out_mapping; } seg->start_addr = start_addr; seg->end = end_addr; @@ -377,15 +379,16 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long (void*) seg->end, segtype_string[seg->vm_segtype]); } goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_resource: release_resource(seg->res); + out_free_resource: kfree(seg->res); - out_shared: - vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_free: kfree(seg); out: - return rc; + return rc < 0 ? rc : segtype; } /* @@ -400,8 +403,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long * -EIO : could not perform query or load diagnose * -ENOENT : no such segment * -EOPNOTSUPP: multi-part segment cannot be used with linux - * -ENOSPC : segment cannot be used (overlaps with storage) - * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -EBUSY : segment cannot be used (overlaps with dcss or storage) * -ERANGE : segment cannot be used (exceeds kernel mapping range) * -EPERM : segment is currently loaded with incompatible permissions * -ENOMEM : out of memory @@ -626,10 +628,6 @@ void segment_warning(int rc, char *seg_name) pr_err("DCSS %s has multiple page ranges and cannot be " "loaded or queried\n", seg_name); break; - case -ENOSPC: - pr_err("DCSS %s overlaps with used storage and cannot " - "be loaded\n", seg_name); - break; case -EBUSY: pr_err("%s needs used memory resources and cannot be " "loaded or queried\n", seg_name); @@ -642,10 +640,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 7b0bb475c166..ac4c78546d97 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -3,17 +3,19 @@ * S390 version * Copyright IBM Corp. 
1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/debug.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -31,38 +33,30 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/kfence.h> +#include <asm/asm-extable.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/fault.h> #include <asm/diag.h> -#include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL - -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 -#define VM_FAULT_PFAULT 0x100000 - enum fault_type { KERNEL_FAULT, USER_FAULT, - VDSO_FAULT, GMAP_FAULT, }; -static unsigned long store_indication __read_mostly; +static DEFINE_STATIC_KEY_FALSE(have_store_indication); static int __init fault_init(void) { if (test_facility(75)) - store_indication = 0xc00; + static_branch_enable(&have_store_indication); return 0; } early_initcall(fault_init); @@ -72,88 +66,88 @@ early_initcall(fault_init); */ static enum fault_type get_fault_type(struct pt_regs *regs) { - unsigned long trans_exc_code; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long & 3; - if (likely(trans_exc_code == 0)) { - /* primary space exception */ - if (IS_ENABLED(CONFIG_PGSTE) && - test_pt_regs_flag(regs, PIF_GUEST_FAULT)) - return GMAP_FAULT; - if (current->thread.mm_segment == USER_DS) + if (likely(teid.as == PSW_BITS_AS_PRIMARY)) { + if (user_mode(regs)) return USER_FAULT; - return KERNEL_FAULT; - } - if (trans_exc_code == 2) { - /* secondary space exception */ - if (current->thread.mm_segment & 1) { - if (current->thread.mm_segment == USER_DS_SACF) - return USER_FAULT; + if (!IS_ENABLED(CONFIG_PGSTE)) return KERNEL_FAULT; - } - return VDSO_FAULT; + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) + return GMAP_FAULT; + return KERNEL_FAULT; } - if (trans_exc_code == 1) { - /* access register mode, not used in the kernel */ + if (teid.as == PSW_BITS_AS_SECONDARY) return USER_FAULT; - } - /* home space exception -> access via kernel ASCE */ + /* Access register mode, not used in the kernel */ + if (teid.as == PSW_BITS_AS_ACCREG) + return USER_FAULT; + /* Home space -> access via kernel ASCE */ return KERNEL_FAULT; } -static int bad_address(void *p) +static unsigned long get_fault_address(struct pt_regs *regs) { - unsigned long dummy; + union teid teid = { .val = regs->int_parm_long }; - return probe_kernel_address((unsigned long *)p, dummy); + return teid.addr * PAGE_SIZE; +} + +static __always_inline bool fault_is_write(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; + + if (static_branch_likely(&have_store_indication)) + return teid.fsi == TEID_FSI_STORE; + return false; } static void dump_pagetable(unsigned long asce, unsigned long address) { - unsigned long *table = __va(asce & _ASCE_ORIGIN); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); 
pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R1:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R2:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R3:%016lx ", *table); - if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(entry & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("S:%016lx ", *table); - if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("P:%016lx ", *table); + pr_cont("P:%016lx ", entry); out: pr_cont("\n"); return; @@ -163,212 +157,113 @@ bad: static void dump_fault_info(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; unsigned long asce; pr_alert("Failing address: %016lx TEID: %016lx\n", - regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + get_fault_address(regs), teid.val); pr_alert("Fault in "); - switch (regs->int_parm_long & 3) { - case 3: + switch (teid.as) { + case PSW_BITS_AS_HOME: pr_cont("home space "); break; - case 2: + case PSW_BITS_AS_SECONDARY: pr_cont("secondary space "); break; - case 1: + case PSW_BITS_AS_ACCREG: pr_cont("access register "); break; - case 0: + case PSW_BITS_AS_PRIMARY: pr_cont("primary space "); break; } pr_cont("mode while using "); switch (get_fault_type(regs)) { case USER_FAULT: - asce = S390_lowcore.user_asce; + asce = S390_lowcore.user_asce.val; pr_cont("user "); break; - case VDSO_FAULT: - asce = S390_lowcore.vdso_asce; - pr_cont("vdso "); - break; case GMAP_FAULT: - asce = ((struct gmap *) S390_lowcore.gmap)->asce; + asce = ((struct gmap *)S390_lowcore.gmap)->asce; pr_cont("gmap "); break; case KERNEL_FAULT: - asce = S390_lowcore.kernel_asce; + asce = S390_lowcore.kernel_asce.val; pr_cont("kernel "); break; default: unreachable(); } pr_cont("ASCE.\n"); - dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); + dump_pagetable(asce, get_fault_address(regs)); } int show_unhandled_signals = 1; void 
report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", - regs->int_code & 0xffff, regs->int_code >> 17); + pr_alert("User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); print_vma_addr(KERN_CONT "in ", regs->psw.addr); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (is_mm_fault) dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { report_user_fault(regs, SIGSEGV, 1); - force_sig_fault(SIGSEGV, si_code, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -const struct exception_table_entry *s390_search_extables(unsigned long addr) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { - const struct exception_table_entry *fixup; - - fixup = search_extable(__start_dma_ex_table, - __stop_dma_ex_table - __start_dma_ex_table, - addr); - if (!fixup) - fixup = search_exception_tables(addr); - return fixup; -} - -static noinline void do_no_context(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; + enum fault_type fault_type; + unsigned long address; + bool is_write; - /* Are we prepared to handle this kernel fault? */ - fixup = s390_search_extables(regs->psw.addr); - if (fixup) { - regs->psw.addr = extable_fixup(fixup); + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } + if (fixup_exception(regs)) return; + fault_type = get_fault_type(regs); + if (fault_type == KERNEL_FAULT) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; } - - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - if (get_fault_type(regs) == KERNEL_FAULT) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " in virtual kernel address space\n"); + if (fault_type == KERNEL_FAULT) + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); else - printk(KERN_ALERT "Unable to handle kernel paging request" - " in virtual user address space\n"); + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); dump_fault_info(regs); die(regs, "Oops"); - do_exit(SIGKILL); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. */ - die (regs, "Low-address protection"); - do_exit(SIGKILL); - } + struct mm_struct *mm = current->mm; - do_no_context(regs); -} - -static noinline void do_sigbus(struct pt_regs *regs) -{ - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. 
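The report_user_fault() hunk above replaces the global printk_ratelimit() with a ratelimit state private to this message, so unrelated noisy printk users can no longer consume this diagnostic's budget. A sketch of the idiom, using only the standard <linux/ratelimit.h> API (report_demo_fault is a hypothetical name):

#include <linux/printk.h>
#include <linux/ratelimit.h>

static void report_demo_fault(void)
{
        /* token bucket owned by this callsite, not shared kernel-wide */
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);

        if (!__ratelimit(&rs))
                return;
        pr_alert("demo: rate-limited diagnostic\n");
}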
- */ - force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline int signal_return(struct pt_regs *regs) +static void do_sigbus(struct pt_regs *regs) { - u16 instruction; - int rc; - - rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - if (rc) - return rc; - if (instruction == 0x0a77) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x00040077; - return 0; - } else if (instruction == 0x0aad) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x000400ad; - return 0; - } - return -EACCES; -} - -static noinline void do_fault_error(struct pt_regs *regs, int access, - vm_fault_t fault) -{ - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - if (access == VM_EXEC && signal_return(regs) == 0) - break; - /* fallthrough */ - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - break; - } - /* fallthrough */ - case VM_FAULT_BADCONTEXT: - /* fallthrough */ - case VM_FAULT_PFAULT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGSEGV) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -377,178 +272,178 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) */ -static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct gmap *gmap; - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - enum fault_type type; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; + enum fault_type type; unsigned int flags; + struct gmap *gmap; vm_fault_t fault; + bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ - clear_pt_regs_flag(regs, PIF_PER_TRAP); - + clear_thread_flag(TIF_PER_TRAP); if (kprobe_page_fault(regs, 14)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. 
- */ - fault = VM_FAULT_BADCONTEXT; + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); type = get_fault_type(regs); switch (type) { case KERNEL_FAULT: - goto out; - case VDSO_FAULT: - fault = VM_FAULT_BADMAP; - goto out; + return handle_fault_error_nolock(regs, 0); case USER_FAULT: case GMAP_FAULT: if (faulthandler_disabled() || !mm) - goto out; + return handle_fault_error_nolock(regs, 0); break; } - - address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - down_read(&mm->mmap_sem); + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + goto lock_mmap; + } + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + if (unlikely(fault & VM_FAULT_ERROR)) + goto error; + return; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: + mmap_read_lock(mm); gmap = NULL; if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { - gmap = (struct gmap *) S390_lowcore.gmap; + gmap = (struct gmap *)S390_lowcore.gmap; current->thread.gmap_addr = address; current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } + if (address == -EFAULT) + return handle_fault_error(regs, SEGV_MAPERR); if (gmap->pfault_enabled) flags |= FAULT_FLAG_RETRY_NOWAIT; } - retry: - fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) - goto out_up; - + return handle_fault_error(regs, SEGV_MAPERR); if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; + return handle_fault_error(regs, SEGV_MAPERR); + vma = expand_stack(mm, address); + if (!vma) + return handle_fault_error_nolock(regs, SEGV_MAPERR); } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ - fault = VM_FAULT_BADACCESS; if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(vma, address, flags); - /* No reason to continue if interrupted by SIGKILL. 
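The lock_vma_under_rcu() fast path introduced above follows the generic per-VMA-lock shape that several architectures adopted: resolve the fault under a per-VMA read lock and fall back to the full mmap_read_lock() walk only when the VMA lookup fails, access is not permitted, or the fault must be retried. A condensed sketch of that control flow under those assumptions (error paths elided; demo_fault is a hypothetical wrapper):

static void demo_fault(struct mm_struct *mm, struct pt_regs *regs,
                       unsigned long address, unsigned int flags,
                       unsigned long access)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        /* Fast path: no mmap_lock, only the per-VMA read lock. */
        vma = lock_vma_under_rcu(mm, address);
        if (!vma)
                goto lock_mmap;
        if (!(vma->vm_flags & access)) {
                vma_end_read(vma);
                goto lock_mmap;
        }
        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);
        if (!(fault & VM_FAULT_RETRY))
                return;         /* handled (or failed) without mmap_lock */
lock_mmap:
        /* Slow path: classic locked walk. */
        mmap_read_lock(mm);
        /* ... find_vma(), handle_mm_fault(), retry handling ... */
        mmap_read_unlock(mm);
}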
*/ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { - fault = VM_FAULT_SIGNAL; + return handle_fault_error(regs, SEGV_ACCERR); + fault = handle_mm_fault(vma, address, flags, regs); + if (fault_signal_pending(fault, regs)) { if (flags & FAULT_FLAG_RETRY_NOWAIT) - goto out_up; - goto out; + mmap_read_unlock(mm); + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; } - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) { + if (gmap) { + mmap_read_lock(mm); + goto gmap; } - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* FAULT_FLAG_RETRY_NOWAIT has been set, - * mmap_sem has not been released */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~(FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_RETRY_NOWAIT); - flags |= FAULT_FLAG_TRIED; - down_read(&mm->mmap_sem); - goto retry; + return; + } + if (unlikely(fault & VM_FAULT_ERROR)) { + mmap_read_unlock(mm); + goto error; + } + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, + * mmap_lock has not been released + */ + current->thread.gmap_pfault = 1; + return handle_fault_error(regs, 0); } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + mmap_read_lock(mm); + goto retry; } +gmap: if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } + if (address == -EFAULT) + return handle_fault_error(regs, SEGV_MAPERR); if (address == -ENOMEM) { fault = VM_FAULT_OOM; - goto out_up; + mmap_read_unlock(mm); + goto error; } } - fault = 0; -out_up: - up_read(&mm->mmap_sem); -out: - return fault; + mmap_read_unlock(mm); + return; +error: + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & VM_FAULT_SIGBUS) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigbus(regs); + } else { + BUG(); + } } void do_protection_exception(struct pt_regs *regs) { - unsigned long trans_exc_code; - int access; - vm_fault_t fault; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long; /* * Protection exceptions are suppressing, decrement psw address. * The exception to this rule are aborted transactions, for these @@ -561,258 +456,140 @@ void do_protection_exception(struct pt_regs *regs) * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. 
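The new error: label above centralizes the vm_fault_t decoding that the deleted do_fault_error() did with synthetic VM_FAULT_BAD* codes. The VM_FAULT_* error values are independent bits rather than an enum, so they have to be tested individually; a stripped-down, user-mode-only sketch of that dispatch (demo_fault_addr is a hypothetical helper returning the faulting user address):

static void demo_fault_error(struct pt_regs *regs, vm_fault_t fault)
{
        if (fault & VM_FAULT_OOM)
                pagefault_out_of_memory();      /* may OOM-kill the task */
        else if (fault & VM_FAULT_SIGSEGV)
                force_sig_fault(SIGSEGV, SEGV_MAPERR, demo_fault_addr(regs));
        else if (fault & VM_FAULT_SIGBUS)
                force_sig_fault(SIGBUS, BUS_ADRERR, demo_fault_addr(regs));
        else
                BUG();          /* unexpected VM_FAULT_ERROR bit */
}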
*/ - if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs); - return; + if (unlikely(!teid.b61)) { + if (user_mode(regs)) { + /* Low-address protection in user mode: cannot happen */ + die(regs, "Low-address protection"); + } + /* + * Low-address protection in kernel mode means + * NULL pointer write access in kernel mode. + */ + return handle_fault_error_nolock(regs, 0); } - if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { - regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | - (regs->psw.addr & PAGE_MASK); - access = VM_EXEC; - fault = VM_FAULT_BADACCESS; - } else { - access = VM_WRITE; - fault = do_exception(regs, access); + if (unlikely(MACHINE_HAS_NX && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_WRITE); } NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access; - vm_fault_t fault; - - access = VM_READ | VM_EXEC | VM_WRITE; - fault = do_exception(regs, access); - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_ACCESS_FLAGS); } NOKPROBE_SYMBOL(do_dat_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) -{ - pfault_disable = 1; - return 1; -} - -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -static struct pfault_refbk pfault_init_refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD -}; - -int pfault_init(void) -{ - int rc; - - if (pfault_disable) - return -1; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; -} - -static struct pfault_refbk pfault_fini_refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, -}; +#if IS_ENABLED(CONFIG_PGSTE) -void pfault_fini(void) +void do_secure_storage_access(struct pt_regs *regs) { - - if (pfault_disable) - return; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %0,0,0x258\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -#define PF_COMPLETE 0x0080 - -/* - * The mechanism of our pfault code: if Linux is running as guest, runs a user - * space process and the user space process accesses a page that the host has - * paged out we get a pfault interrupt. - * - * This allows us, within the guest, to schedule a different process. Without - * this mechanism the host would have to suspend the whole virtual cpu until - * the page has been paged in. - * - * So when we get such an interrupt then we set the state of the current task - * to uninterruptible and also set the need_resched flag. Both happens within - * interrupt context(!). If we later on want to return to user space we - * recognize the need_resched flag and then call schedule(). It's not very - * obvious how this works... 
- * - * Of course we have a lot of additional fun with the completion interrupt (-> - * host signals that a page of a process has been paged in and the process can - * continue to run). This interrupt can arrive on any cpu and, since we have - * virtual cpus, actually appear before the interrupt that signals that a page - * is missing. - */ -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); + struct vm_area_struct *vma; + struct mm_struct *mm; + struct page *page; + struct gmap *gmap; + int rc; /* - * Get the external interruption subcode & pfault initial/completion - * signal bit. VM stores this in the 'cpu address' field associated - * with the external interrupt. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. */ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PID_MASK; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & PF_COMPLETE) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (tsk->state == TASK_RUNNING) - tsk->thread.pfault_wait = -1; + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - goto block; - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. 
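The removed comment above makes a subtle point that is easy to miss: before a task is published on a wait list for a remote wakeup, the list must own a task reference, because another CPU can wake the task, and the task can exit, before this CPU ever reaches schedule(). The pattern in isolation, assuming an external lock protects the list (the demo_* names are hypothetical):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/task.h>   /* get_task_struct()/put_task_struct() */

static LIST_HEAD(demo_waiters);         /* protected by a lock, not shown */

static void demo_block_current(struct list_head *entry)
{
        /*
         * Pin the task: once it is visible on the list, a remote CPU
         * may wake it and the task may run to exit before this CPU
         * gets to schedule(), so the list needs its own reference.
         */
        get_task_struct(current);
        list_add(entry, &demo_waiters);
        __set_current_state(TASK_UNINTERRUPTIBLE);
        /* the actual sleep happens on the way back to user space */
}

static void demo_wake(struct task_struct *tsk, struct list_head *entry)
{
        list_del(entry);
        wake_up_process(tsk);
        put_task_struct(tsk);           /* drop the list's reference */
}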
*/ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); -block: - /* Since this must be a userspace fault, there - * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ - __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); + /* + * The kernel should never run into this case and + * there is no way out of this situation. + */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); + } + switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) + return handle_fault_error_nolock(regs, SEGV_MAPERR); + fallthrough; + case USER_FAULT: + mm = current->mm; + mmap_read_lock(mm); + vma = find_vma(mm, addr); + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); + page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + mmap_read_unlock(mm); + break; } + if (arch_make_page_accessible(page)) + send_sig(SIGSEGV, current, 0); + put_page(page); + mmap_read_unlock(mm); + break; + case KERNEL_FAULT: + page = phys_to_page(addr); + if (unlikely(!try_get_page(page))) + break; + rc = arch_make_page_accessible(page); + put_page(page); + if (rc) + BUG(); + break; + default: + unreachable(); } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); } +NOKPROBE_SYMBOL(do_secure_storage_access); -static int pfault_cpu_dead(unsigned int cpu) +void do_non_secure_storage_access(struct pt_regs *regs) { - struct thread_struct *thread, *next; - struct task_struct *tsk; - - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - return 0; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + unsigned long gaddr = get_fault_address(regs); + + if (WARN_ON_ONCE(get_fault_type(regs) != GMAP_FAULT)) + return handle_fault_error_nolock(regs, SEGV_MAPERR); + if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) + send_sig(SIGSEGV, current, 0); } +NOKPROBE_SYMBOL(do_non_secure_storage_access); -static int __init pfault_irq_init(void) +void do_secure_storage_violation(struct pt_regs *regs) { - int rc; - - rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", - NULL, pfault_cpu_dead); - return 0; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + unsigned long gaddr = get_fault_address(regs); -out_pfault: - unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; + /* + * Either KVM messed up the secure guest mapping or the same + * page is mapped into multiple secure guests. + * + * This exception is only triggered when a guest 2 is running + * and can therefore never occur in kernel context. 
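do_secure_storage_access() above follows a common shape: translate to a host address, take a page reference under mmap_lock, perform the possibly sleeping conversion, then drop the reference. A compressed sketch of just that pin, operate, unpin core as this patch uses it (demo_touch_user_page is hypothetical; arch_make_page_accessible is the s390 ultravisor helper the patch calls):

static int demo_touch_user_page(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        struct page *page;
        int rc = -EFAULT;

        mmap_read_lock(mm);
        vma = find_vma(mm, addr);
        if (!vma || vma->vm_start > addr)
                goto out;
        /* FOLL_GET pins the page so it survives past the unlock below */
        page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
        if (IS_ERR_OR_NULL(page))
                goto out;
        rc = arch_make_page_accessible(page);
        put_page(page);
out:
        mmap_read_unlock(mm);
        return rc;
}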
+ */ + pr_warn_ratelimited("Secure storage violation in task: %s, pid %d\n", + current->comm, current->pid); + send_sig(SIGSEGV, current, 0); } -early_initcall(pfault_irq_init); -#endif /* CONFIG_PFAULT */ +#endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index edcdca97e85e..6f96b5a71c63 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2,7 +2,7 @@ /* * KVM guest address space mapping code * - * Copyright IBM Corp. 2007, 2016, 2018 + * Copyright IBM Corp. 2007, 2020 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * David Hildenbrand <david@redhat.com> * Janosch Frank <frankja@linux.vnet.ibm.com> @@ -17,17 +17,28 @@ #include <linux/swapops.h> #include <linux/ksm.h> #include <linux/mman.h> - -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> +#include <asm/page.h> #include <asm/tlb.h> #define GMAP_SHADOW_FAKE_TABLE 1ULL +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + /** * gmap_alloc - allocate and initialize a guest address space - * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. @@ -56,24 +67,24 @@ static struct gmap *gmap_alloc(unsigned long limit) atype = _ASCE_TYPE_REGION1; etype = _REGION1_ENTRY_EMPTY; } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); INIT_LIST_HEAD(&gmap->pt_list); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); - INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); refcount_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) goto out_free; page->index = 0; list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -300,7 +311,7 @@ struct gmap *gmap_get_enabled(void) EXPORT_SYMBOL_GPL(gmap_get_enabled); /* - * gmap_alloc_table is assumed to be called with mmap_sem held + * gmap_alloc_table is assumed to be called with mmap_lock held */ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long init, unsigned long gaddr) @@ -309,15 +320,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & 
_REGION_ENTRY_TYPE_MASK); page->index = gaddr; page = NULL; @@ -337,12 +348,11 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, static unsigned long __gmap_segment_gaddr(unsigned long *entry) { struct page *page; - unsigned long offset, mask; + unsigned long offset; offset = (unsigned long) entry / sizeof(unsigned long); offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); + page = pmd_pgtable_page((pmd_t *) entry); return page->index + offset; } @@ -405,10 +415,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) flush |= __gmap_unmap_by_gaddr(gmap, to + off); - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); return 0; @@ -438,7 +448,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) { /* Remove old translation */ flush |= __gmap_unmap_by_gaddr(gmap, to + off); @@ -448,7 +458,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, (void *) from + off)) break; } - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); if (off >= len) @@ -466,7 +476,7 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * Returns user space address which corresponds to the guest address or * -EFAULT if no such mapping exists. * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. * * Note: Can also be called for shadow gmaps. @@ -495,16 +505,16 @@ unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) { unsigned long rc; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); rc = __gmap_translate(gmap, gaddr); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_translate); /** * gmap_unlink - disconnect a page table from the gmap shadow tables - * @gmap: pointer to guest mapping meta data structure + * @mm: pointer to the parent mm_struct * @table: pointer to the host page table * @vmaddr: vm address associated with the host page table */ @@ -527,14 +537,14 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, unsigned long gaddr); /** - * gmap_link - set up shadow page tables to connect a host to a guest address + * __gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure * @gaddr: guest address * @vmaddr: vm address * * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT * if the vm address is already mapped to a different guest segment. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. 
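A recurring theme in the gmap.c hunks above is switching allocations to GFP_KERNEL_ACCOUNT (and adding __GFP_ACCOUNT to the atomic radix-tree flags), so guest page-table memory is charged to the owning process's memory cgroup instead of remaining invisible kernel overhead. The idiom in isolation (demo names are hypothetical):

#include <linux/gfp.h>
#include <linux/slab.h>

struct demo_obj {
        unsigned long data[64];
};

static struct demo_obj *demo_obj_alloc(void)
{
        /* charged to the current task's memcg */
        return kzalloc(sizeof(struct demo_obj), GFP_KERNEL_ACCOUNT);
}

static struct page *demo_table_alloc(unsigned int order)
{
        /* same accounting for multi-page table allocations */
        return alloc_pages(GFP_KERNEL_ACCOUNT, order);
}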
*/ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) @@ -558,7 +568,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -566,7 +576,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -574,7 +584,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -594,7 +604,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) return rc; ptl = pmd_lock(mm, pmd); @@ -640,7 +650,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr, int rc; bool unlocked; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); retry: unlocked = false; @@ -649,13 +659,13 @@ retry: rc = vmaddr; goto out_up; } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked)) { rc = -EFAULT; goto out_up; } /* - * In the case that fixup_user_fault unlocked the mmap_sem during + * In the case that fixup_user_fault unlocked the mmap_lock during * faultin redo __gmap_translate to not race with a map/unmap_segment. 
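gmap_fault() above illustrates the fixup_user_fault() contract: the helper may temporarily drop mmap_lock (reported through *unlocked), in which case any translation computed under the old critical section is stale and must be redone. A sketch of the retry loop under those assumptions (demo_translate and demo_resolve are hypothetical):

static unsigned long demo_translate(unsigned long gaddr)
{
        return gaddr;   /* hypothetical guest-to-host translation */
}

static int demo_resolve(struct mm_struct *mm, unsigned long gaddr)
{
        unsigned long vmaddr;
        bool unlocked;
        int rc;

        mmap_read_lock(mm);
retry:
        unlocked = false;
        vmaddr = demo_translate(gaddr);
        rc = fixup_user_fault(mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
        if (rc)
                goto out;
        /* lock was dropped and retaken: the translation may be stale */
        if (unlocked)
                goto retry;
out:
        mmap_read_unlock(mm);
        return rc;
}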
*/ if (unlocked) @@ -663,16 +673,17 @@ retry: rc = __gmap_link(gmap, gaddr, vmaddr); out_up: - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_fault); /* - * this function is assumed to be called with mmap_sem held + * this function is assumed to be called with mmap_lock held */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { + struct vm_area_struct *vma; unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; @@ -682,11 +693,17 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; + + vma = vma_lookup(gmap->mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + /* Get pointer to the page table entry */ ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) + if (likely(ptep)) { ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(ptep, ptl); + } } } EXPORT_SYMBOL_GPL(__gmap_zap); @@ -696,7 +713,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) unsigned long gaddr, vmaddr, size; struct vm_area_struct *vma; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); for (gaddr = from; gaddr < to; gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { /* Find the vm address for the guest address */ @@ -717,9 +734,9 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); + zap_page_range_single(vma, vmaddr, size, NULL); } - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); } EXPORT_SYMBOL_GPL(gmap_discard); @@ -787,47 +804,51 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, static inline unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { - unsigned long *table; + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) - return NULL; if (gmap_is_shadow(gmap) && gmap->removed) return NULL; - if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + + if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) return NULL; - table = gmap->table; - switch (gmap->asce & _ASCE_TYPE_MASK) { + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + switch (asce_type) { case _ASCE_TYPE_REGION1: table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + 
table = __va(*table & _SEGMENT_ENTRY_ORIGIN); table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; @@ -875,10 +896,10 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, BUG_ON(gmap_is_shadow(gmap)); fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) return -EFAULT; if (unlocked) - /* lost mmap_sem, caller has to retry __gmap_translate */ + /* lost mmap_lock, caller has to retry __gmap_translate */ return 0; /* Connect the page tables */ return __gmap_link(gmap, gaddr, vmaddr); @@ -886,12 +907,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, /** * gmap_pte_op_end - release the page table lock - * @ptl: pointer to the spinlock pointer + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock */ -static void gmap_pte_op_end(spinlock_t *ptl) +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) { - if (ptl) - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } /** @@ -949,7 +970,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) * -EAGAIN if a fixup is needed * -EINVAL if unsupported notifier bits have been specified * - * Expected to be called with sg->mm->mmap_sem in read and + * Expected to be called with sg->mm->mmap_lock in read and * guest_table_lock held. */ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, @@ -964,18 +985,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, return -EAGAIN; if (prot == PROT_NONE && !pmd_i) { - pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (prot == PROT_READ && !pmd_p) { - pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; - pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (bits & GMAP_NOTIFY_MPROT) - pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); /* Shadow GMAP protection needs split PMDs */ if (bits & GMAP_NOTIFY_SHADOW) @@ -995,14 +1016,14 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EAGAIN if a fixup is needed. * - * Expected to be called with sg->mm->mmap_sem in read + * Expected to be called with sg->mm->mmap_lock in read */ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pmd_t *pmdp, int prot, unsigned long bits) { int rc; pte_t *ptep; - spinlock_t *ptl = NULL; + spinlock_t *ptl; unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) @@ -1016,7 +1037,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); return rc; } @@ -1031,7 +1052,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EFAULT if gaddr is invalid (or mapping for shadows is missing). * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
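Several hunks above (__gmap_zap(), gmap_protect_pte(), gmap_read_table()) rely on the get_locked_pte()/pte_unmap_unlock() pairing: the helper maps the PTE and returns with the page-table spinlock held, and the unlock must be given the same pte pointer so the page-table mapping is dropped along with the lock. In isolation (demo_update is a hypothetical operation):

static void demo_update(pte_t *ptep)
{
        /* hypothetical PTE manipulation, done under the PTL */
}

static void demo_with_pte(struct mm_struct *mm, unsigned long addr)
{
        spinlock_t *ptl;
        pte_t *ptep;

        ptep = get_locked_pte(mm, addr, &ptl);
        if (!ptep)
                return;         /* no page table, or it could not be mapped */
        demo_update(ptep);
        pte_unmap_unlock(ptep, ptl);    /* unlocks ptl and unmaps ptep */
}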
*/ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) @@ -1102,9 +1123,9 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, return -EINVAL; if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_mprotect_notify); @@ -1120,7 +1141,7 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify); * if reading using the virtual address failed. -EINVAL if called on a gmap * shadow. * - * Called with gmap->mm->mmap_sem in read. + * Called with gmap->mm->mmap_lock in read. */ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) { @@ -1140,12 +1161,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; - pte_val(*ptep) |= _PAGE_YOUNG; + *val = *(unsigned long *)__va(address); + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } if (!rc) break; @@ -1173,6 +1194,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1180,6 +1202,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; @@ -1214,11 +1242,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, vmaddr = __gmap_translate(parent, paddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = raddr; - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) { kfree(rmap); return rc; @@ -1232,7 +1260,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } radix_tree_preload_end(); if (rc) { @@ -1268,7 +1296,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) { asm volatile( - " .insn rrf,0xb98e0000,%0,%1,0,0" + " idte %0,0,%1" : : "a" (asce), "a" (vaddr) : "cc", "memory"); } @@ -1318,7 +1346,8 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; + unsigned long *ste; + phys_addr_t sto, pgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1326,13 +1355,13 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> 
_SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1348,19 +1377,19 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; struct page *page; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + page = phys_to_page(pgt); list_del(&page->lru); page_table_free_pgste(page); } @@ -1375,7 +1404,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1384,12 +1414,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1405,19 +1435,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1432,7 +1462,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1441,12 +1472,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | 
_ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1462,7 +1493,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1470,11 +1501,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1489,8 +1520,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1498,12 +1530,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1519,22 +1551,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1556,7 +1589,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch 
(sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1692,11 +1725,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, } spin_unlock(&parent->shadow_lock); /* protect after insertion, so it will get properly invalidated */ - down_read(&parent->mm->mmap_sem); + mmap_read_lock(parent->mm); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, PROT_READ, GMAP_NOTIFY_SHADOW); - up_read(&parent->mm->mmap_sem); + mmap_read_unlock(parent->mm); spin_lock(&parent->shadow_lock); new->initialized = true; if (rc) { @@ -1719,31 +1752,32 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its decendents. + * remove the shadow r2 table and all of its descendants. * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1758,9 +1792,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); @@ -1781,8 +1815,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1809,25 +1842,26 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
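The unshadow and shadow hunks above all make the same type-level change: table origins read from region and segment entries are kept as phys_addr_t and converted with __va() or phys_to_page() exactly where they are dereferenced or freed, instead of being cast through unsigned long pointers. A minimal sketch of the round-trips involved:

#include <linux/mm.h>
#include <asm/page.h>

static void demo_phys_virt(struct page *page)
{
        phys_addr_t phys = page_to_phys(page);  /* what a table entry stores */
        unsigned long *virt = __va(phys);       /* what the CPU dereferences */

        /* the conversions are inverses of each other */
        WARN_ON(phys_to_page(phys) != page);
        WARN_ON(__pa(virt) != phys);
}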
*/ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1840,10 +1874,11 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, goto out_free; } else if (*table & _REGION_ENTRY_ORIGIN) { rc = -EAGAIN; /* Race with shadow */ + goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); @@ -1864,8 +1899,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1892,25 +1926,26 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
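
[Editor's note] The gmap_shadow_r3t() hunk above carries a genuine fix: the -EAGAIN race branch gains a previously missing "goto out_free". Without it, control fell through, overwrote an entry another shadow had already installed, and leaked the fresh allocation. The pattern in isolation (names hypothetical):

#include <stdio.h>
#include <stdlib.h>

static int entry_already_set = 1;	/* pretend another thread won the race */

static int shadow_table(void)
{
	int rc = 0;
	void *page = malloc(4096);

	if (!page)
		return -12;		/* -ENOMEM */
	if (entry_already_set) {
		rc = -11;		/* -EAGAIN: race with shadow */
		goto out_free;		/* the fix: don't fall through and reinstall */
	}
	/* ... install page as the new table ... */
	return 0;
out_free:
	free(page);
	return rc;
}

int main(void)
{
	printf("rc=%d\n", shadow_table());
	return 0;
}
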
*/ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1925,9 +1960,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; @@ -1948,8 +1983,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1966,7 +2000,7 @@ out_free: EXPORT_SYMBOL_GPL(gmap_shadow_sgt); /** - * gmap_shadow_lookup_pgtable - find a shadow page table + * gmap_shadow_pgt_lookup - find a shadow page table * @sg: pointer to the shadow guest address space structure * @saddr: the address in the shadow aguest address space * @pgt: parent gmap address of the page table to get shadowed @@ -1976,7 +2010,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt); * Returns 0 if the shadow page table was found and -EAGAIN if the page * table was not found. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. 
*/ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, @@ -2016,14 +2050,15 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * shadow table structure is incomplete, -ENOMEM if out of memory, * -EFAULT if an address in the parent gmap could not be resolved and * - * Called with gmap->mm->mmap_sem in read + * Called with gmap->mm->mmap_lock in read */ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; + unsigned long *table; struct page *page; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); @@ -2034,7 +2069,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, page->index = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + s_pgt = page_to_phys(page); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2067,8 +2102,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2095,7 +2129,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) { @@ -2111,7 +2145,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) parent = sg->parent; prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; @@ -2123,7 +2157,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rc = vmaddr; break; } - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) break; rc = -EAGAIN; @@ -2134,7 +2168,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); if (!tptep) { spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); radix_tree_preload_end(); break; } @@ -2145,7 +2179,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rmap = NULL; rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); spin_unlock(&sg->guest_table_lock); } radix_tree_preload_end(); @@ -2160,7 +2194,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) } EXPORT_SYMBOL_GPL(gmap_shadow_page); -/** +/* * gmap_shadow_notify - handle notifications for shadow gmap * * Called with sg->parent->shadow_lock. @@ -2220,7 +2254,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, /** * ptep_notify - call all invalidation callbacks for a specific pte. 
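
[Editor's sketch] In gmap_shadow_page() above, the rmap allocation and the radix-tree preload move from GFP_KERNEL to GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT), charging the memory to the calling task's memory cgroup. A kernel-context fragment of the idiom, not standalone, with a hypothetical struct name:

#include <linux/slab.h>

struct demo_rmap {
	unsigned long raddr;	/* stand-in for the real shadow rmap */
};

static struct demo_rmap *demo_rmap_alloc(void)
{
	/*
	 * __GFP_ACCOUNT charges the allocation to current's memcg, so a
	 * guest driving shadow-table churn cannot pin unaccounted kernel
	 * memory on the host.
	 */
	return kzalloc(sizeof(struct demo_rmap), GFP_KERNEL_ACCOUNT);
}
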
* @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space + * @vmaddr: virtual address in the process address space * @pte: pointer to the page table entry * @bits: bits from the pgste that caused the notify call * @@ -2264,7 +2298,7 @@ EXPORT_SYMBOL_GPL(ptep_notify); static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, unsigned long gaddr) { - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); } @@ -2283,7 +2317,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, { gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); - pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2291,7 +2325,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *pmdp = new; + set_pmd(pmdp, new); } static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, @@ -2313,7 +2347,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, _SEGMENT_ENTRY_GMAP_UC)); if (purge) __pmdp_csp(pmdp); - pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); } @@ -2436,7 +2470,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, return false; /* Clear UC indication and reset protection */ - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); return true; } @@ -2473,36 +2507,55 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], continue; if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) set_bit(i, bitmap); - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } } gmap_pmd_op_end(gmap, pmdp); } EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static const struct mm_walk_ops thp_split_walk_ops = { + .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, +}; + static inline void thp_split_mm(struct mm_struct *mm) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma; - unsigned long addr; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - for (addr = vma->vm_start; - addr < vma->vm_end; - addr += PAGE_SIZE) - follow_page(vma, addr, FOLL_SPLIT); - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); + walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; -#endif } +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Remove all empty zero pages from the mapping for lazy refaulting * - This must be called after mm->context.has_pgste is set, to avoid * future creation of zero pages - * - This must be called after THP was enabled + * - This must be called after THP was disabled. 
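
[Editor's sketch] Several hunks above replace direct pmd_val(*pmdp) &= ... stores with set_pmd(pmdp, clear_pmd_bit(...)), funneling every page-table write through one accessor. A user-space model of the typed-wrapper pattern (bit values illustrative):

#include <stdio.h>

typedef struct { unsigned long val; } demo_pmd_t;	/* opaque entry type */
#define DEMO_GMAP_IN 0x8000UL				/* illustrative flag */

static demo_pmd_t demo_clear_pmd_bit(demo_pmd_t pmd, unsigned long bit)
{
	return (demo_pmd_t){ pmd.val & ~bit };		/* compute the new value */
}

static void demo_set_pmd(demo_pmd_t *pmdp, demo_pmd_t new)
{
	/* single choke point: the real set_pmd() can grow barriers or
	 * instrumentation without touching every caller */
	*pmdp = new;
}

int main(void)
{
	demo_pmd_t pmd = { 0x12345000UL | DEMO_GMAP_IN };

	demo_set_pmd(&pmd, demo_clear_pmd_bit(pmd, DEMO_GMAP_IN));
	printf("entry now %#lx\n", pmd.val);
	return 0;
}
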
+ * + * mm contracts with s390, that even if mm were to remove a page table, + * racing with the loop below and so causing pte_offset_map_lock() to fail, + * it will never insert a page table containing empty zero pages once + * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set. */ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) @@ -2514,6 +2567,8 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, spinlock_t *ptl; ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!ptep) + break; if (is_zero_pfn(pte_pfn(*ptep))) ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); pte_unmap_unlock(ptep, ptl); @@ -2523,6 +2578,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, static const struct mm_walk_ops zap_zero_walk_ops = { .pmd_entry = __zap_zero_pages, + .walk_lock = PGWALK_WRLOCK, }; /* @@ -2538,16 +2594,27 @@ int s390_enable_sie(void) /* Fail if the page tables are 2K */ if (!mm_alloc_pgste(mm)) return -EINVAL; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); +int gmap_mark_unmergeable(void) +{ + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + return ksm_disable(current->mm); +} +EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); + /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2560,6 +2627,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, return 0; } +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. 
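
[Editor's sketch] Throughout this file, open-coded down_read(&mm->mmap_sem)/up_read() pairs become mmap_read_lock(mm)/mmap_read_unlock(mm), with the write-side equivalents in s390_enable_sie() above. A stand-alone model of why such wrappers help: one function to change if the lock's type or instrumentation changes. A pthread rwlock stands in for the mmap lock:

#include <pthread.h>
#include <stdio.h>

struct demo_mm { pthread_rwlock_t mmap_lock; };

static void demo_mmap_read_lock(struct demo_mm *mm)    { pthread_rwlock_rdlock(&mm->mmap_lock); }
static void demo_mmap_read_unlock(struct demo_mm *mm)  { pthread_rwlock_unlock(&mm->mmap_lock); }
static void demo_mmap_write_lock(struct demo_mm *mm)   { pthread_rwlock_wrlock(&mm->mmap_lock); }
static void demo_mmap_write_unlock(struct demo_mm *mm) { pthread_rwlock_unlock(&mm->mmap_lock); }

int main(void)
{
	struct demo_mm mm = { PTHREAD_RWLOCK_INITIALIZER };

	demo_mmap_write_lock(&mm);	/* e.g. flipping mm-wide flags, as in s390_enable_sie() */
	demo_mmap_write_unlock(&mm);
	demo_mmap_read_lock(&mm);	/* e.g. walking VMAs for protection changes */
	demo_mmap_read_unlock(&mm);
	printf("lock discipline ok\n");
	return 0;
}
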
+ */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, unsigned long hmask, unsigned long next, struct mm_walk *walk) @@ -2582,39 +2661,36 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, end = start + HPAGE_SIZE - 1; __storage_key_init_range(start, end); set_bit(PG_arch_1, &page->flags); + cond_resched(); return 0; } static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, }; int s390_enable_skey(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; int rc = 0; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); if (mm_uses_skeys(mm)) goto out_up; mm->context.uses_skeys = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.uses_skeys = 0; - rc = -ENOMEM; - goto out_up; - } + rc = gmap_mark_unmergeable(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; } - mm->def_flags &= ~VM_MERGEABLE; - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return rc; } EXPORT_SYMBOL_GPL(s390_enable_skey); @@ -2631,12 +2707,188 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, static const struct mm_walk_ops reset_cmma_walk_ops = { .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, }; void s390_reset_cmma(struct mm_struct *mm) { - down_write(&mm->mmap_sem); + mmap_write_lock(mm); walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); } EXPORT_SYMBOL_GPL(s390_reset_cmma); + +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + +/* + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. + */ +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) +{ + unsigned long i; + + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); + +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. 
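
[Editor's note] The new __s390_enable_skey_pmd() callback above exists purely to cond_resched() once per pmd, which is what the "256 pages" in its comment refers to. The arithmetic, assuming s390's usual 1 MiB segment and 4 KiB base page:

#include <stdio.h>

#define PAGE_SHIFT    12UL	/* 4 KiB base pages */
#define SEGMENT_SHIFT 20UL	/* 1 MiB segment (pmd) on s390 */

int main(void)
{
	unsigned long pages_per_pmd = 1UL << (SEGMENT_SHIFT - PAGE_SHIFT);

	/* one storage key set per page, one voluntary reschedule per pmd */
	printf("pages per pmd: %lu\n", pages_per_pmd);	/* 256 */
	return 0;
}
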
+ * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } + return 0; +} +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); + +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; + + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. 
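
[Editor's sketch] __s390_uv_destroy_range() above never holds the mmap lock across the slow destroy-page UVCs: it gathers up to GATHER_GET_PAGES pfns under the lock (the walk callback returns nonzero to stop early), drops the lock, destroys the batch, and resumes from state.next. A user-space model of that gather-then-process loop, with a pthread mutex standing in for mmap_lock:

#include <pthread.h>
#include <stdio.h>

#define BATCH 32
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* gather up to BATCH items from [*next, end); returns how many were taken */
static unsigned long gather(unsigned long *next, unsigned long end,
			    unsigned long *items)
{
	unsigned long n = 0;

	while (*next < end && n < BATCH)
		items[n++] = (*next)++;
	return n;
}

int main(void)
{
	unsigned long items[BATCH], next = 0, n;

	do {
		pthread_mutex_lock(&lock);	/* mmap_read_lock() in the kernel */
		n = gather(&next, 100, items);
		pthread_mutex_unlock(&lock);	/* dropped before the slow work */
		/* destroy the batch here: slow, sleepable, lock not held */
	} while (n == BATCH);			/* a partial batch means the range is done */
	printf("processed up to %lu\n", next);
	return 0;
}
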
+ */ +int s390_replace_asce(struct gmap *gmap) +{ + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + page->index = 0; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. + */ + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index b0246c705a19..297a6d897d5a 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -2,15 +2,19 @@ /* * IBM System z Huge TLB Page Support for Kernel. * - * Copyright IBM Corp. 2007,2016 + * Copyright IBM Corp. 2007,2020 * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ #define KMSG_COMPONENT "hugetlb" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <asm/pgalloc.h> #include <linux/mm.h> #include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/security.h> /* * If the bit selected by single-bit bitmask "a" is set within "x", move @@ -69,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + unsigned long pteval; int present; - pte_t pte; if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) present = pud_present(__pud(rste)); @@ -98,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste) * u unused, l large */ if (present) { - pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; - pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT; - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ, - _PAGE_READ); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, - _PAGE_WRITE); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, - _PAGE_INVALID); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, - _PAGE_PROTECT); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, - _PAGE_DIRTY); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, - _PAGE_YOUNG); + pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pteval |= _PAGE_LARGE | _PAGE_PRESENT; + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG); #ifdef CONFIG_MEM_SOFT_DIRTY - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, - _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, - _PAGE_NOEXEC); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); } else - 
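
[Editor's sketch] s390_replace_asce() above swaps in the copied top-level table by splicing the new origin into the old ASCE while keeping every control bit: (gmap->asce & ~_ASCE_ORIGIN) | __pa(table). The bit manipulation in isolation, with an illustrative mask and a 4 KiB-aligned origin assumed:

#include <stdio.h>

#define ASCE_ORIGIN_MASK 0xfffffffffffff000UL	/* illustrative origin field */

int main(void)
{
	unsigned long old_asce  = 0x0000000123456007UL;	/* origin | control bits */
	unsigned long new_table = 0x0000000987654000UL;	/* phys of the copy */
	unsigned long new_asce;

	new_asce = (old_asce & ~ASCE_ORIGIN_MASK) | new_table;
	/* the control bits (here 0x007) survive; only the origin changed */
	printf("old %#lx -> new %#lx\n", old_asce, new_asce);
	return 0;
}
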
pte_val(pte) = _PAGE_INVALID; - return pte; + pteval = _PAGE_INVALID; + return __pte(pteval); } static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) @@ -146,7 +142,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) __storage_key_init_range(paddr, paddr + size - 1); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; @@ -156,12 +152,21 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ - if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE; - else + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + if (likely(pte_present(pte))) + rste |= _REGION3_ENTRY_LARGE; + rste |= _REGION_ENTRY_TYPE_R3; + } else if (likely(pte_present(pte))) rste |= _SEGMENT_ENTRY_LARGE; + clear_huge_pte_skeys(mm, rste); - pte_val(*ptep) = rste; + set_pte(ptep, __pte(rste)); +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); } pte_t huge_ptep_get(pte_t *ptep) @@ -183,7 +188,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; @@ -238,32 +243,100 @@ int pud_huge(pud_t pud) return pud_large(pud); } -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) +bool __init arch_hugetlb_valid_size(unsigned long size) { - if (flags & FOLL_GET) - return NULL; + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + return true; + else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + return true; + else + return false; +} - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = current->mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } -static __init int setup_hugepagesz(char *opt) +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr0, unsigned long len, + unsigned long pgoff, unsigned long flags) { - unsigned long size; - char *string = opt; - - size = memparse(opt, &opt); - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz= specifies an unsupported page size %s\n", - string); - return 0; + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + unsigned long addr; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * 
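
[Editor's sketch] The __rste_to_pte() rewrite just above keeps translating flags between the segment-entry and pte layouts with move_set_bit(): if the bit selected by a is set in x, set bit b in the result. A self-contained, behaviorally equivalent version for the single-bit case (the kernel's macro is shift-based; bit positions here are illustrative):

#include <stdio.h>

/* if the bit selected by a is set in x, return b, else 0 */
static unsigned long move_set_bit(unsigned long x, unsigned long a,
				  unsigned long b)
{
	return (x & a) ? b : 0;
}

#define SEG_READ  0x0002UL	/* illustrative _SEGMENT_ENTRY_READ */
#define PAGE_READ 0x0100UL	/* illustrative _PAGE_READ */

int main(void)
{
	unsigned long rste = SEG_READ;	/* segment entry with READ set */
	unsigned long pteval = 0;

	pteval |= move_set_bit(rste, SEG_READ, PAGE_READ);
	printf("pteval %#lx (READ carried over)\n", pteval);
	return 0;
}
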
so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); } - return 1; + + return addr; +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + goto check_asce_limit; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + if (mm->get_unmapped_area == arch_get_unmapped_area) + addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + addr = hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); + if (offset_in_page(addr)) + return addr; + +check_asce_limit: + return check_asce_limit(mm, addr, len); } -__setup("hugepagesz=", setup_hugepagesz); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index f0ce22220565..43e612bc2bcd 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -31,30 +31,35 @@ #include <linux/cma.h> #include <linux/gfp.h> #include <linux/dma-direct.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> +#include <asm/kfence.h> +#include <asm/ptdump.h> #include <asm/dma.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> #include <asm/set_memory.h> #include <asm/kasan.h> #include <asm/dma-mapping.h> #include <asm/uv.h> +#include <linux/virtio_anchor.h> +#include <linux/virtio_config.h> -pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); + +struct ctlreg __bootdata_preserved(s390_invalid_asce); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -bool initmem_freed; - static void __init setup_zero_pages(void) { unsigned int order; @@ -88,70 +93,44 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); + vmem_map_init(); - kasan_copy_shadow(init_mm.pgd); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - 
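
[Editor's sketch] The top-down allocator above relies on vm_unmapped_area() returning either a page-aligned address or a small negative errno, so offset_in_page(addr), i.e. addr & ~PAGE_MASK, distinguishes success from failure and triggers the bottom-up retry. The encoding in miniature (PAGE_SIZE illustrative):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define ENOMEM    12

static unsigned long search(int fail)
{
	/* success: an aligned address; failure: -errno, never page-aligned */
	return fail ? (unsigned long)-ENOMEM : 0x7f0000000000UL;
}

int main(void)
{
	unsigned long addr = search(1);

	if (addr & (PAGE_SIZE - 1)) {	/* offset_in_page(addr) != 0 */
		printf("top-down failed (%ld), retry bottom-up\n", (long)addr);
		addr = search(0);
	}
	printf("mapped at %#lx\n", addr);
	return 0;
}
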
__ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); - - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_dma_bits = 31; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; - set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); + debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared, (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloca) */ for (i = 0; i < numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } @@ -168,10 +147,11 @@ static void pv_init(void) if (!is_prot_virt_guest()) return; + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + /* make sure bounce buffers are shared */ - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); swiotlb_update_mem_attributes(); - swiotlb_force = SWIOTLB_FORCE; } void __init mem_init(void) @@ -183,25 +163,17 @@ void __init mem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); pv_init(); - - /* Setup guest page hinting */ - cmma_init(); + kfence_split_mapping(); /* this will put all low memory onto the freelists */ memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ - - cmma_init_nodat(); - - mem_init_print_info(NULL); } void free_initmem(void) { - initmem_freed = true; - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RW | SET_MEMORY_NX); + set_memory_rwnx((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT); free_initmem_default(POISON_FREE_INITMEM); } @@ -214,6 +186,41 @@ unsigned long memory_block_size_bytes(void) return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); } +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return 0; +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
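
[Editor's sketch] paging_init() above keeps s390's DMA zone at 31 bits (zone_dma_bits = 31): allocations for devices and legacy code limited to 31-bit addressing must come from below 2 GiB. The corresponding pfn ceiling, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT    12UL
#define ZONE_DMA_BITS 31UL

int main(void)
{
	unsigned long dma_limit   = 1UL << ZONE_DMA_BITS;	/* 2 GiB */
	unsigned long dma_max_pfn = dma_limit >> PAGE_SHIFT;

	printf("ZONE_DMA: below %#lx (%lu pfns)\n", dma_limit, dma_max_pfn);
	return 0;
}
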
+ */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} + #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_CMA @@ -268,34 +275,35 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(restrictions->altmap)) + if (WARN_ON_ONCE(params->altmap)) + return -EINVAL; + + if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; + VM_BUG_ON(!mhp_range_allowed(start, size, true)); rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, restrictions); + rc = __add_pages(nid, start_pfn, size_pages, params); if (rc) vmem_remove_mapping(start, size); return rc; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); vmem_remove_mapping(start, size); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c deleted file mode 100644 index 460f25572940..000000000000 --- a/arch/s390/mm/kasan_init.c +++ /dev/null @@ -1,382 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kasan.h> -#include <linux/sched/task.h> -#include <linux/memblock.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/kasan.h> -#include <asm/mem_detect.h> -#include <asm/processor.h> -#include <asm/sclp.h> -#include <asm/facility.h> -#include <asm/sections.h> -#include <asm/setup.h> - -static unsigned long segment_pos __initdata; -static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; -static bool has_edat __initdata; -static bool has_nx __initdata; - -#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) - -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - -static void __init kasan_early_panic(const char *reason) -{ - sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); - sclp_early_printk(reason); - disabled_wait(); -} - -static void * __init kasan_early_alloc_segment(void) -{ - segment_pos -= _SEGMENT_SIZE; - - if (segment_pos < segment_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)segment_pos; -} - -static void * __init kasan_early_alloc_pages(unsigned int order) -{ - pgalloc_pos -= (PAGE_SIZE << order); - - if (pgalloc_pos < pgalloc_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)pgalloc_pos; -} - -static void * __init kasan_early_crst_alloc(unsigned long val) -{ - unsigned long *table; - - table = kasan_early_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); - return table; -} - -static pte_t * 
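
[Editor's sketch] The new setup_per_cpu_areas() above computes __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu], rebasing from the linker's percpu section to the chunk pcpu_embed_first_chunk() allocated. Per-cpu access then reduces to "static address plus this CPU's offset", as in this user-space model (the pointer gymnastics mirror what the kernel does, condensed for illustration):

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

static long counter;			/* stands in for a per-cpu variable */
static long unit[NR_CPUS];		/* one copy of the percpu area per cpu */
static uintptr_t per_cpu_offset[NR_CPUS];

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		per_cpu_offset[cpu] = (uintptr_t)&unit[cpu] - (uintptr_t)&counter;

	/* per_cpu(counter, 2): the variable's address plus cpu 2's offset */
	*(long *)((uintptr_t)&counter + per_cpu_offset[2]) = 42;
	printf("cpu 2 copy: %ld\n", unit[2]);
	return 0;
}
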
__init kasan_early_pte_alloc(void) -{ - static void *pte_leftover; - pte_t *pte; - - BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); - - if (!pte_leftover) { - pte_leftover = kasan_early_alloc_pages(0); - pte = pte_leftover + _PAGE_TABLE_SIZE; - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); - return pte; -} - -enum populate_mode { - POPULATE_ONE2ONE, - POPULATE_MAP, - POPULATE_ZERO_SHADOW -}; -static void __init kasan_early_vmemmap_populate(unsigned long address, - unsigned long end, - enum populate_mode mode) -{ - unsigned long pgt_prot_zero, pgt_prot, sgt_prot; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); - if (!has_nx) - pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL_EXEC); - sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC); - - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PGDIR_SIZE) && - end - address >= PGDIR_SIZE) { - pgd_populate(&init_mm, pg_dir, - kasan_early_shadow_p4d); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - continue; - } - p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY); - pgd_populate(&init_mm, pg_dir, p4_dir); - } - - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, P4D_SIZE) && - end - address >= P4D_SIZE) { - p4d_populate(&init_mm, p4_dir, - kasan_early_shadow_pud); - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - pu_dir = kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY); - p4d_populate(&init_mm, p4_dir, pu_dir); - } - - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PUD_SIZE) && - end - address >= PUD_SIZE) { - pud_populate(&init_mm, pu_dir, - kasan_early_shadow_pmd); - address = (address + PUD_SIZE) & PUD_MASK; - continue; - } - pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY); - pud_populate(&init_mm, pu_dir, pm_dir); - } - - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PMD_SIZE) && - end - address >= PMD_SIZE) { - pmd_populate(&init_mm, pm_dir, - kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - /* the first megabyte of 1:1 is mapped with 4k pages */ - if (has_edat && address && end - address >= PMD_SIZE && - mode != POPULATE_ZERO_SHADOW) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); - } - pmd_val(*pm_dir) = __pa(page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = kasan_early_pte_alloc(); - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *page; - - switch (mode) { - case POPULATE_ONE2ONE: - page = (void *)address; - pte_val(*pt_dir) = __pa(page) | pgt_prot; - break; - case POPULATE_MAP: - page = kasan_early_alloc_pages(0); - memset(page, 0, PAGE_SIZE); - pte_val(*pt_dir) = __pa(page) | pgt_prot; - break; - case POPULATE_ZERO_SHADOW: - page = kasan_early_shadow_page; - pte_val(*pt_dir) = __pa(page) | pgt_prot_zero; - break; - } - } - address += PAGE_SIZE; - } -} - -static void __init 
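
[Editor's sketch] Part of the kasan_init.c code being deleted here: kasan_early_pte_alloc() exploited the fact that an s390 pte table is half a page (the BUILD_BUG_ON pins _PAGE_TABLE_SIZE * 2 == PAGE_SIZE), so each 4 KiB allocation served two requests via the pte_leftover stash. The carving logic, standalone, with sizes matching that 2 KiB-table/4 KiB-page split:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE      4096UL
#define PTE_TABLE_SIZE (PAGE_SIZE / 2)	/* s390: two pte tables per page */

static void *pte_leftover;

static void *pte_alloc_demo(void)
{
	void *pte;

	if (!pte_leftover) {
		pte_leftover = malloc(PAGE_SIZE); /* fresh page: hand out the top half */
		pte = (char *)pte_leftover + PTE_TABLE_SIZE;
	} else {
		pte = pte_leftover;		  /* second call: the stashed half */
		pte_leftover = NULL;
	}
	return pte;	/* boot-time allocator: never freed, like the original */
}

int main(void)
{
	void *a = pte_alloc_demo(), *b = pte_alloc_demo();

	printf("halves %zu bytes apart\n", (size_t)((char *)a - (char *)b));
	return 0;
}
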
kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - -static void __init kasan_early_detect_facilities(void) -{ - if (test_facility(8)) { - has_edat = true; - __ctl_set_bit(0, 23); - } - if (!noexec_disabled && test_facility(130)) { - has_nx = true; - __ctl_set_bit(0, 20); - } -} - -void __init kasan_early_init(void) -{ - unsigned long untracked_mem_end; - unsigned long shadow_alloc_size; - unsigned long initrd_end; - unsigned long asce_type; - unsigned long memsize; - unsigned long vmax; - unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); - pte_t pte_z; - pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); - pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); - p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); - - kasan_early_detect_facilities(); - if (!has_nx) - pgt_prot &= ~_PAGE_NOEXEC; - pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot); - - memsize = get_mem_detect_end(); - if (!memsize) - kasan_early_panic("cannot detect physical memory size\n"); - /* respect mem= cmdline parameter */ - if (memory_end_set && memsize > memory_end) - memsize = memory_end; - if (IS_ENABLED(CONFIG_CRASH_DUMP) && OLDMEM_BASE) - memsize = min(memsize, OLDMEM_SIZE); - memsize = min(memsize, KASAN_SHADOW_START); - - if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)) { - /* 4 level paging */ - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, - _REGION2_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION1_SIZE; - asce_type = _ASCE_TYPE_REGION2; - } else { - /* 3 level paging */ - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PUD_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, - _REGION3_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION2_SIZE; - asce_type = _ASCE_TYPE_REGION3; - } - - /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); - memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); - - shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(INITRD_START + INITRD_SIZE, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } - - if (pgalloc_low + shadow_alloc_size > memsize) - kasan_early_panic("out of memory during initialisation\n"); - - if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); - segment_low = segment_pos - shadow_alloc_size; - pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; - } - init_mm.pgd = early_pg_dir; - /* - * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * +- end of ram ----+ / 
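
[Editor's sketch] The deleted early-init code above sizes the shadow as memsize >> KASAN_SHADOW_SCALE_SHIFT and maps addresses through __sha(): each 8 bytes of tracked memory get one shadow byte. Generic KASAN's mapping, with a purely illustrative offset:

#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT 3		/* 1 shadow byte per 8 bytes */
#define KASAN_SHADOW_OFFSET 0x10000000000000UL	/* illustrative placement */

static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	unsigned long memsize = 16UL << 30;	/* 16 GiB of tracked memory */

	printf("shadow size: %lu MiB\n",
	       (memsize >> KASAN_SHADOW_SCALE_SHIFT) >> 20);	/* 2048 */
	printf("shadow of 0x1000: %#lx\n", mem_to_shadow(0x1000));
	return 0;
}
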
+----------------+ - * | ... gap ... |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ - */ - /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP); - if (IS_ENABLED(CONFIG_MODULES)) - untracked_mem_end = vmax - MODULES_LEN; - kasan_early_vmemmap_populate(__sha(max_physmem_end), - __sha(untracked_mem_end), - POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_vmemmap_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, asce_type); - kasan_enable_dat(); - /* enable kasan */ - init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); - sclp_early_printk("KernelAddressSanitizer initialized\n"); -} - -void __init kasan_copy_shadow(pgd_t *pg_dir) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - pud_t *pu_dir_src; - pud_t *pu_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(pg_dir, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - if (!p4d_folded(*p4_dir_src)) { - /* 4 level paging */ - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); - return; - } - /* 3 level paging */ - pu_dir_src = pud_offset(p4_dir_src, KASAN_SHADOW_START); - pu_dir_dst = pud_offset(p4_dir_dst, KASAN_SHADOW_START); - memcpy(pu_dir_dst, pu_dir_src, - (KASAN_SHADOW_SIZE >> PUD_SHIFT) * sizeof(pud_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index de7ca4b6718f..632c3a55feed 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -4,8 +4,6 @@ * * Copyright IBM Corp. 
2009, 2015 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/uaccess.h> @@ -14,9 +12,17 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> -#include <asm/ctl_reg.h> -#include <asm/io.h> +#include <linux/uio.h> +#include <linux/io.h> +#include <asm/asm-extable.h> +#include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/maccess.h> +#include <asm/ctlreg.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { @@ -55,155 +61,84 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz */ static DEFINE_SPINLOCK(s390_kernel_write_lock); -void notrace s390_kernel_write(void *dst, const void *src, size_t size) +notrace void *s390_kernel_write(void *dst, const void *src, size_t size) { + void *tmp = dst; unsigned long flags; long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); while (size) { - copied = s390_kernel_write_odd(dst, src, size); - dst += copied; + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; src += copied; size -= copied; } spin_unlock_irqrestore(&s390_kernel_write_lock, flags); -} - -static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count) -{ - register unsigned long _dest asm("2") = (unsigned long) dest; - register unsigned long _len1 asm("3") = (unsigned long) count; - register unsigned long _src asm("4") = (unsigned long) src; - register unsigned long _len2 asm("5") = (unsigned long) count; - int rc = -EFAULT; - - asm volatile ( - "0: mvcle %1,%2,0x0\n" - "1: jo 0b\n" - " lhi %0,0x0\n" - "2:\n" - EX_TABLE(1b,2b) - : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1), - "+d" (_len2), "=m" (*((long *) dest)) - : "m" (*((long *) src)) - : "cc", "memory"); - return rc; -} - -static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, - unsigned long src, - unsigned long count) -{ - int irqs_disabled, rc; - unsigned long flags; - if (!count) - return 0; - flags = arch_local_irq_save(); - irqs_disabled = arch_irqs_disabled_flags(flags); - if (!irqs_disabled) - trace_hardirqs_off(); - __arch_local_irq_stnsm(0xf8); // disable DAT - rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); - if (flags & PSW_MASK_DAT) - __arch_local_irq_stosm(0x04); // enable DAT - if (!irqs_disabled) - trace_hardirqs_on(); - __arch_local_irq_ssm(flags); - return rc; + return dst; } -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, void *src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - int rc; - - if (S390_lowcore.nodat_stack != 0) { - preempt_disable(); - rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3, - dest, src, count); - preempt_enable(); - return rc; - } - /* - * This is a really early memcpy_real call, the stacks are - * not set up yet. 
Just call _memcpy_real on the early boot - * stack - */ - return _memcpy_real((unsigned long) dest,(unsigned long) src, - (unsigned long) count); -} - -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) -{ - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); + while (count) { + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, MEMCPY_REAL_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); + return res; } -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, void *src, unsigned long count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(unsigned long addr) +static int get_swapped_owner(phys_addr_t addr) { - unsigned long lc; + phys_addr_t lc; int cpu; - if (addr < sizeof(struct lowcore)) - return 1; for_each_online_cpu(cpu) { - lc = (unsigned long) lowcore_ptr[cpu]; + lc = virt_to_phys(lowcore_ptr[cpu]); if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -214,27 +149,45 @@ static int is_swapped(unsigned long addr) */ void *xlate_dev_mem_ptr(phys_addr_t addr) { - void *bounce = (void *) addr; + void *ptr = phys_to_virt(addr); + void *bounce = ptr; + struct lowcore *abs_lc; unsigned long size; + int this_cpu, cpu; - get_online_cpus(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, (void *) addr, size); + cpus_read_lock(); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; } - preempt_enable(); - put_online_cpus(); + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & 
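
[Editor's sketch] The new memcpy_real_iter() above replaces the old run-without-DAT copy: a single fixed virtual window (__memcpy_real_area, one page) is repointed at the source's physical page under memcpy_real_mutex, and the copy proceeds in at most page-sized chunks. The per-iteration offset and length arithmetic, standalone:

#include <stdio.h>

#define WINDOW_SIZE 4096UL		/* MEMCPY_REAL_SIZE == PAGE_SIZE */
#define WINDOW_MASK (~(WINDOW_SIZE - 1))

int main(void)
{
	unsigned long src = 0x12345678;	/* arbitrary physical source address */
	unsigned long count = 10000;

	while (count) {
		unsigned long phys   = src & WINDOW_MASK;	/* page to map */
		unsigned long offset = src & ~WINDOW_MASK;	/* start within it */
		unsigned long len    = count < WINDOW_SIZE - offset ?
				       count : WINDOW_SIZE - offset;

		printf("map %#lx, copy %lu bytes at offset %lu\n",
		       phys, len, offset);
		src += len;
		count -= len;
	}
	return 0;
}

Only the first and last chunks can be partial; everything in between is a full page behind one PTE rewrite.
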
~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); + cpus_read_unlock(); return bounce; } /* * Free converted buffer for /dev/mem access (if necessary) */ -void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf) +void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr) { - if ((void *) addr != buf) - free_page((unsigned long) buf); + if (addr != virt_to_phys(ptr)) + free_page((unsigned long)ptr); } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index cbc718ba6d78..fc9a7dc26c5e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -17,7 +17,6 @@ #include <linux/random.h> #include <linux/compat.h> #include <linux/security.h> -#include <asm/pgalloc.h> #include <asm/elf.h> static unsigned long stack_maxrandom_size(void) @@ -38,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack) unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_base_legacy(unsigned long rnd) @@ -59,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd, /* * Top of mmap area (just below the process stack). - * Leave at least a ~32 MB hole. + * Leave at least a ~128 MB hole. */ - gap_min = 32 * 1024 * 1024UL; + gap_min = SZ_128M; gap_max = (STACK_TOP / 6) * 5; if (gap < gap_min) @@ -72,14 +71,13 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(STACK_TOP - gap - rnd); } -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct vm_unmapped_area_info info; - int rc; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -105,30 +103,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; - int rc; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -148,7 +136,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; if (filp || (flags & MAP_SHARED)) 
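
[Editor's sketch] mmap_base() above now leaves at least a 128 MiB hole below the stack (gap_min = SZ_128M) and at most 5/6 of STACK_TOP, then subtracts the randomization. The clamp in isolation; STACK_TOP and the inputs are illustrative:

#include <stdio.h>

#define SZ_128M   (128UL << 20)
#define STACK_TOP (1UL << 42)		/* illustrative task size */
#define PAGE_MASK (~4095UL)

static unsigned long mmap_base_demo(unsigned long rnd, unsigned long gap)
{
	unsigned long gap_min = SZ_128M;
	unsigned long gap_max = (STACK_TOP / 6) * 5;

	if (gap < gap_min)
		gap = gap_min;
	else if (gap > gap_max)
		gap = gap_max;
	/* PAGE_ALIGN: round up to the next page boundary */
	return (STACK_TOP - gap - rnd + 4095) & PAGE_MASK;
}

int main(void)
{
	/* an 8 MiB RLIMIT_STACK is clamped up to the 128 MiB minimum gap */
	printf("base %#lx\n", mmap_base_demo(0, 8UL << 20));
	return 0;
}
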
info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; @@ -163,25 +151,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; } check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } /* @@ -207,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_RO, + [VM_WRITE] = PAGE_RO, + [VM_WRITE | VM_READ] = PAGE_RO, + [VM_EXEC] = PAGE_RX, + [VM_EXEC | VM_READ] = PAGE_RX, + [VM_EXEC | VM_WRITE] = PAGE_RX, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_RO, + [VM_SHARED | VM_WRITE] = PAGE_RW, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, + [VM_SHARED | VM_EXEC] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index fc141893d028..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -7,211 +7,18 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/memblock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <asm/facility.h> #include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - cmma_flag = enabled; - return 1; -} -__setup("cmma=", cmma); - -static inline int cmma_test_essa(void) -{ - register unsigned long tmp asm("0") = 0; - register int rc asm("1"); - - /* test ESSA_GET_STATE */ - asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=&d" (rc), "+&d" (tmp) - : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); - return rc; -} - -void __init cmma_init(void) -{ - if (!cmma_flag) - return; - if (cmma_test_essa()) { - cmma_flag = 0; - return; - } - if (test_facility(147)) - cmma_flag = 2; -} - -static inline unsigned char get_page_state(struct page *page) -{ - unsigned char state; - - asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (state) - : "a" (page_to_phys(page)), - "i" (ESSA_GET_STATE)); - return state & 0x3f; -} - -static inline void set_page_unused(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} - -static inline void set_page_stable_dat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); -} - -static inline void 
set_page_stable_nodat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE_NODAT)); -} - -static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || pmd_large(*pmd)) - continue; - page = virt_to_page(pmd_val(*pmd)); - set_bit(PG_arch_1, &page->flags); - } while (pmd++, addr = next, addr != end); -} - -static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pud_t *pud; - int i; - - pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none(*pud) || pud_large(*pud)) - continue; - if (!pud_folded(*pud)) { - page = virt_to_page(pud_val(*pud)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - p4d_t *p4d; - int i; - - p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(*p4d)) - continue; - if (!p4d_folded(*p4d)) { - page = virt_to_page(p4d_val(*p4d)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pud(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -static void mark_kernel_pgd(void) -{ - unsigned long addr, next; - struct page *page; - pgd_t *pgd; - int i; - - addr = 0; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, MODULES_END); - if (pgd_none(*pgd)) - continue; - if (!pgd_folded(*pgd)) { - page = virt_to_page(pgd_val(*pgd)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_p4d(pgd, addr, next); - } while (pgd++, addr = next, addr != MODULES_END); -} - -void __init cmma_init_nodat(void) -{ - struct memblock_region *reg; - struct page *page; - unsigned long start, end, ix; - - if (cmma_flag < 2) - return; - /* Mark pages used in kernel page tables */ - mark_kernel_pgd(); - - /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_memblock(memory, reg) { - start = memblock_region_memory_base_pfn(reg); - end = memblock_region_memory_end_pfn(reg); - page = pfn_to_page(start); - for (ix = start; ix < end; ix++, page++) { - if (__test_and_clear_bit(PG_arch_1, &page->flags)) - continue; /* skip page table pages */ - if (!list_empty(&page->lru)) - continue; /* skip free pages */ - set_page_stable_nodat(page, 0); - } - } -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unused(page, order); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) @@ -219,57 +26,7 @@ void arch_alloc_page(struct page *page, int order) if (!cmma_flag) return; if (cmma_flag < 2) - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); else - set_page_stable_nodat(page, order); -} - -void arch_set_page_dat(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_stable_dat(page, order); -} - -void arch_set_page_nodat(struct page *page, int order) -{ - if (cmma_flag < 2) - return; - set_page_stable_nodat(page, order); -} - -int 
arch_test_page_nodat(struct page *page) -{ - unsigned char state; - - if (cmma_flag < 2) - return 0; - state = get_page_state(page); - return !!(state & 0x20); -} - -void arch_set_page_states(int make_stable) -{ - unsigned long flags, order, t; - struct list_head *l; - struct page *page; - struct zone *zone; - - if (!cmma_flag) - return; - if (make_stable) - drain_local_pages(NULL); - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lock, flags); - for_each_migratetype_order(order, t) { - list_for_each(l, &zone->free_area[order].free_list[t]) { - page = list_entry(l, struct page, lru); - if (make_stable) - set_page_stable_dat(page, order); - else - set_page_unused(page, order); - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index f8c6faab41f4..631e3a4ee2de 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -4,11 +4,13 @@ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ #include <linux/hugetlb.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> #include <asm/facility.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/kfence.h> #include <asm/page.h> #include <asm/set_memory.h> @@ -41,7 +43,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) } #ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); void arch_report_meminfo(struct seq_file *m) { @@ -57,7 +59,7 @@ void arch_report_meminfo(struct seq_file *m) static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long dtt) { - unsigned long table, mask; + unsigned long *table, mask; mask = 0; if (MACHINE_HAS_EDAT2) { @@ -72,8 +74,8 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); break; } - table = (unsigned long)old & mask; - crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); + table = (unsigned long *)((unsigned long)old & mask); + crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce.val); } else if (MACHINE_HAS_IDTE) { cspg(old, *old, new); } else { @@ -86,7 +88,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, { pte_t *ptep, new; - ptep = pte_offset(pmdp, addr); + if (flags == SET_MEMORY_4K) + return 0; + ptep = pte_offset_kernel(pmdp, addr); do { new = *ptep; if (pte_none(new)) @@ -94,11 +98,19 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) - pte_val(new) |= _PAGE_NOEXEC; + new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) - pte_val(new) &= ~_PAGE_NOEXEC; + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + } pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -125,11 +137,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr) prot &= 
~_PAGE_NOEXEC; ptep = pt_dir; for (i = 0; i < PTRS_PER_PTE; i++) { - pte_val(*ptep) = pte_addr | prot; + set_pte(ptep, __pte(pte_addr | prot)); pte_addr += PAGE_SIZE; ptep++; } - pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY; + new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY); pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); update_page_count(PG_DIRECT_MAP_1M, -1); @@ -144,11 +156,19 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) - pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + } pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -156,6 +176,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, unsigned long flags) { unsigned long next; + int need_split; pmd_t *pmdp; int rc = 0; @@ -165,7 +186,10 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, return -EINVAL; next = pmd_addr_end(addr, end); if (pmd_large(*pmdp)) { - if (addr & ~PMD_MASK || addr + PMD_SIZE > next) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PMD_MASK); + need_split |= !!(addr + PMD_SIZE > next); + if (need_split) { rc = split_pmd_page(pmdp, addr); if (rc) return rc; @@ -202,11 +226,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) prot &= ~_SEGMENT_ENTRY_NOEXEC; pmdp = pm_dir; for (i = 0; i < PTRS_PER_PMD; i++) { - pmd_val(*pmdp) = pmd_addr | prot; + set_pmd(pmdp, __pmd(pmd_addr | prot)); pmd_addr += PMD_SIZE; pmdp++; } - pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY; + new = __pud(__pa(pm_dir) | _REGION3_ENTRY); pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); update_page_count(PG_DIRECT_MAP_2G, -1); @@ -223,9 +247,17 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, else if (flags & SET_MEMORY_RW) new = pud_mkwrite(pud_mkdirty(new)); if (flags & SET_MEMORY_NX) - pud_val(new) |= _REGION_ENTRY_NOEXEC; + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pud_val(new) &= ~_REGION_ENTRY_NOEXEC; + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + } pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -233,6 +265,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long flags) { unsigned long next; + int need_split; pud_t *pudp; int rc = 0; @@ -242,7 +275,10 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, return -EINVAL; next = 
pud_addr_end(addr, end); if (pud_large(*pudp)) { - if (addr & ~PUD_MASK || addr + PUD_SIZE > next) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PUD_MASK); + need_split |= !!(addr + PUD_SIZE > next); + if (need_split) { rc = split_pud_page(pudp, addr); if (rc) break; @@ -279,7 +315,7 @@ static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, return rc; } -static DEFINE_MUTEX(cpa_mutex); +DEFINE_MUTEX(cpa_mutex); static int change_page_attr(unsigned long addr, unsigned long end, unsigned long flags) @@ -288,11 +324,6 @@ static int change_page_attr(unsigned long addr, unsigned long end, int rc = -EINVAL; pgd_t *pgdp; - if (addr == end) - return 0; - if (end >= MODULES_END) - return -EINVAL; - mutex_lock(&cpa_mutex); pgdp = pgd_offset_k(addr); do { if (pgd_none(*pgdp)) @@ -303,21 +334,79 @@ static int change_page_attr(unsigned long addr, unsigned long end, break; cond_resched(); } while (pgdp++, addr = next, addr < end && !rc); - mutex_unlock(&cpa_mutex); return rc; } -int __set_memory(unsigned long addr, int numpages, unsigned long flags) +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) { + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. + */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; + } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) +{ + unsigned long end; + int rc; + if (!MACHINE_HAS_NX) flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); if (!flags) return 0; + if (!numpages) + return 0; addr &= PAGE_MASK; - return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); } -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) static void ipte_range(pte_t *pte, unsigned long address, int nr) { @@ -337,50 +426,27 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long address; + pte_t *ptep, pte; int nr, i, j; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; for (i = 0; i < numpages;) { - address = page_to_phys(page + i); - pgd = pgd_offset_k(address); - p4d = p4d_offset(pgd, address); - pud = pud_offset(p4d, address); - pmd = 
pmd_offset(pud, address); - pte = pte_offset_kernel(pmd, address); - nr = (unsigned long)pte >> ilog2(sizeof(long)); + address = (unsigned long)page_to_virt(page + i); + ptep = virt_to_kpte(address); + nr = (unsigned long)ptep >> ilog2(sizeof(long)); nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); nr = min(numpages - i, nr); if (enable) { for (j = 0; j < nr; j++) { - pte_val(*pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); address += PAGE_SIZE; - pte++; + ptep++; } } else { - ipte_range(pte, address, nr); + ipte_range(ptep, address, nr); } i += nr; } } -#ifdef CONFIG_HIBERNATION -bool kernel_page_present(struct page *page) -{ - unsigned long addr; - int cc; - - addr = page_to_phys(page); - asm volatile( - " lra %1,0(%1)\n" - " ipm %0\n" - " srl %0,28" - : "=d" (cc), "+a" (addr) : : "cc"); - return cc == 0; -} -#endif /* CONFIG_HIBERNATION */ - #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..1aac13bb8f53 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). 
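The pfault_wait handshake described here is easier to follow as a tiny state machine. A minimal user-space sketch, assuming a made-up struct task and deliver_*() helpers (0 = idle, 1 = sleeping until completion, -1 = completion overtook the initial interrupt); this is illustrative only, not kernel code:

#include <stdio.h>

/* Hypothetical stand-in for the few task_struct fields that matter here. */
struct task {
	int pfault_wait;	/* 0: idle, 1: sleeping, -1: completion came first */
	int running;		/* 1 while the task is on a CPU */
};

/* Initial interrupt: the host reports that a page is missing. */
static void deliver_initial(struct task *t)
{
	if (t->pfault_wait == -1)
		t->pfault_wait = 0;	/* completion already arrived: do not sleep */
	else
		t->pfault_wait = 1;	/* go to sleep until the completion arrives */
}

/* Completion interrupt: the host reports that the page was paged in. */
static void deliver_completion(struct task *t)
{
	if (t->pfault_wait == 1)
		t->pfault_wait = 0;	/* normal order: wake the sleeper */
	else if (t->running)
		t->pfault_wait = -1;	/* out of order: remember it for the initial irq */
}

int main(void)
{
	struct task t = { .running = 1 };

	deliver_completion(&t);	/* completion overtakes the initial interrupt */
	deliver_initial(&t);
	printf("pfault_wait=%d (0 means the task never went to sleep)\n", t.pfault_wait);
	return 0;
}
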
If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. + */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. 
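The handler tells the two interrupt flavours apart purely by subcode bits, using the __SUBCODE_MASK and PF_COMPLETE values defined in this file. A standalone decoder that follows the same tests (the helper name is invented for the sketch):

#include <stdint.h>
#include <stdio.h>

#define SUBCODE_MASK	0x0600	/* pfault class in the high subcode byte */
#define PF_COMPLETE	0x0080	/* set: page arrived, clear: page missing */

/* Returns -1 if not a pfault interrupt, 1 for completion, 0 for initial. */
static int classify_subcode(uint16_t subcode)
{
	if ((subcode & 0xff00) != SUBCODE_MASK)
		return -1;
	return (subcode & PF_COMPLETE) ? 1 : 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       classify_subcode(0x0600),	/* initial: a page is missing */
	       classify_subcode(0x0680),	/* completion: page was swapped in */
	       classify_subcode(0x1202));	/* some other external interrupt */
	return 0;
}
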
+ */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + set_preempt_need_resched(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 3dd253f81a77..008e487c94a6 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/tlb.h> @@ -30,22 +31,11 @@ static struct ctl_table page_table_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } }; static int __init page_table_register_sysctl(void) { - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; + return register_sysctl("vm", page_table_sysctl) ? 
0 : -ENOMEM; } __initcall(page_table_register_sysctl); @@ -53,266 +43,187 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, 2); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + unsigned long *table; - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, 2); - return (unsigned long *) page_to_phys(page); + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long) table, 2); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; - if (current->active_mm == mm) - set_user_asce(mm); + /* change all active ASCEs to avoid the creation of new TLBs */ + if (current->active_mm == mm) { + S390_lowcore.user_asce.val = mm->context.asce; + local_ctl_load(7, &S390_lowcore.user_asce); + } __tlb_flush_local(); } int crst_table_upgrade(struct mm_struct *mm, unsigned long end) { - unsigned long *table, *pgd; - int rc, notify; + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); - rc = 0; - notify = 0; - while (mm->context.asce_limit < end) { - table = crst_table_alloc(mm); - if (!table) { - rc = -ENOMEM; - break; - } - spin_lock_bh(&mm->page_table_lock); - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == _REGION2_SIZE) { - crst_table_init(table, _REGION2_ENTRY_EMPTY); - p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = _REGION1_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; - mm_inc_nr_puds(mm); - } else { - crst_table_init(table, _REGION1_ENTRY_EMPTY); - pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = -PAGE_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; - } - notify = 1; - spin_unlock_bh(&mm->page_table_lock); - } - if (notify) - on_each_cpu(__crst_table_upgrade, mm, 0); - return rc; -} + VM_BUG_ON(asce_limit < _REGION2_SIZE); -void crst_table_downgrade(struct mm_struct *mm) -{ - pgd_t *pgd; + if (end <= asce_limit) + return 0; - /* downgrade should only happen from 3 to 2 levels (compat only) */ - VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + } + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + } - if (current->active_mm == mm) { - clear_user_asce(); - __tlb_flush_mm(mm); + spin_lock_bh(&mm->page_table_lock); + + /* + * This routine gets called with mmap_lock lock held and there is + * no reason to optimize for the case of otherwise. However, if + * that would ever change, the below check will let us know. 
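The upgrade itself is a "grow upwards" step: a freshly initialized table becomes the new top level, and one of its entries points at the old top, so every existing translation stays reachable throughout. A toy user-space analogue with a plain pointer table standing in for region-table entries (all names invented, error handling reduced to the minimum):

#include <stdlib.h>
#include <stdio.h>

#define SLOTS 512

struct aspace {
	void **root;	/* top-level translation table */
	int levels;	/* current number of translation levels */
};

/* Grow by one level: the new root's slot 0 points at the old root. */
static int upgrade_one_level(struct aspace *as)
{
	void **table = calloc(SLOTS, sizeof(*table));

	if (!table)
		return -1;
	table[0] = as->root;	/* old translations remain reachable */
	as->root = table;
	as->levels++;
	return 0;
}

int main(void)
{
	struct aspace as = { .root = calloc(SLOTS, sizeof(void *)), .levels = 3 };

	while (as.levels < 5)	/* mirror the 3 -> 4 or 4 -> 5 level upgrade */
		if (upgrade_one_level(&as))
			return 1;
	printf("levels=%d\n", as.levels);
	return 0;
}
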
+ */ + VM_BUG_ON(asce_limit != mm->context.asce_limit); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = TASK_SIZE_MAX; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; } - pgd = mm->pgd; - mm_dec_nr_pmds(mm); - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = _REGION3_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - crst_table_free(mm, (unsigned long *) pgd); + spin_unlock_bh(&mm->page_table_lock); - if (current->active_mm == mm) - set_user_asce(mm); -} + on_each_cpu(__crst_table_upgrade, mm, 0); -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; + return 0; - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; +err_pgd: + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; } #ifdef CONFIG_PGSTE struct page *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_phys(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc_page(ptdesc); } void page_table_free_pgste(struct page *page) { - __free_page(page); + pagetable_free(page_ptdesc(page)); } #endif /* CONFIG_PGSTE */ -/* - * page table entry allocation/free routines. 
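Both the PGSTE and the regular allocation path lay one 4K frame out as two halves, 256 page-table entries followed by 256 PGSTEs, initialized with a pair of memset64() calls. A self-contained sketch of that layout; the invalid-bit value below is a stand-in, the authoritative _PAGE_INVALID definition lives in asm/pgtable.h:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PTRS_PER_PTE	256
#define PAGE_INVALID	0x400UL	/* stand-in for _PAGE_INVALID */

static void memset64(uint64_t *s, uint64_t v, size_t count)
{
	while (count--)
		*s++ = v;
}

int main(void)
{
	static uint64_t frame[2 * PTRS_PER_PTE];	/* one 4K page */

	memset64(frame, PAGE_INVALID, PTRS_PER_PTE);		/* ptes: invalid */
	memset64(frame + PTRS_PER_PTE, 0, PTRS_PER_PTE);	/* pgstes: clear */
	printf("pte[0]=%#llx pgste[0]=%#llx\n",
	       (unsigned long long)frame[0],
	       (unsigned long long)frame[PTRS_PER_PTE]);
	return 0;
}
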
- */ unsigned long *page_table_alloc(struct mm_struct *mm) { + struct ptdesc *ptdesc; unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; - mask = (mask | (mask >> 4)) & 3; - if (mask != 3) { - table = (unsigned long *) page_to_phys(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, - 1U << (bit + 24)); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); - /* Initialize page table */ - table = (unsigned long *) page_to_phys(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 3 << 24); - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } else { - /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 1 << 24); - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); - spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.lock); - } + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); + /* pt_list is used by gmap only */ + INIT_LIST_HEAD(&ptdesc->pt_list); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; } +static void pagetable_pte_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { - struct page *page; - unsigned int bit, mask; - - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); - mask >>= 24; - if (mask & 3) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - if (mask != 0) - return; - } else { - atomic_xor_bits(&page->_refcount, 3U << 24); - } + struct ptdesc *ptdesc = virt_to_ptdesc(table); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor_free(ptdesc); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) +void __tlb_remove_table(void *table) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) (__pa(table) | 3); - tlb_remove_table(tlb, table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); + struct page *page = ptdesc_page(ptdesc); + + if (compound_order(page) == CRST_ALLOC_ORDER) { + /* pmd, pud, or p4d */ + pagetable_free(ptdesc); return; } - bit = (__pa(table) & ~PAGE_MASK) / 
(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 3) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) (__pa(table) | (1U << bit)); - tlb_remove_table(tlb, table); + pagetable_pte_dtor_free(ptdesc); } -void __tlb_remove_table(void *_table) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) { - unsigned int mask = (unsigned long) _table & 3; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - switch (mask) { - case 0: /* pmd, pud, or p4d */ - free_pages((unsigned long) table, 2); - break; - case 1: /* lower 2K of a 4K page table */ - case 2: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); - mask >>= 24; - if (mask != 0) - break; - /* fallthrough */ - case 3: /* 4K page table with pgstes */ - if (mask & 3) - atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_pte_page_dtor(page); - __free_page(page); - break; - } + pagetable_pte_dtor_free(ptdesc); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); + + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + /* + * THPs are not allowed for KVM guests. Warn if pgste ever reaches here. + * Turn to the generic pte_free_defer() version once gmap is removed. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, @@ -321,34 +232,37 @@ void __tlb_remove_table(void *_table) static struct kmem_cache *base_pgt_cache; -static unsigned long base_pgt_alloc(void) +static unsigned long *base_pgt_alloc(void) { - u64 *table; + unsigned long *table; table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); if (table) - memset64(table, _PAGE_INVALID, PTRS_PER_PTE); - return (unsigned long) table; + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + return table; } -static void base_pgt_free(unsigned long table) +static void base_pgt_free(unsigned long *table) { - kmem_cache_free(base_pgt_cache, (void *) table); + kmem_cache_free(base_pgt_cache, table); } -static unsigned long base_crst_alloc(unsigned long val) +static unsigned long *base_crst_alloc(unsigned long val) { - unsigned long table; + unsigned long *table; + struct ptdesc *ptdesc; - table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init((unsigned long *)table, val); + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); return table; } -static void base_crst_free(unsigned long table) +static void base_crst_free(unsigned long *table) { - free_pages(table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ @@ -376,14 +290,14 @@ static inline unsigned long base_lra(unsigned long address) return real; } -static int base_page_walk(unsigned long origin, unsigned long addr, +static int base_page_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { unsigned long *pte, next; if (!alloc) return 0; - pte = (unsigned long *) origin; + pte = origin; pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; do { next = 
base_page_addr_end(addr, end); @@ -392,13 +306,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_segment_walk(unsigned long origin, unsigned long addr, +static int base_segment_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *ste, next, table; + unsigned long *ste, next, *table; int rc; - ste = (unsigned long *) origin; + ste = origin; ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; do { next = base_segment_addr_end(addr, end); @@ -408,9 +322,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, table = base_pgt_alloc(); if (!table) return -ENOMEM; - *ste = table | _SEGMENT_ENTRY; + *ste = __pa(table) | _SEGMENT_ENTRY; } - table = *ste & _SEGMENT_ENTRY_ORIGIN; + table = __va(*ste & _SEGMENT_ENTRY_ORIGIN); rc = base_page_walk(table, addr, next, alloc); if (rc) return rc; @@ -421,13 +335,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region3_walk(unsigned long origin, unsigned long addr, +static int base_region3_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rtte, next, table; + unsigned long *rtte, next, *table; int rc; - rtte = (unsigned long *) origin; + rtte = origin; rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; do { next = base_region3_addr_end(addr, end); @@ -437,9 +351,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rtte = table | _REGION3_ENTRY; + *rtte = __pa(table) | _REGION3_ENTRY; } - table = *rtte & _REGION_ENTRY_ORIGIN; + table = __va(*rtte & _REGION_ENTRY_ORIGIN); rc = base_segment_walk(table, addr, next, alloc); if (rc) return rc; @@ -449,13 +363,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region2_walk(unsigned long origin, unsigned long addr, +static int base_region2_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rste, next, table; + unsigned long *rste, next, *table; int rc; - rste = (unsigned long *) origin; + rste = origin; rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; do { next = base_region2_addr_end(addr, end); @@ -465,9 +379,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rste = table | _REGION2_ENTRY; + *rste = __pa(table) | _REGION2_ENTRY; } - table = *rste & _REGION_ENTRY_ORIGIN; + table = __va(*rste & _REGION_ENTRY_ORIGIN); rc = base_region3_walk(table, addr, next, alloc); if (rc) return rc; @@ -477,13 +391,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region1_walk(unsigned long origin, unsigned long addr, +static int base_region1_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rfte, next, table; + unsigned long *rfte, next, *table; int rc; - rfte = (unsigned long *) origin; + rfte = origin; rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; do { next = base_region1_addr_end(addr, end); @@ -493,9 +407,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rfte = table | _REGION1_ENTRY; + *rfte = __pa(table) | _REGION1_ENTRY; } - table = *rfte & _REGION_ENTRY_ORIGIN; + table = __va(*rfte & 
_REGION_ENTRY_ORIGIN); rc = base_region2_walk(table, addr, next, alloc); if (rc) return rc; @@ -514,7 +428,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, */ void base_asce_free(unsigned long asce) { - unsigned long table = asce & _ASCE_ORIGIN; + unsigned long *table = __va(asce & _ASCE_ORIGIN); if (!asce) return; @@ -529,7 +443,7 @@ void base_asce_free(unsigned long asce) base_region2_walk(table, 0, _REGION1_SIZE, 0); break; case _ASCE_TYPE_REGION1: - base_region1_walk(table, 0, -_PAGE_SIZE, 0); + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); break; } base_crst_free(table); @@ -566,7 +480,7 @@ static int base_pgt_cache_init(void) */ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) { - unsigned long asce, table, end; + unsigned long asce, *table, end; int rc; if (base_pgt_cache_init()) @@ -577,25 +491,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) if (!table) return 0; rc = base_segment_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; } else if (end <= _REGION2_SIZE) { table = base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return 0; rc = base_region3_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; } else if (end <= _REGION1_SIZE) { table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return 0; rc = base_region2_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; } else { table = base_crst_alloc(_REGION1_ENTRY_EMPTY); if (!table) return 0; rc = base_region1_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; } if (rc) { base_asce_free(asce); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9ebd01219812..99422926efe1 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -19,13 +19,31 @@ #include <linux/ksm.h> #include <linux/mman.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. 
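The synchronization story for mio_wb_bit_mask is exactly the comment's claim: written once before any reader exists, read-only afterwards, so the read side needs no lock or barrier. A user-space sketch of the same publish-once discipline (names and the chosen bit are invented):

#include <stdio.h>

/* Set exactly once before any reader runs; never written again. */
static unsigned long wb_bit_mask;

static void init_once(void)	/* runs single-threaded, like an early initcall */
{
	wb_bit_mask = 1UL << 11;	/* arbitrary bit for the sketch */
}

static unsigned long make_writecombine(unsigned long prot)
{
	return prot | wb_bit_mask;	/* plain read is safe: no writer exists */
}

static unsigned long make_writethrough(unsigned long prot)
{
	return prot & ~wb_bit_mask;
}

int main(void)
{
	init_once();
	printf("%#lx %#lx\n", make_writecombine(0x3), make_writethrough(0x803));
	return 0;
}
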
+ */ + return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { @@ -97,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pte_val(*ptep) |= _PAGE_INVALID; + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); mm->context.flush_mm = 1; } else ptep_ipte_global(mm, addr, ptep, nodat); @@ -107,32 +125,23 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, static inline pgste_t pgste_get_lock(pte_t *ptep) { - unsigned long new = 0; + unsigned long value = 0; #ifdef CONFIG_PGSTE - unsigned long old; - - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); + unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); + + do { + value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); + } while (value & PGSTE_PCL_BIT); + value |= PGSTE_PCL_BIT; #endif - return __pgste(new); + return __pgste(value); } static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); + barrier(); + WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); #endif } @@ -206,15 +215,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) * Without enhanced suppression-on-protection force * the dirty bit on for all writable ptes. */ - pte_val(entry) |= _PAGE_DIRTY; - pte_val(entry) &= ~_PAGE_PROTECT; + entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); + entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } if (!(pte_val(entry) & _PAGE_PROTECT)) /* This pte allows write access, set user-dirty */ pgste_val(pgste) |= PGSTE_UC_BIT; } #endif - *ptep = entry; + set_pte(ptep, entry); return pgste; } @@ -257,12 +266,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm, pgste = pgste_update_all(old, pgste, mm); if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) - pte_val(old) |= _PAGE_UNUSED; + old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); } pgste = pgste_set_pte(ptep, pgste, new); pgste_set_unlock(ptep, pgste); } else { - *ptep = new; + set_pte(ptep, new); } return old; } @@ -284,6 +293,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_direct); +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 0, 0, 1); + else + __ptep_rdp(addr, ptep, 0, 0, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. 
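In other words, RDP is only legal when the old and new PTE differ in the DAT-protection bit or in software bits; any other change still requires a full IPTE. A hedged sketch of such an eligibility test; the mask values here are placeholders, and the authoritative rules live in pte_allow_rdp():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_PROTECT	0x200UL		/* stand-in for _PAGE_PROTECT */
#define PAGE_SW_BITS	0x0ffUL		/* stand-in for the software-defined bits */

/* True if old -> new only touches bits that RDP is allowed to change. */
static bool allow_rdp(uint64_t old, uint64_t new)
{
	return ((old ^ new) & ~(PAGE_PROTECT | PAGE_SW_BITS)) == 0;
}

int main(void)
{
	printf("%d\n", allow_rdp(0x1000 | PAGE_PROTECT, 0x1000));	/* 1: unprotect */
	printf("%d\n", allow_rdp(0x1000, 0x2000));			/* 0: new frame */
	return 0;
}
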
+ */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { @@ -327,14 +361,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; if (!MACHINE_HAS_NX) - pte_val(pte) &= ~_PAGE_NOEXEC; + pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); pgste = pgste_set_pte(ptep, pgste, pte); pgste_set_unlock(ptep, pgste); } else { - *ptep = pte; + set_pte(ptep, pte); } preempt_enable(); } @@ -399,7 +433,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); mm->context.flush_mm = 1; if (mm_has_pgste(mm)) gmap_pmdp_invalidate(mm, addr); @@ -411,22 +445,36 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, } #ifdef CONFIG_PGSTE -static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) { + struct vm_area_struct *vma; pgd_t *pgd; p4d_t *p4d; pud_t *pud; - pmd_t *pmd; + + /* We need a valid VMA, otherwise this is clearly a fault. */ + vma = vma_lookup(mm, addr); + if (!vma) + return -EFAULT; pgd = pgd_offset(mm, addr); - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return NULL; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return NULL; - pmd = pmd_alloc(mm, pud, addr); - return pmd; + if (!pgd_present(*pgd)) + return -ENOENT; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return -ENOENT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return -ENOENT; + + /* Large PUDs are not supported yet. 
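pmd_lookup() establishes a tri-state convention: -EFAULT when the address can never be valid, -ENOENT when nothing is mapped yet (so the storage key is implicitly zero), and 0 when a pmd exists. A toy lookup over a one-level table with the same convention (the table shape and names are invented):

#include <errno.h>
#include <stdio.h>

#define NSLOTS 16

static void *table[NSLOTS];
static int limit = 8;	/* addresses >= limit are not part of the space */

static int toy_lookup(int addr, void ***slotp)
{
	if (addr < 0 || addr >= limit)
		return -EFAULT;	/* no VMA: a real fault */
	if (!table[addr])
		return -ENOENT;	/* valid but unpopulated: caller may treat as key 0 */
	*slotp = &table[addr];
	return 0;
}

int main(void)
{
	void **slot;

	table[2] = &limit;	/* populate one slot with an arbitrary pointer */
	printf("%d %d %d\n", toy_lookup(12, &slot), toy_lookup(3, &slot),
	       toy_lookup(2, &slot));	/* on Linux: -14 -2 0 */
	return 0;
}
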
*/ + if (pud_large(*pud)) + return -EFAULT; + + *pmdp = pmd_offset(pud, addr); + return 0; } #endif @@ -437,7 +485,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_direct(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -450,7 +498,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_lazy(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -507,7 +555,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pudp_flush_direct(mm, addr, pudp); - *pudp = new; + set_pud(pudp, new); preempt_enable(); return old; } @@ -547,9 +595,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) list_del(lh); } ptep = (pte_t *) pgtable; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); ptep++; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -614,12 +662,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, if (prot == PROT_NONE && !pte_i) { ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); - pte_val(entry) |= _PAGE_INVALID; + entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); } if (prot == PROT_READ && !pte_p) { ptep_flush_direct(mm, addr, ptep, nodat); - pte_val(entry) &= ~_PAGE_INVALID; - pte_val(entry) |= _PAGE_PROTECT; + entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); + entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } pgste_val(pgste) |= bit; pgste = pgste_set_pte(ptep, pgste, entry); @@ -643,8 +691,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, !(pte_val(pte) & _PAGE_PROTECT))) { pgste_val(spgste) |= PGSTE_VSIE_BIT; tpgste = pgste_get_lock(tptep); - pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | - (pte_val(pte) & _PAGE_PROTECT); + tpte = __pte((pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT)); /* don't touch the storage key - it belongs to parent pgste */ tpgste = pgste_set_pte(tptep, tpgste, tpte); pgste_set_unlock(tptep, tpgste); @@ -673,7 +721,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = migration_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); dec_mm_counter(mm, mm_counter(page)); } @@ -699,7 +747,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_clear(mm, addr, ptep); } if (reset) - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -716,7 +764,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; ptev = pte_val(*ptep); if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -741,10 +789,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) |= _PAGE_PROTECT; + pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); else - 
pte_val(pte) |= _PAGE_INVALID; - *ptep = pte; + pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); } pgste_set_unlock(ptep, pgste); return dirty; @@ -760,14 +808,23 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * we can ignore attempts to set the key to 0, because it already is 0. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return key ? -EFAULT : 0; + case 0: + break; + default: return -EFAULT; - + } +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return key ? -EFAULT : 0; } if (pmd_large(*pmdp)) { @@ -783,10 +840,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | PGSTE_ACC_BITS | PGSTE_FP_BIT); @@ -816,7 +872,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(set_guest_storage_key); -/** +/* * Conditionally set a guest storage key (handling csske). * oldkey will be updated when either mr or mc is set and a pointer is given. * @@ -849,7 +905,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(cond_set_guest_storage_key); -/** +/* * Reset a guest reference bit (rrbe), returning the reference and changed bit. * * Returns < 0 in case of error, otherwise the cc to be reported to the guest. @@ -863,14 +919,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pte_t *ptep; int cc = 0; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0 and there is nothing for us to do. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: return -EFAULT; - + } +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return 0; } if (pmd_large(*pmdp)) { @@ -882,10 +947,9 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ pgste_val(new) &= ~PGSTE_GR_BIT; @@ -917,15 +981,24 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) - return -EFAULT; + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0. 
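The key accessors now share one retry shape: walk to the pmd once, then loop, and if the PTE table disappears between dropping the pmd lock and pte_offset_map_lock(), restart at the again: label. A compact user-space rendering of that optimistic retry, with a flag standing in for the racing free (all names invented):

#include <stdbool.h>
#include <stdio.h>

static bool table_present = true;

/* Pretend mapper: fails once to simulate a concurrently freed PTE table. */
static bool map_pte_table(void)
{
	if (!table_present) {
		table_present = true;	/* somebody re-created the table */
		return false;		/* caller must retry from the top */
	}
	return true;
}

static int access_key(void)
{
	int tries = 0;

again:
	tries++;
	/* lock pmd, re-check state ... */
	if (!map_pte_table())
		goto again;	/* raced with a free: restart the walk */
	/* ... read or update the storage key under the pte lock */
	return tries;
}

int main(void)
{
	table_present = false;	/* force one retry */
	printf("succeeded after %d tries\n", access_key());
	return 0;
}
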
+ */ + *key = 0; + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: + return -EFAULT; + } +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { - /* Not yet mapped memory has a zero key */ spin_unlock(ptl); - *key = 0; return 0; } @@ -938,10 +1011,9 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = pte_val(*ptep) & PAGE_MASK; @@ -970,6 +1042,7 @@ EXPORT_SYMBOL(get_guest_storage_key); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste) { + struct vm_area_struct *vma; unsigned long pgstev; spinlock_t *ptl; pgste_t pgste; @@ -979,6 +1052,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, WARN_ON_ONCE(orc > ESSA_MAX); if (unlikely(orc > ESSA_MAX)) return -EINVAL; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1071,10 +1148,14 @@ EXPORT_SYMBOL(pgste_perform_essa); int set_pgste_bits(struct mm_struct *mm, unsigned long hva, unsigned long bits, unsigned long value) { + struct vm_area_struct *vma; spinlock_t *ptl; pgste_t new; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1099,9 +1180,13 @@ EXPORT_SYMBOL(set_pgste_bits); */ int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) { + struct vm_area_struct *vma; spinlock_t *ptl; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b403fa14847d..186a020857cf 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ +#include <linux/memory_hotplug.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> @@ -11,9 +11,12 @@ #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> #include <asm/cacheflush.h> +#include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> @@ -21,21 +24,22 @@ static DEFINE_MUTEX(vmem_mutex); -struct memory_segment { - struct list_head list; - unsigned long start; - unsigned long size; -}; - -static LIST_HEAD(mem_segs); - static void __ref *vmem_alloc_pages(unsigned int order) { unsigned long size = PAGE_SIZE << order; if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return (void *) memblock_phys_alloc(size, size); + return memblock_alloc(size, size); +} + +static void vmem_free_pages(unsigned long addr, int order) +{ + /* We don't expect boot memory to be removed ever. 
*/ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) + return; + free_pages(addr, order); } void *vmem_crst_alloc(unsigned long val) @@ -43,8 +47,10 @@ void *vmem_crst_alloc(unsigned long val) unsigned long *table; table = vmem_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + if (!table) + return NULL; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -56,389 +62,613 @@ pte_t __ref *vmem_pte_alloc(void) if (slab_is_available()) pte = (pte_t *) page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_phys_alloc(size, size); + pte = (pte_t *) memblock_alloc(size, size); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } +static void vmem_pte_free(unsigned long *table) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page(table)))) + return; + page_table_free(&init_mm, table); +} + +#define PAGE_UNUSED 0xFD + /* - * Add a physical memory range to the 1:1 mapping. + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_sub_pmd_start to next PMD_SIZE boundary. */ -static int vmem_add_mem(unsigned long start, unsigned long size) -{ - unsigned long pgt_prot, sgt_prot, r3_prot; - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - int ret = -ENOMEM; +static unsigned long unused_sub_pmd_start; + +static void vmemmap_flush_unused_sub_pmd(void) +{ + if (!unused_sub_pmd_start) + return; + memset((void *)unused_sub_pmd_start, PAGE_UNUSED, + ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start); + unused_sub_pmd_start = 0; +} - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - r3_prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - r3_prot &= ~_REGION_ENTRY_NOEXEC; +static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). 
+ */ + if (unused_sub_pmd_start == start) { + unused_sub_pmd_start = end; + if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE))) + unused_sub_pmd_start = 0; + return; } - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } - pu_dir = pud_offset(p4_dir, address); - if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pud_val(*pu_dir) = address | r3_prot; - address += PUD_SIZE; - pages2g++; - continue; - } - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - pm_dir = pmd_offset(pu_dir, address); - if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pmd_val(*pm_dir) = address | sgt_prot; - address += PMD_SIZE; - pages1m++; + vmemmap_flush_unused_sub_pmd(); + vmemmap_mark_sub_pmd_used(start, end); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + vmemmap_mark_sub_pmd_used(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)page, PAGE_UNUSED, start - page); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_sub_pmd_start = end; +} + +/* Returns true if the PMD is completely unused and can be freed. 
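/*
 * Illustration (not part of the patch): the sub-PMD bookkeeping added here
 * can be modelled in user space. A "PMD" is a PMD_SIZE buffer backing part of
 * the vmemmap; sub-ranges whose memmap goes away are memset to PAGE_UNUSED,
 * and vmemmap_unuse_sub_pmd() below frees the whole PMD once memchr_inv()
 * finds nothing but PAGE_UNUSED bytes. is_all_unused() stands in for
 * memchr_inv(); everything here is a hedged sketch, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SK_PMD_SIZE     (1UL << 20)     /* 1 MiB segment, as with EDAT1 */
#define SK_PAGE_UNUSED  0xFD

static bool is_all_unused(const unsigned char *pmd, size_t size)
{
        for (size_t i = 0; i < size; i++)
                if (pmd[i] != SK_PAGE_UNUSED)
                        return false;   /* some struct page still lives here */
        return true;
}

int main(void)
{
        unsigned char *pmd = malloc(SK_PMD_SIZE);

        /* Populate: one section uses the first half of the PMD. */
        memset(pmd, 0, SK_PMD_SIZE / 2);
        /* The rest was never used; mark it, as vmemmap_use_new_sub_pmd() does. */
        memset(pmd + SK_PMD_SIZE / 2, SK_PAGE_UNUSED, SK_PMD_SIZE / 2);
        printf("after populate: free PMD? %d\n", is_all_unused(pmd, SK_PMD_SIZE));

        /* Unpopulate the first half, as vmemmap_unuse_sub_pmd() does. */
        memset(pmd, SK_PAGE_UNUSED, SK_PMD_SIZE / 2);
        printf("after unuse:    free PMD? %d\n", is_all_unused(pmd, SK_PMD_SIZE));
        free(pmd);
        return 0;
}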
*/ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + memset((void *)start, PAGE_UNUSED, end - start); + return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long prot, pages = 0; + int ret = -ENOMEM; + pte_t *pte; + + prot = pgprot_val(PAGE_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_PAGE_NOEXEC; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + if (!direct) + vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + if (!direct) { + void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + + if (!new_page) + goto out; + set_pte(pte, __pte(__pa(new_page) | prot)); + } else { + set_pte(pte, __pte(__pa(addr) | prot)); + } + } else { continue; } - if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } - - pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = address | pgt_prot; - address += PAGE_SIZE; - pages4k++; + pages++; } ret = 0; out: - update_page_count(PG_DIRECT_MAP_4K, pages4k); - update_page_count(PG_DIRECT_MAP_1M, pages1m); - update_page_count(PG_DIRECT_MAP_2G, pages2g); + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); return ret; } -/* - * Remove a physical memory range from the 1:1 mapping. - * Currently only invalidates page table entries. - */ -static void vmem_remove_range(unsigned long start, unsigned long size) +static void try_free_pte_table(pmd_t *pmd, unsigned long start) { - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - address += PGDIR_SIZE; - continue; - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - address += P4D_SIZE; - continue; - } - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - address += PUD_SIZE; - continue; - } - if (pud_large(*pu_dir)) { - pud_clear(pu_dir); - address += PUD_SIZE; - pages2g++; - continue; - } - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - address += PMD_SIZE; - continue; - } - if (pmd_large(*pm_dir)) { - pmd_clear(pm_dir); - address += PMD_SIZE; - pages1m++; - continue; - } - pt_dir = pte_offset_kernel(pm_dir, address); - pte_clear(&init_mm, address, pt_dir); - address += PAGE_SIZE; - pages4k++; + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(*pte)) + return; } - flush_tlb_kernel_range(start, end); - update_page_count(PG_DIRECT_MAP_4K, -pages4k); - update_page_count(PG_DIRECT_MAP_1M, -pages1m); - update_page_count(PG_DIRECT_MAP_2G, -pages2g); + vmem_pte_free((unsigned long *) pmd_deref(*pmd)); + pmd_clear(pmd); } -/* - * Add a backed mem_map array to the virtual mem_map array. 
- */ -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) -{ - unsigned long pgt_prot, sgt_prot; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long next, prot, pages = 0; int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - for (address = start; address < end;) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } + prot = pgprot_val(SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_SEGMENT_ENTRY_NOEXEC; - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + pages++; + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + MACHINE_HAS_EDAT1 && direct && + !debug_pagealloc_enabled()) { + set_pmd(pmd, __pmd(__pa(addr) | prot)); + pages++; + continue; + } else if (!direct && MACHINE_HAS_EDAT1) { + void *new_page; - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + if (new_page) { + set_pmd(pmd, __pmd(__pa(new_page) | prot)); + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, next); + } + continue; + } + } + pte = vmem_pte_alloc(); + if (!pte) goto out; - pud_populate(&init_mm, pu_dir, pm_dir); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); + continue; } + ret = modify_pte_table(pmd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + return ret; +} - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - /* Use 1MB frames for vmemmap if available. We always - * use large frames even if they are only partially - * used. - * Otherwise we would have also page tables since - * vmemmap_populate gets called for each section - * separately. 
*/ - if (MACHINE_HAS_EDAT1) { - void *new_page; +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + pmd_t *pmd; + int i; + + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + pud_clear(pud); +} - new_page = vmemmap_alloc_block(PMD_SIZE, node); - if (!new_page) - goto out; - pmd_val(*pm_dir) = __pa(new_page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_REGION_ENTRY_NOEXEC; + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } + continue; + } + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + MACHINE_HAS_EDAT2 && direct && + !debug_pagealloc_enabled()) { + set_pud(pud, __pud(__pa(addr) | prot)); + pages++; continue; } - pt_dir = vmem_pte_alloc(); - if (!pt_dir) + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { continue; } + ret = modify_pmd_table(pud, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages); + return ret; +} - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *new_page; +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + pud_t *pud; + int i; + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + if (!pud_none(*pud)) + return; + } + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + p4d_clear(p4d); +} - new_page = vmemmap_alloc_block(PAGE_SIZE, node); - if (!new_page) +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) goto out; - pte_val(*pt_dir) = __pa(new_page) | pgt_prot; + p4d_populate(&init_mm, p4d, pud); } - address += PAGE_SIZE; + ret = modify_pud_table(p4d, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); } ret = 0; out: return ret; } -void vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, start); + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + if (!p4d_none(*p4d)) + return; + } + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + pgd_clear(pgd); } -/* - * Add memory segment to the segment list if it doesn't overlap with - * an already present segment. 
- */ -static int insert_memory_segment(struct memory_segment *seg) +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct) { - struct memory_segment *tmp; + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; + + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > VMALLOC_START)) + return -EINVAL; + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + ret = modify_p4d_table(pgd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); + } + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} - if (seg->start + seg->size > VMEM_MAX_PHYS || - seg->start + seg->size < seg->start) - return -ERANGE; +static int add_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, true, direct); +} - list_for_each_entry(tmp, &mem_segs, list) { - if (seg->start >= tmp->start + tmp->size) - continue; - if (seg->start + seg->size <= tmp->start) - continue; - return -ENOSPC; - } - list_add(&seg->list, &mem_segs); - return 0; +static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, false, direct); } /* - * Remove memory segment from the segment list. + * Add a physical memory range to the 1:1 mapping. */ -static void remove_memory_segment(struct memory_segment *seg) +static int vmem_add_range(unsigned long start, unsigned long size) { - list_del(&seg->list); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true); } -static void __remove_shared_memory(struct memory_segment *seg) +/* + * Remove a physical memory range from the 1:1 mapping. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_memory_segment(seg); - vmem_remove_range(seg->start, seg->size); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true); } -int vmem_remove_mapping(unsigned long start, unsigned long size) +/* + * Add a backed mem_map array to the virtual mem_map array. 
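/*
 * Illustration (not part of the patch): after this rework a single
 * modify_pagetable() walker serves both users, selected by two flags --
 * "add" picks populate vs. unpopulate, "direct" picks the kernel 1:1 mapping
 * (count PG_DIRECT_MAP_* pages, never free backing memory) vs. the vmemmap
 * (allocate and free the backing pages). vmemmap_populate() below is then
 * just a locked wrapper. A hedged, table-free model of that dispatch:
 */
#include <stdbool.h>
#include <stdio.h>

static int modify(unsigned long start, unsigned long end, bool add, bool direct)
{
        /* the real code walks pgd -> p4d -> pud -> pmd -> pte here */
        printf("%s %s mapping: %#lx-%#lx\n",
               add ? "add" : "remove",
               direct ? "direct (1:1)" : "vmemmap",
               start, end);
        return 0;
}

static int add_pagetable(unsigned long s, unsigned long e, bool direct)
{
        return modify(s, e, true, direct);
}

static int remove_pagetable(unsigned long s, unsigned long e, bool direct)
{
        return modify(s, e, false, direct);
}

int main(void)
{
        add_pagetable(0x1000, 0x2000, true);     /* vmem_add_range() path */
        add_pagetable(0x3000, 0x4000, false);    /* vmemmap_populate() path */
        remove_pagetable(0x3000, 0x4000, false); /* vmemmap_free() path */
        return 0;
}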
+ */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { - struct memory_segment *seg; int ret; mutex_lock(&vmem_mutex); + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + ret = add_pagetable(start, end, false); + if (ret) + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); + return ret; +} - ret = -ENOENT; - list_for_each_entry(seg, &mem_segs, list) { - if (seg->start == start && seg->size == size) - break; - } +#ifdef CONFIG_MEMORY_HOTPLUG - if (seg->start != start || seg->size != size) - goto out; +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + mutex_lock(&vmem_mutex); + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); +} - ret = 0; - __remove_shared_memory(seg); - kfree(seg); -out: +#endif + +void vmem_remove_mapping(unsigned long start, unsigned long size) +{ + mutex_lock(&vmem_mutex); + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); - return ret; +} + +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = max_mappable - 1; + return mhp_range; } int vmem_add_mapping(unsigned long start, unsigned long size) { - struct memory_segment *seg; + struct range range = arch_get_mappable_range(); int ret; - mutex_lock(&vmem_mutex); - ret = -ENOMEM; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - goto out; - seg->start = start; - seg->size = size; - - ret = insert_memory_segment(seg); - if (ret) - goto out_free; + if (start < range.start || + start + size > range.end + 1 || + start + size < start) + return -ERANGE; - ret = vmem_add_mem(start, size); + mutex_lock(&vmem_mutex); + ret = vmem_add_range(start, size); if (ret) - goto out_remove; - goto out; - -out_remove: - __remove_shared_memory(seg); -out_free: - kfree(seg); -out: + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); return ret; } /* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only. 
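/*
 * Illustration (not part of the patch): the vmem_add_mapping() bounds check
 * above guards against unsigned wrap-around with "start + size < start".
 * A standalone rendering of the same test (sk_ names are stand-ins; the
 * kernel's range end is inclusive, hence the "+ 1"):
 */
#include <stdbool.h>
#include <stdio.h>

struct sk_range { unsigned long start, end; };  /* end is inclusive */

static bool range_ok(struct sk_range r, unsigned long start, unsigned long size)
{
        if (start < r.start)
                return false;
        if (start + size < start)       /* unsigned overflow: size too large */
                return false;
        if (start + size > r.end + 1)
                return false;
        return true;
}

int main(void)
{
        struct sk_range r = { 0, ~0UL - 1 };    /* like arch_get_mappable_range() */

        printf("%d\n", range_ok(r, 0x1000, 0x1000));    /* 1: fits */
        printf("%d\n", range_ok(r, ~0UL, 0x1000));      /* 0: wraps around */
        return 0;
}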
*/ -void __init vmem_map_init(void) +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) { - struct memblock_region *reg; - - for_each_memblock(memory, reg) - vmem_add_mem(reg->base, reg->size); - __set_memory((unsigned long)_stext, - (unsigned long)(_etext - _stext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory((unsigned long)_etext, - (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, - SET_MEMORY_RO); - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - pr_info("Write protected kernel read-only data: %luk\n", - (unsigned long)(__end_rodata - _stext) >> 10); + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_large(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; } -/* - * Convert memblock.memory to a memory segment list so there is a single - * list that contains all memory segments. - */ -static int __init vmem_convert_memory_chunk(void) +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) { - struct memblock_region *reg; - struct memory_segment *seg; + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; mutex_lock(&vmem_mutex); - for_each_memblock(memory, reg) { - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - panic("Out of memory...\n"); - seg->start = reg->base; - seg->size = reg->size; - insert_memory_segment(seg); - } + rc = __vmem_map_4k_page(addr, phys, prot, true); mutex_unlock(&vmem_mutex); - return 0; + return rc; } -core_initcall(vmem_convert_memory_chunk); +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); + mutex_unlock(&vmem_mutex); +} + +void __init vmem_map_init(void) +{ + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(_sinittext, _einittext); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. 
+ */ + if (!static_key_enabled(&cpu_has_bear)) + set_memory_x(0, 1); + if (debug_pagealloc_enabled()) { + /* + * Use RELOC_HIDE() as long as __va(0) translates to NULL, + * since performing pointer arithmetic on a NULL pointer + * has undefined behavior and generates compiler warnings. + */ + __set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size)); + } + if (MACHINE_HAS_NX) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); + pr_info("Write protected kernel read-only data: %luk\n", + (unsigned long)(__end_rodata - _stext) >> 10); +} diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8d2134136290..b418333bb086 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -7,7 +7,6 @@ * - HAVE_MARCH_Z196_FEATURES: laal, laalg * - HAVE_MARCH_Z10_FEATURES: msfi, cgrj, clgrj * - HAVE_MARCH_Z9_109_FEATURES: alfi, llilf, clfi, oilf, nilf - * - PACK_STACK * - 64BIT * * Copyright IBM Corp. 2012,2015 @@ -26,10 +25,12 @@ #include <linux/mm.h> #include <linux/kernel.h> #include <asm/cacheflush.h> +#include <asm/extable.h> #include <asm/dis.h> #include <asm/facility.h> #include <asm/nospec-branch.h> #include <asm/set_memory.h> +#include <asm/text-patching.h> #include "bpf_jit.h" struct bpf_jit { @@ -49,13 +50,14 @@ struct bpf_jit { int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ - int labels[1]; /* Labels for local jumps */ + int excnt; /* Number of exception table entries */ + int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ + int prologue_plt; /* Start of prologue hotpatch PLT */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ #define SEEN_LITERAL BIT(1) /* code uses literals */ #define SEEN_FUNC BIT(2) /* calls C functions */ -#define SEEN_TAIL_CALL BIT(3) /* code uses tail calls */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM) /* @@ -68,6 +70,10 @@ struct bpf_jit { #define REG_0 REG_W0 /* Register 0 */ #define REG_1 REG_W1 /* Register 1 */ #define REG_2 BPF_REG_1 /* Register 2 */ +#define REG_3 BPF_REG_2 /* Register 3 */ +#define REG_4 BPF_REG_3 /* Register 4 */ +#define REG_7 BPF_REG_6 /* Register 7 */ +#define REG_8 BPF_REG_7 /* Register 8 */ #define REG_14 BPF_REG_0 /* Register 14 */ /* @@ -112,7 +118,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) { u32 r1 = reg2hex[b1]; - if (!jit->seen_reg[r1] && r1 >= 6 && r1 <= 15) + if (r1 >= 6 && r1 <= 15 && !jit->seen_reg[r1]) jit->seen_reg[r1] = 1; } @@ -228,18 +234,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) REG_SET_SEEN(b3); \ }) -#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \ +#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) -#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \ +#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -248,8 +254,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - /* Branch instruction needs 
6 bytes */ \ - int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\ + int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ @@ -489,21 +494,79 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) } while (re <= last); } +static void bpf_skip(struct bpf_jit *jit, int size) +{ + if (size >= 6 && !is_valid_rel(size)) { + /* brcl 0xf,size */ + EMIT6_PCREL_RIL(0xc0f4000000, size); + size -= 6; + } else if (size >= 4 && is_valid_rel(size)) { + /* brc 0xf,size */ + EMIT4_PCREL(0xa7f40000, size); + size -= 4; + } + while (size >= 2) { + /* bcr 0,%0 */ + _EMIT2(0x0700); + size -= 2; + } +} + +/* + * PLT for hotpatchable calls. The calling convention is the same as for the + * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered. + */ +extern const char bpf_plt[]; +extern const char bpf_plt_ret[]; +extern const char bpf_plt_target[]; +extern const char bpf_plt_end[]; +#define BPF_PLT_SIZE 32 +asm( + ".pushsection .rodata\n" + " .balign 8\n" + "bpf_plt:\n" + " lgrl %r0,bpf_plt_ret\n" + " lgrl %r1,bpf_plt_target\n" + " br %r1\n" + " .balign 8\n" + "bpf_plt_ret: .quad 0\n" + "bpf_plt_target: .quad 0\n" + "bpf_plt_end:\n" + " .popsection\n" +); + +static void bpf_jit_plt(void *plt, void *ret, void *target) +{ + memcpy(plt, bpf_plt, BPF_PLT_SIZE); + *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; + *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; +} + /* * Emit function prologue * * Save registers and create stack frame if necessary. - * See stack frame layout desription in "bpf_jit.h"! + * See stack frame layout description in "bpf_jit.h"! */ -static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) +static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, + u32 stack_depth) { - if (jit->seen & SEEN_TAIL_CALL) { + /* No-op for hotpatching */ + /* brcl 0,prologue_plt */ + EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt); + jit->prologue_plt_ret = jit->prg; + + if (!bpf_is_subprog(fp)) { + /* Initialize the tail call counter in the main program. */ /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT); } else { - /* j tail_call_start: NOP if no tail calls are used */ - EMIT4_PCREL(0xa7f40000, 6); - _EMIT2(0); + /* + * Skip the tail call counter initialization in subprograms. + * Insert nops in order to have tail_call_start at a + * predictable offset. + */ + bpf_skip(jit, 6); } /* Tail calls have to skip above initialization */ jit->tail_call_start = jit->prg; @@ -539,6 +602,43 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) } /* + * Emit an expoline for a jump that follows + */ +static void emit_expoline(struct bpf_jit *jit) +{ + /* exrl %r0,.+10 */ + EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); + /* j . 
*/ + EMIT4_PCREL(0xa7f40000, 0); +} + +/* + * Emit __s390_indirect_jump_r1 thunk if necessary + */ +static void emit_r1_thunk(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) { + jit->r1_thunk_ip = jit->prg; + emit_expoline(jit); + /* br %r1 */ + _EMIT2(0x07f1); + } +} + +/* + * Call r1 either directly or via __s390_indirect_jump_r1 thunk + */ +static void call_r1(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); + else + /* basr %r14,%r1 */ + EMIT2(0x0d00, REG_14, REG_1); +} + +/* * Function epilogue */ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) @@ -548,42 +648,124 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { + if (nospec_uses_trampoline()) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ - if (test_facility(35)) { - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - } else { - /* larl %r1,.+14 */ - EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14); - /* ex 0,0(%r1) */ - EMIT4_DISP(0x44000000, REG_0, REG_1, 0); - } - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); + emit_expoline(jit); } /* br %r14 */ _EMIT2(0x07fe); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && - (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) { - jit->r1_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r1 thunk */ - if (test_facility(35)) { - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); - /* br %r1 */ - _EMIT2(0x07f1); - } else { - /* ex 0,S390_lowcore.br_r1_tampoline */ - EMIT4_DISP(0x44000000, REG_0, REG_0, - offsetof(struct lowcore, br_r1_trampoline)); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); - } + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) + emit_r1_thunk(jit); + + jit->prg = ALIGN(jit->prg, 8); + jit->prologue_plt = jit->prg; + if (jit->prg_buf) + bpf_jit_plt(jit->prg_buf + jit->prg, + jit->prg_buf + jit->prologue_plt_ret, NULL); + jit->prg += BPF_PLT_SIZE; +} + +static int get_probe_mem_regno(const u8 *insn) +{ + /* + * insn must point to llgc, llgh, llgf, lg, lgb, lgh or lgf, which have + * destination register at the same position. + */ + if (insn[0] != 0xe3) /* common prefix */ + return -1; + if (insn[5] != 0x90 && /* llgc */ + insn[5] != 0x91 && /* llgh */ + insn[5] != 0x16 && /* llgf */ + insn[5] != 0x04 && /* lg */ + insn[5] != 0x77 && /* lgb */ + insn[5] != 0x15 && /* lgh */ + insn[5] != 0x14) /* lgf */ + return -1; + return insn[1] >> 4; +} + +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(x); + regs->gprs[x->data] = 0; + return true; +} + +static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, + int probe_prg, int nop_prg) +{ + struct exception_table_entry *ex; + int reg, prg; + s64 delta; + u8 *insn; + int i; + + if (!fp->aux->extable) + /* Do nothing during early JIT passes. */ + return 0; + insn = jit->prg_buf + probe_prg; + reg = get_probe_mem_regno(insn); + if (WARN_ON_ONCE(reg < 0)) + /* JIT bug - unexpected probe instruction. */ + return -1; + if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg)) + /* JIT bug - gap between probe and nop instructions. 
*/ + return -1; + for (i = 0; i < 2; i++) { + if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries)) + /* Verifier bug - not enough entries. */ + return -1; + ex = &fp->aux->extable[jit->excnt]; + /* Add extable entries for probe and nop instructions. */ + prg = i == 0 ? probe_prg : nop_prg; + delta = jit->prg_buf + prg - (u8 *)&ex->insn; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - code and extable must be close. */ + return -1; + ex->insn = delta; + /* + * Always land on the nop. Note that extable infrastructure + * ignores fixup field, it is handled by ex_handler_bpf(). + */ + delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - landing pad and extable must be close. */ + return -1; + ex->fixup = delta; + ex->type = EX_TYPE_BPF; + ex->data = reg; + jit->excnt++; + } + return 0; +} + +/* + * Sign-extend the register if necessary + */ +static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) +{ + if (!(flags & BTF_FMODEL_SIGNED_ARG)) + return 0; + + switch (size) { + case 1: + /* lgbr %r,%r */ + EMIT4(0xb9060000, r, r); + return 0; + case 2: + /* lghr %r,%r */ + EMIT4(0xb9070000, r, r); + return 0; + case 4: + /* lgfr %r,%r */ + EMIT4(0xb9140000, r, r); + return 0; + case 8: + return 0; + default: + return -1; } } @@ -594,30 +776,71 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) * stack space for the large switch statement. */ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, - int i, bool extra_pass) + int i, bool extra_pass, u32 stack_depth) { struct bpf_insn *insn = &fp->insnsi[i]; + s32 branch_oc_off = insn->off; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; int last, insn_count = 1; u32 *addrs = jit->addrs; s32 imm = insn->imm; s16 off = insn->off; + int probe_prg = -1; unsigned int mask; + int nop_prg; + int err; + + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) + probe_prg = jit->prg; switch (insn->code) { /* * BPF_MOV */ - case BPF_ALU | BPF_MOV | BPF_X: /* dst = (u32) src */ - /* llgfr %dst,%src */ - EMIT4(0xb9160000, dst_reg, src_reg); - if (insn_is_zext(&insn[1])) - insn_count = 2; + case BPF_ALU | BPF_MOV | BPF_X: + switch (insn->off) { + case 0: /* DST = (u32) SRC */ + /* llgfr %dst,%src */ + EMIT4(0xb9160000, dst_reg, src_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case 8: /* DST = (u32)(s8) SRC */ + /* lbr %dst,%src */ + EMIT4(0xb9260000, dst_reg, src_reg); + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + case 16: /* DST = (u32)(s16) SRC */ + /* lhr %dst,%src */ + EMIT4(0xb9270000, dst_reg, src_reg); + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + } break; - case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ - /* lgr %dst,%src */ - EMIT4(0xb9040000, dst_reg, src_reg); + case BPF_ALU64 | BPF_MOV | BPF_X: + switch (insn->off) { + case 0: /* DST = SRC */ + /* lgr %dst,%src */ + EMIT4(0xb9040000, dst_reg, src_reg); + break; + case 8: /* DST = (s8) SRC */ + /* lgbr %dst,%src */ + EMIT4(0xb9060000, dst_reg, src_reg); + break; + case 16: /* DST = (s16) SRC */ + /* lghr %dst,%src */ + EMIT4(0xb9070000, dst_reg, src_reg); + break; + case 32: /* DST = (s32) SRC */ + /* lgfr %dst,%src */ + EMIT4(0xb9140000, dst_reg, src_reg); + break; + } break; case BPF_ALU | BPF_MOV | BPF_K: /* dst = (u32) imm */ /* llilf %dst,imm */ @@ -656,10 +879,10 @@ static noinline int 
bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9080000, dst_reg, src_reg); break; case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */ - if (!imm) - break; - /* alfi %dst,imm */ - EMIT6_IMM(0xc20b0000, dst_reg, imm); + if (imm != 0) { + /* alfi %dst,imm */ + EMIT6_IMM(0xc20b0000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */ @@ -681,17 +904,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9090000, dst_reg, src_reg); break; case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */ - if (!imm) - break; - /* alfi %dst,-imm */ - EMIT6_IMM(0xc20b0000, dst_reg, -imm); + if (imm != 0) { + /* alfi %dst,-imm */ + EMIT6_IMM(0xc20b0000, dst_reg, -imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */ if (!imm) break; - /* agfi %dst,-imm */ - EMIT6_IMM(0xc2080000, dst_reg, -imm); + if (imm == -0x80000000) { + /* algfi %dst,0x80000000 */ + EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000); + } else { + /* agfi %dst,-imm */ + EMIT6_IMM(0xc2080000, dst_reg, -imm); + } break; /* * BPF_MUL @@ -706,10 +934,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb90c0000, dst_reg, src_reg); break; case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */ - if (imm == 1) - break; - /* msfi %r5,imm */ - EMIT6_IMM(0xc2010000, dst_reg, imm); + if (imm != 1) { + /* msfi %r5,imm */ + EMIT6_IMM(0xc2010000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */ @@ -721,64 +949,115 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* * BPF_DIV / BPF_MOD */ - case BPF_ALU | BPF_DIV | BPF_X: /* dst = (u32) dst / (u32) src */ - case BPF_ALU | BPF_MOD | BPF_X: /* dst = (u32) dst % (u32) src */ + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_X: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; - /* lhi %w0,0 */ - EMIT4_IMM(0xa7080000, REG_W0, 0); - /* lr %w1,%dst */ - EMIT2(0x1800, REG_W1, dst_reg); - /* dlr %w0,%src */ - EMIT4(0xb9970000, REG_W0, src_reg); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) src */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* dlr %w0,%src */ + EMIT4(0xb9970000, REG_W0, src_reg); + break; + case 1: /* dst = (u32) ((s32) dst {/,%} (s32) src) */ + /* lgfr %r1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* dsgfr %r0,%src */ + EMIT4(0xb91d0000, REG_W0, src_reg); + break; + } /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); if (insn_is_zext(&insn[1])) insn_count = 2; break; } - case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */ - case BPF_ALU64 | BPF_MOD | BPF_X: /* dst = dst % src */ + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_X: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? 
REG_W1 : REG_W0; - /* lghi %w0,0 */ - EMIT4_IMM(0xa7090000, REG_W0, 0); - /* lgr %w1,%dst */ - EMIT4(0xb9040000, REG_W1, dst_reg); - /* dlgr %w0,%dst */ - EMIT4(0xb9870000, REG_W0, src_reg); + switch (off) { + case 0: /* dst = dst {/,%} src */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dlgr %w0,%src */ + EMIT4(0xb9870000, REG_W0, src_reg); + break; + case 1: /* dst = (s64) dst {/,%} (s64) src */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dsgr %w0,%src */ + EMIT4(0xb90d0000, REG_W0, src_reg); + break; + } /* lgr %dst,%rc */ EMIT4(0xb9040000, dst_reg, rc_reg); break; } - case BPF_ALU | BPF_DIV | BPF_K: /* dst = (u32) dst / (u32) imm */ - case BPF_ALU | BPF_MOD | BPF_K: /* dst = (u32) dst % (u32) imm */ + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; if (imm == 1) { if (BPF_OP(insn->code) == BPF_MOD) - /* lhgi %dst,0 */ + /* lghi %dst,0 */ EMIT4_IMM(0xa7090000, dst_reg, 0); + else + EMIT_ZERO(dst_reg); break; } - /* lhi %w0,0 */ - EMIT4_IMM(0xa7080000, REG_W0, 0); - /* lr %w1,%dst */ - EMIT2(0x1800, REG_W1, dst_reg); if (!is_first_pass(jit) && can_use_ldisp_for_lit32(jit)) { - /* dl %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L, - EMIT_CONST_U32(imm)); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) imm */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* dl %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, + REG_L, EMIT_CONST_U32(imm)); + break; + case 1: /* dst = (s32) dst {/,%} (s32) imm */ + /* lgfr %r1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* dsgf %r0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x001d, REG_W0, REG_0, + REG_L, EMIT_CONST_U32(imm)); + break; + } } else { - /* lgfrl %dst,imm */ - EMIT6_PCREL_RILB(0xc40c0000, dst_reg, - _EMIT_CONST_U32(imm)); - jit->seen |= SEEN_LITERAL; - /* dlr %w0,%dst */ - EMIT4(0xb9970000, REG_W0, dst_reg); + switch (off) { + case 0: /* dst = (u32) dst {/,%} (u32) imm */ + /* xr %w0,%w0 */ + EMIT2(0x1700, REG_W0, REG_W0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* lrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40d0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dlr %w0,%dst */ + EMIT4(0xb9970000, REG_W0, dst_reg); + break; + case 1: /* dst = (s32) dst {/,%} (s32) imm */ + /* lgfr %w1,%dst */ + EMIT4(0xb9140000, REG_W1, dst_reg); + /* lgfrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40c0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dsgr %w0,%dst */ + EMIT4(0xb90d0000, REG_W0, dst_reg); + break; + } } /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); @@ -786,8 +1065,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; } - case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */ - case BPF_ALU64 | BPF_MOD | BPF_K: /* dst = dst % imm */ + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? 
REG_W1 : REG_W0; @@ -797,21 +1076,50 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7090000, dst_reg, 0); break; } - /* lghi %w0,0 */ - EMIT4_IMM(0xa7090000, REG_W0, 0); - /* lgr %w1,%dst */ - EMIT4(0xb9040000, REG_W1, dst_reg); if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { - /* dlg %w0,<d(imm)>(%l) */ - EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L, - EMIT_CONST_U64(imm)); + switch (off) { + case 0: /* dst = dst {/,%} imm */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dlg %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, + REG_L, EMIT_CONST_U64(imm)); + break; + case 1: /* dst = (s64) dst {/,%} (s64) imm */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dsg %w0,<d(imm)>(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x000d, REG_W0, REG_0, + REG_L, EMIT_CONST_U64(imm)); + break; + } } else { - /* lgrl %dst,imm */ - EMIT6_PCREL_RILB(0xc4080000, dst_reg, - _EMIT_CONST_U64(imm)); - jit->seen |= SEEN_LITERAL; - /* dlgr %w0,%dst */ - EMIT4(0xb9870000, REG_W0, dst_reg); + switch (off) { + case 0: /* dst = dst {/,%} imm */ + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dlgr %w0,%dst */ + EMIT4(0xb9870000, REG_W0, dst_reg); + break; + case 1: /* dst = (s64) dst {/,%} (s64) imm */ + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dsgr %w0,%dst */ + EMIT4(0xb90d0000, REG_W0, dst_reg); + break; + } } /* lgr %dst,%rc */ EMIT4(0xb9040000, dst_reg, rc_reg); @@ -894,10 +1202,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9820000, dst_reg, src_reg); break; case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */ - if (!imm) - break; - /* xilf %dst,imm */ - EMIT6_IMM(0xc0070000, dst_reg, imm); + if (imm != 0) { + /* xilf %dst,imm */ + EMIT6_IMM(0xc0070000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */ @@ -928,10 +1236,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */ - if (imm == 0) - break; - /* sll %dst,imm(%r0) */ - EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sll %dst,imm(%r0) */ + EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */ @@ -953,10 +1261,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */ - if (imm == 0) - break; - /* srl %dst,imm(%r0) */ - EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* srl %dst,imm(%r0) */ + EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */ @@ -978,10 +1286,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> 
imm */ - if (imm == 0) - break; - /* sra %dst,imm(%r0) */ - EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sra %dst,imm(%r0) */ + EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */ @@ -1024,6 +1332,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } break; case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU64 | BPF_END | BPF_FROM_LE: switch (imm) { case 16: /* dst = (u16) cpu_to_le16(dst) */ /* lrvr %dst,%dst */ @@ -1049,6 +1358,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } break; /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */ @@ -1100,45 +1414,118 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen |= SEEN_MEM; break; /* - * BPF_STX XADD (atomic_add) + * BPF_ATOMIC */ - case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */ - /* laal %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg, - dst_reg, off); - jit->seen |= SEEN_MEM; - break; - case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */ - /* laalg %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg, - dst_reg, off); + case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + { + bool is32 = BPF_SIZE(insn->code) == BPF_W; + + switch (insn->imm) { +/* {op32|op64} {%w0|%src},%src,off(%dst) */ +#define EMIT_ATOMIC(op32, op64) do { \ + EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \ + (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \ + src_reg, dst_reg, off); \ + if (is32 && (insn->imm & BPF_FETCH)) \ + EMIT_ZERO(src_reg); \ +} while (0) + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + /* {laal|laalg} */ + EMIT_ATOMIC(0x00fa, 0x00ea); + break; + case BPF_AND: + case BPF_AND | BPF_FETCH: + /* {lan|lang} */ + EMIT_ATOMIC(0x00f4, 0x00e4); + break; + case BPF_OR: + case BPF_OR | BPF_FETCH: + /* {lao|laog} */ + EMIT_ATOMIC(0x00f6, 0x00e6); + break; + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + /* {lax|laxg} */ + EMIT_ATOMIC(0x00f7, 0x00e7); + break; +#undef EMIT_ATOMIC + case BPF_XCHG: + /* {ly|lg} %w0,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, + is32 ? 0x0058 : 0x0004, REG_W0, REG_0, + dst_reg, off); + /* 0: {csy|csg} %w0,%src,off(%dst) */ + EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030, + REG_W0, src_reg, dst_reg, off); + /* brc 4,0b */ + EMIT4_PCREL_RIC(0xa7040000, 4, jit->prg - 6); + /* {llgfr|lgr} %src,%w0 */ + EMIT4(is32 ? 0xb9160000 : 0xb9040000, src_reg, REG_W0); + if (is32 && insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_CMPXCHG: + /* 0: {csy|csg} %b0,%src,off(%dst) */ + EMIT6_DISP_LH(0xeb000000, is32 ? 
0x0014 : 0x0030, + BPF_REG_0, src_reg, dst_reg, off); + break; + default: + pr_err("Unknown atomic operation %02x\n", insn->imm); + return -1; + } + jit->seen |= SEEN_MEM; break; + } /* * BPF_LDX */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_B: /* llgc %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_B: /* dst = *(s8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: + /* lgb %dst,0(off,%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0077, dst_reg, src_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_H: /* llgh %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_H: /* dst = *(s16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: + /* lgh %dst,0(off,%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0015, dst_reg, src_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_W: /* llgf %dst,off(%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); if (insn_is_zext(&insn[1])) insn_count = 2; break; + case BPF_LDX | BPF_MEMSX | BPF_W: /* dst = *(s32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: + /* lgf %dst,off(%src) */ + jit->seen |= SEEN_MEM; + EMIT6_DISP_LH(0xe3000000, 0x0014, dst_reg, src_reg, REG_0, off); + break; case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: /* lg %dst,0(off,%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off); @@ -1148,9 +1535,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, */ case BPF_JMP | BPF_CALL: { - u64 func; + const struct btf_func_model *m; bool func_addr_fixed; - int ret; + int j, ret; + u64 func; ret = bpf_jit_get_func_addr(fp, insn, extra_pass, &func, &func_addr_fixed); @@ -1159,29 +1547,51 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* + * Copy the tail call counter to where the callee expects it. + * + * Note 1: The callee can increment the tail call counter, but + * we do not load it back, since the x86 JIT does not do this + * either. + * + * Note 2: We assume that the verifier does not let us call the + * main program, which clears the tail call counter on entry. + */ + /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, + 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth)); + + /* Sign-extend the kfunc arguments. 
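/*
 * Illustration (not part of the patch): the sign_extend() helper above emits
 * lgbr/lghr/lgfr to widen sub-64-bit signed kfunc arguments to full
 * registers. In C, the same effect is a cast through the matching signed
 * type; a hedged user-space model:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t sign_extend64(uint64_t v, unsigned int size)
{
        switch (size) {
        case 1: return (uint64_t)(int64_t)(int8_t)v;    /* lgbr */
        case 2: return (uint64_t)(int64_t)(int16_t)v;   /* lghr */
        case 4: return (uint64_t)(int64_t)(int32_t)v;   /* lgfr */
        default: return v;                              /* already 64 bit */
        }
}

int main(void)
{
        /* 0x80 as a signed byte is -128; widening must propagate the sign */
        printf("%#llx\n", (unsigned long long)sign_extend64(0x80, 1));
        /* -> 0xffffffffffffff80, matching what lgbr leaves in the register */
        return 0;
}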
*/ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + m = bpf_jit_find_kfunc_model(fp, insn); + if (!m) + return -1; + + for (j = 0; j < m->nr_args; j++) { + if (sign_extend(jit, BPF_REG_1 + j, + m->arg_size[j], + m->arg_flags[j])) + return -1; + } + } + /* lgrl %w1,func */ EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); - } else { - /* basr %r14,%w1 */ - EMIT2(0x0d00, REG_14, REG_W1); - } + /* %r1() */ + call_r1(jit); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); break; } - case BPF_JMP | BPF_TAIL_CALL: + case BPF_JMP | BPF_TAIL_CALL: { + int patch_1_clrj, patch_2_clij, patch_3_brc; + /* * Implicit input: * B1: pointer to ctx * B2: pointer to bpf_array * B3: index in bpf_array - */ - jit->seen |= SEEN_TAIL_CALL; - - /* + * * if (index >= array->map.max_entries) * goto out; */ @@ -1190,40 +1600,28 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, offsetof(struct bpf_array, map.max_entries)); /* if ((u32)%b3 >= (u32)%w1) goto out; */ - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clrj %b3,%w1,0xa,label0 */ - EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, - REG_W1, 0, 0xa); - } else { - /* clr %b3,%w1 */ - EMIT2(0x1500, BPF_REG_3, REG_W1); - /* brcl 0xa,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]); - } + /* clrj %b3,%w1,0xa,out */ + patch_1_clrj = jit->prg; + EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa, + jit->prg); /* - * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ if (jit->seen & SEEN_STACK) - off = STK_OFF_TCCNT + STK_OFF + fp->aux->stack_depth; + off = STK_OFF_TCCNT + STK_OFF + stack_depth; else off = STK_OFF_TCCNT; /* lhi %w0,1 */ EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */ - EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1, - MAX_TAIL_CALL_CNT, 0, 0x2); - } else { - /* clfi %w1,MAX_TAIL_CALL_CNT */ - EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT); - /* brcl 0x2,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]); - } + /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */ + patch_2_clij = jit->prg; + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1, + 2, jit->prg); /* * prog = array->ptrs[index]; @@ -1238,18 +1636,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* ltg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2, REG_1, offsetof(struct bpf_array, ptrs)); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* brc 0x8,label0 */ - EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]); - } else { - /* brcl 0x8,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]); - } + /* brc 0x8,out */ + patch_3_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); /* * Restore registers before calling function */ - save_restore_regs(jit, REGS_RESTORE, fp->aux->stack_depth); + save_restore_regs(jit, REGS_RESTORE, stack_depth); /* * goto *(prog->bpf_func + tail_call_start); @@ -1258,17 +1652,37 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* lg %r1,bpf_func(%r1) */ 
EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0, offsetof(struct bpf_prog, bpf_func)); - /* bc 0xf,tail_call_start(%r1) */ - _EMIT4(0x47f01000 + jit->tail_call_start); + if (nospec_uses_trampoline()) { + jit->seen |= SEEN_FUNC; + /* aghi %r1,tail_call_start */ + EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); + /* brcl 0xf,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip); + } else { + /* bc 0xf,tail_call_start(%r1) */ + _EMIT4(0x47f01000 + jit->tail_call_start); + } /* out: */ - jit->labels[0] = jit->prg; + if (jit->prg_buf) { + *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = + (jit->prg - patch_1_clrj) >> 1; + *(u16 *)(jit->prg_buf + patch_2_clij + 2) = + (jit->prg - patch_2_clij) >> 1; + *(u16 *)(jit->prg_buf + patch_3_brc + 2) = + (jit->prg - patch_3_brc) >> 1; + } break; + } case BPF_JMP | BPF_EXIT: /* return b0 */ last = (i == fp->len - 1) ? 1 : 0; if (last) break; - /* j <exit> */ - EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg); + if (!is_first_pass(jit) && can_use_rel(jit, jit->exit_ip)) + /* brc 0xf, <exit> */ + EMIT4_PCREL_RIC(0xa7040000, 0xf, jit->exit_ip); + else + /* brcl 0xf, <exit> */ + EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->exit_ip); break; /* * Branch relative (number of skipped instructions) to offset on @@ -1290,6 +1704,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * instruction itself (loop) and for BPF with offset 0 we * branch to the instruction behind the branch. */ + case BPF_JMP32 | BPF_JA: /* if (true) */ + branch_oc_off = imm; + fallthrough; case BPF_JMP | BPF_JA: /* if (true) */ mask = 0xf000; /* j */ goto branch_oc; @@ -1416,21 +1833,10 @@ branch_ks: } break; branch_ku: - is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - /* clfi or clgfi %dst,imm */ - EMIT6_IMM(is_jmp32 ? 0xc20f0000 : 0xc20e0000, - dst_reg, imm); - if (!is_first_pass(jit) && - can_use_rel(jit, addrs[i + off + 1])) { - /* brc mask,off */ - EMIT4_PCREL_RIC(0xa7040000, - mask >> 12, addrs[i + off + 1]); - } else { - /* brcl mask,off */ - EMIT6_PCREL_RILC(0xc0040000, - mask >> 12, addrs[i + off + 1]); - } - break; + /* lgfi %w1,imm (load sign extend imm) */ + src_reg = REG_1; + EMIT6_IMM(0xc0010000, src_reg, imm); + goto branch_xu; branch_xs: is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (!is_first_pass(jit) && @@ -1469,14 +1875,16 @@ branch_xu: break; branch_oc: if (!is_first_pass(jit) && - can_use_rel(jit, addrs[i + off + 1])) { + can_use_rel(jit, addrs[i + branch_oc_off + 1])) { /* brc mask,off */ EMIT4_PCREL_RIC(0xa7040000, - mask >> 12, addrs[i + off + 1]); + mask >> 12, + addrs[i + branch_oc_off + 1]); } else { /* brcl mask,off */ EMIT6_PCREL_RILC(0xc0040000, - mask >> 12, addrs[i + off + 1]); + mask >> 12, + addrs[i + branch_oc_off + 1]); } break; } @@ -1484,6 +1892,23 @@ branch_oc: pr_err("Unknown opcode %02x\n", insn->code); return -1; } + + if (probe_prg != -1) { + /* + * Handlers of certain exceptions leave psw.addr pointing to + * the instruction directly after the failing one. Therefore, + * create two exception table entries and also add a nop in + * case two probing instructions come directly after each + * other. 
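The asymmetry called out in this comment is why one probing load produces two exception-table records: depending on the exception type, the low-level handler may report psw.addr as either the failing instruction or the one after it, and the trailing bcr 0,%0 keeps two adjacent probes from ever sharing that second address. A flat model of the pair that bpf_jit_probe_mem() (its body is outside this hunk) sets up - the real s390 entries store relative offsets rather than absolute addresses:

struct ex_sketch {
	unsigned long insn;	/* address a fault may be reported at */
	unsigned long fixup;	/* where execution resumes afterwards */
};

static void probe_mem_entries(unsigned long probe_prg,
			      unsigned long nop_prg,
			      struct ex_sketch out[2])
{
	/* Fault reported at the probing load itself. */
	out[0] = (struct ex_sketch){ .insn = probe_prg, .fixup = nop_prg };
	/* Fault reported at the address directly after the load. */
	out[1] = (struct ex_sketch){ .insn = nop_prg, .fixup = nop_prg };
}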
+ */ + nop_prg = jit->prg; + /* bcr 0,%0 */ + _EMIT2(0x0700); + err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg); + if (err < 0) + return err; + } + return insn_count; } @@ -1509,7 +1934,14 @@ static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i) */ static int bpf_set_addr(struct bpf_jit *jit, int i) { - if (!bpf_is_new_addr_sane(jit, i)) + int delta; + + if (is_codegen_pass(jit)) { + delta = jit->prg - jit->addrs[i]; + if (delta < 0) + bpf_skip(jit, -delta); + } + if (WARN_ON_ONCE(!bpf_is_new_addr_sane(jit, i))) return -1; jit->addrs[i] = jit->prg; return 0; @@ -1519,26 +1951,27 @@ static int bpf_set_addr(struct bpf_jit *jit, int i) * Compile eBPF program into s390x code */ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, - bool extra_pass) + bool extra_pass, u32 stack_depth) { int i, insn_count, lit32_size, lit64_size; jit->lit32 = jit->lit32_start; jit->lit64 = jit->lit64_start; jit->prg = 0; + jit->excnt = 0; - bpf_jit_prologue(jit, fp->aux->stack_depth); + bpf_jit_prologue(jit, fp, stack_depth); if (bpf_set_addr(jit, 0) < 0) return -1; for (i = 0; i < fp->len; i += insn_count) { - insn_count = bpf_jit_insn(jit, fp, i, extra_pass); + insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth); if (insn_count < 0) return -1; /* Next instruction address */ if (bpf_set_addr(jit, i + insn_count) < 0) return -1; } - bpf_jit_epilogue(jit, fp->aux->stack_depth); + bpf_jit_epilogue(jit, stack_depth); lit32_size = jit->lit32 - jit->lit32_start; lit64_size = jit->lit64 - jit->lit64_start; @@ -1550,6 +1983,12 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit64_start = ALIGN(jit->lit64_start, 8); jit->size = jit->lit64_start + lit64_size; jit->size_prg = jit->prg; + + if (WARN_ON_ONCE(fp->aux->extable && + jit->excnt != fp->aux->num_exentries)) + /* Verifier bug - too many entries. */ + return -1; + return 0; } @@ -1564,11 +2003,35 @@ struct s390_jit_data { int pass; }; +static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, + struct bpf_prog *fp) +{ + struct bpf_binary_header *header; + u32 extable_size; + u32 code_size; + + /* We need two entries per insn. 
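The doubling at the start of bpf_jit_alloc() below is the allocation-side counterpart of those paired entries. The image layout is: JITed code, padded up to the alignment of struct exception_table_entry, followed by the table itself. A standalone recomputation of the size (the 8-byte stand-in reflects the two relative int fields; exact entry layout varies by kernel version):

#include <stddef.h>

struct ex_entry_sketch { int insn, fixup; };	/* stand-in entry */

static size_t image_size(size_t code_bytes, size_t num_exentries)
{
	size_t a = _Alignof(struct ex_entry_sketch);
	size_t code_size = (code_bytes + a - 1) / a * a;   /* roundup() */

	num_exentries *= 2;	/* two entries per probing instruction */
	return code_size + num_exentries * sizeof(struct ex_entry_sketch);
}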
*/ + fp->aux->num_exentries *= 2; + + code_size = roundup(jit->size, + __alignof__(struct exception_table_entry)); + extable_size = fp->aux->num_exentries * + sizeof(struct exception_table_entry); + header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf, + 8, jit_fill_hole); + if (!header) + return NULL; + fp->aux->extable = (struct exception_table_entry *) + (jit->prg_buf + code_size); + return header; +} + /* * Compile eBPF program "fp" */ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { + u32 stack_depth = round_up(fp->aux->stack_depth, 8); struct bpf_prog *tmp, *orig_fp = fp; struct bpf_binary_header *header; struct s390_jit_data *jit_data; @@ -1577,6 +2040,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; + if (WARN_ON_ONCE(bpf_plt_end - bpf_plt != BPF_PLT_SIZE)) + return orig_fp; + if (!fp->jit_requested) return orig_fp; @@ -1613,15 +2079,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); if (jit.addrs == NULL) { fp = orig_fp; - goto out; + goto free_addrs; } /* * Three initial passes: * - 1/2: Determine clobbered registers - * - 3: Calculate program size and addrs arrray + * - 3: Calculate program size and addrs array */ for (pass = 1; pass <= 3; pass++) { - if (bpf_jit_prog(&jit, fp, extra_pass)) { + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { fp = orig_fp; goto free_addrs; } @@ -1629,13 +2095,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* * Final pass: Allocate and generate program */ - header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole); + header = bpf_jit_alloc(&jit, fp); if (!header) { fp = orig_fp; goto free_addrs; } skip_init_ctx: - if (bpf_jit_prog(&jit, fp, extra_pass)) { + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { bpf_jit_binary_free(header); fp = orig_fp; goto free_addrs; @@ -1668,3 +2134,556 @@ out: tmp : orig_fp); return fp; } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + +bool bpf_jit_supports_far_kfunc_call(void) +{ + return true; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + struct { + u16 opc; + s32 disp; + } __packed insn; + char expected_plt[BPF_PLT_SIZE]; + char current_plt[BPF_PLT_SIZE]; + char new_plt[BPF_PLT_SIZE]; + char *plt; + char *ret; + int err; + + /* Verify the branch to be patched. */ + err = copy_from_kernel_nofault(&insn, ip, sizeof(insn)); + if (err < 0) + return err; + if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0))) + return -EINVAL; + + if (t == BPF_MOD_JUMP && + insn.disp == ((char *)new_addr - (char *)ip) >> 1) { + /* + * The branch already points to the destination, + * there is no PLT. + */ + } else { + /* Verify the PLT. */ + plt = (char *)ip + (insn.disp << 1); + err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); + if (err < 0) + return err; + ret = (char *)ip + 6; + bpf_jit_plt(expected_plt, ret, old_addr); + if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) + return -EINVAL; + /* Adjust the call address. */ + bpf_jit_plt(new_plt, ret, new_addr); + s390_kernel_write(plt + (bpf_plt_target - bpf_plt), + new_plt + (bpf_plt_target - bpf_plt), + sizeof(void *)); + } + + /* Adjust the mask of the branch. */ + insn.opc = 0xc004 | (new_addr ? 0xf0 : 0); + s390_kernel_write((char *)ip + 1, (char *)&insn.opc + 1, 1); + + /* Make the new code visible to the other CPUs. 
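The patching above leans on two properties of the 6-byte brcl (RIL-c) instruction: its condition mask occupies a fixed nibble in the second opcode byte - 0xc0f4 is "branch always", 0xc004 is "branch never", i.e. a nop that still carries a target - and its 32-bit displacement counts halfwords, hence the >> 1. A freestanding encoder mirroring those rules (illustrative only, not kernel code):

#include <stdint.h>

/* Encode "brcl mask,target" at address ip into buf. */
static void encode_brcl(uint8_t buf[6], unsigned int mask,
			long ip, long target)
{
	uint16_t opc = 0xc004 | (mask << 4);	/* mask 0xf: always taken */
	int32_t disp = (int32_t)((target - ip) >> 1);	/* in halfwords */

	buf[0] = opc >> 8;	/* 0xc0 */
	buf[1] = opc & 0xff;	/* mask nibble lives here */
	buf[2] = disp >> 24;
	buf[3] = disp >> 16;
	buf[4] = disp >> 8;
	buf[5] = disp;
}

Arming or disarming the branch never touches the displacement, which is why the single-byte s390_kernel_write() on the mask byte in the hunk is sufficient.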
*/ + text_poke_sync_lock(); + + return 0; +} + +struct bpf_tramp_jit { + struct bpf_jit common; + int orig_stack_args_off;/* Offset of arguments placed on stack by the + * func_addr's original caller + */ + int stack_size; /* Trampoline stack size */ + int backchain_off; /* Offset of backchain */ + int stack_args_off; /* Offset of stack arguments for calling + * func_addr, has to be at the top + */ + int reg_args_off; /* Offset of register arguments for calling + * func_addr + */ + int ip_off; /* For bpf_get_func_ip(), has to be at + * (ctx - 16) + */ + int arg_cnt_off; /* For bpf_get_func_arg_cnt(), has to be at + * (ctx - 8) + */ + int bpf_args_off; /* Offset of BPF_PROG context, which consists + * of BPF arguments followed by return value + */ + int retval_off; /* Offset of return value (see above) */ + int r7_r8_off; /* Offset of saved %r7 and %r8, which are used + * for __bpf_prog_enter() return value and + * func_addr respectively + */ + int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */ + int tccnt_off; /* Offset of saved tailcall counter */ + int r14_off; /* Offset of saved %r14, has to be at the + * bottom */ + int do_fexit; /* do_fexit: label */ +}; + +static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val) +{ + /* llihf %dst_reg,val_hi */ + EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32)); + /* oilf %rdst_reg,val_lo */ + EMIT6_IMM(0xc00d0000, dst_reg, val); +} + +static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + struct bpf_tramp_link *tlink, bool save_ret) +{ + struct bpf_jit *jit = &tjit->common; + int cookie_off = tjit->run_ctx_off + + offsetof(struct bpf_tramp_run_ctx, bpf_cookie); + struct bpf_prog *p = tlink->link.prog; + int patch; + + /* + * run_ctx.cookie = tlink->cookie; + */ + + /* %r0 = tlink->cookie */ + load_imm64(jit, REG_W0, tlink->cookie); + /* stg %r0,cookie_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off); + + /* + * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) + * goto skip; + */ + + /* %r1 = __bpf_prog_enter */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* la %r3,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + /* ltgr %r7,%r2 */ + EMIT4(0xb9020000, REG_7, REG_2); + /* brcl 8,skip */ + patch = jit->prg; + EMIT6_PCREL_RILC(0xc0040000, 8, 0); + + /* + * retval = bpf_func(args, p->insnsi); + */ + + /* %r1 = p->bpf_func */ + load_imm64(jit, REG_1, (u64)p->bpf_func); + /* la %r2,bpf_args_off(%r15) */ + EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); + /* %r3 = p->insnsi */ + if (!p->jited) + load_imm64(jit, REG_3, (u64)p->insnsi); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + if (save_ret) { + if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) + return -1; + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + } + + /* skip: */ + if (jit->prg_buf) + *(u32 *)&jit->prg_buf[patch + 2] = (jit->prg - patch) >> 1; + + /* + * __bpf_prog_exit(p, start, &run_ctx); + */ + + /* %r1 = __bpf_prog_exit */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* lgr %r3,%r7 */ + EMIT4(0xb9040000, REG_3, REG_7); + /* la %r4,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + + return 0; +} + +static int alloc_stack(struct bpf_tramp_jit *tjit, size_t size) +{ + int stack_offset = 
tjit->stack_size; + + tjit->stack_size += size; + return stack_offset; +} + +/* ABI uses %r2 - %r6 for parameter passing. */ +#define MAX_NR_REG_ARGS 5 + +/* The "L" field of the "mvc" instruction is 8 bits. */ +#define MAX_MVC_SIZE 256 +#define MAX_NR_STACK_ARGS (MAX_MVC_SIZE / sizeof(u64)) + +/* -mfentry generates a 6-byte nop on s390x. */ +#define S390X_PATCH_SIZE 6 + +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, + struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + int nr_bpf_args, nr_reg_args, nr_stack_args; + struct bpf_jit *jit = &tjit->common; + int arg, bpf_arg_off; + int i, j; + + /* Support as many stack arguments as "mvc" instruction can handle. */ + nr_reg_args = min_t(int, m->nr_args, MAX_NR_REG_ARGS); + nr_stack_args = m->nr_args - nr_reg_args; + if (nr_stack_args > MAX_NR_STACK_ARGS) + return -ENOTSUPP; + + /* Return to %r14, since func_addr and %r0 are not available. */ + if ((!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) || + (flags & BPF_TRAMP_F_INDIRECT)) + flags |= BPF_TRAMP_F_SKIP_FRAME; + + /* + * Compute how many arguments we need to pass to BPF programs. + * BPF ABI mirrors that of x86_64: arguments that are 16 bytes or + * smaller are packed into 1 or 2 registers; larger arguments are + * passed via pointers. + * In s390x ABI, arguments that are 8 bytes or smaller are packed into + * a register; larger arguments are passed via pointers. + * We need to deal with this difference. + */ + nr_bpf_args = 0; + for (i = 0; i < m->nr_args; i++) { + if (m->arg_size[i] <= 8) + nr_bpf_args += 1; + else if (m->arg_size[i] <= 16) + nr_bpf_args += 2; + else + return -ENOTSUPP; + } + + /* + * Calculate the stack layout. + */ + + /* + * Allocate STACK_FRAME_OVERHEAD bytes for the callees. As the s390x + * ABI requires, put our backchain at the end of the allocated memory. + */ + tjit->stack_size = STACK_FRAME_OVERHEAD; + tjit->backchain_off = tjit->stack_size - sizeof(u64); + tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64)); + tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64)); + tjit->ip_off = alloc_stack(tjit, sizeof(u64)); + tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64)); + tjit->retval_off = alloc_stack(tjit, sizeof(u64)); + tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64)); + tjit->run_ctx_off = alloc_stack(tjit, + sizeof(struct bpf_tramp_run_ctx)); + tjit->tccnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->r14_off = alloc_stack(tjit, sizeof(u64) * 2); + /* + * In accordance with the s390x ABI, the caller has allocated + * STACK_FRAME_OVERHEAD bytes for us. 8 of them contain the caller's + * backchain, and the rest we can use. 
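alloc_stack(), defined at the top of this hunk, is a plain bump allocator: each call returns the current frame size as the new field's offset and grows the frame by the requested amount. A toy rerun outside the kernel (STACK_FRAME_OVERHEAD is 160 bytes on s390x; the field sizes are examples):

#include <stdio.h>

static int stack_size = 160;	/* STACK_FRAME_OVERHEAD */

static int alloc_stack(int size)
{
	int off = stack_size;

	stack_size += size;
	return off;
}

int main(void)
{
	printf("reg_args_off = %d\n", alloc_stack(5 * 8));
	printf("ip_off       = %d\n", alloc_stack(8));
	printf("arg_cnt_off  = %d\n", alloc_stack(8));
	printf("frame size   = %d\n", stack_size);
	return 0;
}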
+ */ + tjit->stack_size -= STACK_FRAME_OVERHEAD - sizeof(u64); + tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD; + + /* lgr %r1,%r15 */ + EMIT4(0xb9040000, REG_1, REG_15); + /* aghi %r15,-stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size); + /* stg %r1,backchain_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15, + tjit->backchain_off); + /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | (tjit->stack_size + STK_OFF_TCCNT)); + /* stmg %r2,%rN,fwd_reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + for (i = 0, j = 0; i < m->nr_args; i++) { + if (i < MAX_NR_REG_ARGS) + arg = REG_2 + i; + else + arg = tjit->orig_stack_args_off + + (i - MAX_NR_REG_ARGS) * sizeof(u64); + bpf_arg_off = tjit->bpf_args_off + j * sizeof(u64); + if (m->arg_size[i] <= 8) { + if (i < MAX_NR_REG_ARGS) + /* stg %arg,bpf_arg_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, arg, + REG_0, REG_15, bpf_arg_off); + else + /* mvc bpf_arg_off(8,%r15),arg(%r15) */ + _EMIT6(0xd207f000 | bpf_arg_off, + 0xf000 | arg); + j += 1; + } else { + if (i < MAX_NR_REG_ARGS) { + /* mvc bpf_arg_off(16,%r15),0(%arg) */ + _EMIT6(0xd20ff000 | bpf_arg_off, + reg2hex[arg] << 12); + } else { + /* lg %r1,arg(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_0, + REG_15, arg); + /* mvc bpf_arg_off(16,%r15),0(%r1) */ + _EMIT6(0xd20ff000 | bpf_arg_off, 0x1000); + } + j += 2; + } + } + /* stmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* stg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_14, REG_0, REG_15, tjit->r14_off); + + if (flags & BPF_TRAMP_F_ORIG_STACK) { + /* + * The ftrace trampoline puts the return address (which is the + * address of the original function + S390X_PATCH_SIZE) into + * %r0; see ftrace_shared_hotpatch_trampoline_br and + * ftrace_init_nop() for details. 
+ */ + + /* lgr %r8,%r0 */ + EMIT4(0xb9040000, REG_8, REG_0); + } else { + /* %r8 = func_addr + S390X_PATCH_SIZE */ + load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); + } + + /* + * ip = func_addr; + * arg_cnt = m->nr_args; + */ + + if (flags & BPF_TRAMP_F_IP_ARG) { + /* %r0 = func_addr */ + load_imm64(jit, REG_0, (u64)func_addr); + /* stg %r0,ip_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->ip_off); + } + /* lghi %r0,nr_bpf_args */ + EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args); + /* stg %r0,arg_cnt_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->arg_cnt_off); + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * __bpf_tramp_enter(im); + */ + + /* %r1 = __bpf_tramp_enter */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + for (i = 0; i < fentry->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fentry->links[i], + flags & BPF_TRAMP_F_RET_FENTRY_RET)) + return -EINVAL; + + if (fmod_ret->nr_links) { + /* + * retval = 0; + */ + + /* xc retval_off(8,%r15),retval_off(%r15) */ + _EMIT6(0xd707f000 | tjit->retval_off, + 0xf000 | tjit->retval_off); + + for (i = 0; i < fmod_ret->nr_links; i++) { + if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true)) + return -EINVAL; + + /* + * if (retval) + * goto do_fexit; + */ + + /* ltg %r0,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0002, REG_0, REG_0, REG_15, + tjit->retval_off); + /* brcl 7,do_fexit */ + EMIT6_PCREL_RILC(0xc0040000, 7, tjit->do_fexit); + } + } + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * retval = func_addr(args); + */ + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* mvc stack_args_off(N,%r15),orig_stack_args_off(%r15) */ + if (nr_stack_args) + _EMIT6(0xd200f000 | + (nr_stack_args * sizeof(u64) - 1) << 16 | + tjit->stack_args_off, + 0xf000 | tjit->orig_stack_args_off); + /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off); + /* lgr %r1,%r8 */ + EMIT4(0xb9040000, REG_1, REG_8); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + + im->ip_after_call = jit->prg_buf + jit->prg; + + /* + * The following nop will be patched by bpf_tramp_image_put(). 
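The nop emitted below is the same mask trick used by bpf_arch_text_poke() earlier: brcl 0 falls through but already encodes the epilogue displacement, so bpf_tramp_image_put() can later arm it by rewriting the mask byte alone. In terms of the illustrative encoder sketched after the poke hunk (ip and ip_epilogue stand for the nop's own address and the epilogue label):

uint8_t as_nop[6], as_branch[6];

encode_brcl(as_nop,    0x0, ip, ip_epilogue);	/* emitted here */
encode_brcl(as_branch, 0xf, ip, ip_epilogue);	/* after patching */
/* Bytes 2..5 are identical; only the mask byte differs. */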
+ */ + + /* brcl 0,im->ip_epilogue */ + EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue); + } + + /* do_fexit: */ + tjit->do_fexit = jit->prg; + for (i = 0; i < fexit->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fexit->links[i], false)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = jit->prg_buf + jit->prg; + + /* + * __bpf_tramp_exit(im); + */ + + /* %r1 = __bpf_tramp_exit */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* lgr %r1,%r8 */ + if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + EMIT4(0xb9040000, REG_1, REG_8); + /* lmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* lg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_14, REG_0, REG_15, tjit->r14_off); + /* lg %r2,retval_off(%r15) */ + if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET)) + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15, + tjit->retval_off); + /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT), + 0xf000 | tjit->tccnt_off); + /* aghi %r15,stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); + /* Emit an expoline for the following indirect jump. */ + if (nospec_uses_trampoline()) + emit_expoline(jit); + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* br %r14 */ + _EMIT2(0x07fe); + else + /* br %r1 */ + _EMIT2(0x07f1); + + emit_r1_thunk(jit); + + return 0; +} + +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *orig_call) +{ + struct bpf_tramp_image im; + struct bpf_tramp_jit tjit; + int ret; + + memset(&tjit, 0, sizeof(tjit)); + + ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags, + tlinks, orig_call); + + return ret < 0 ? ret : tjit.common.prg; +} + +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, + void *image_end, const struct btf_func_model *m, + u32 flags, struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_jit tjit; + int ret; + + /* Compute offsets, check whether the code fits. */ + memset(&tjit, 0, sizeof(tjit)); + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + + if (ret < 0) + return ret; + if (tjit.common.prg > (char *)image_end - (char *)image) + /* + * Use the same error code as for exceeding + * BPF_MAX_TRAMP_LINKS. + */ + return -E2BIG; + + tjit.common.prg = 0; + tjit.common.prg_buf = image; + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + + return ret < 0 ? 
ret : tjit.common.prg; +} + +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile deleted file mode 100644 index 66c2dff74895..000000000000 --- a/arch/s390/numa/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-y += numa.o -obj-y += toptree.o -obj-$(CONFIG_NUMA_EMU) += mode_emu.o diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c deleted file mode 100644 index 72d742bb2d17..000000000000 --- a/arch/s390/numa/mode_emu.c +++ /dev/null @@ -1,577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * NUMA emulation (aka fake NUMA) distributes the available memory to nodes - * without using real topology information about the physical memory of the - * machine. - * - * It distributes the available CPUs to nodes while respecting the original - * machine topology information. This is done by trying to avoid to separate - * CPUs which reside on the same book or even on the same MC. - * - * Because the current Linux scheduler code requires a stable cpu to node - * mapping, cores are pinned to nodes when the first CPU thread is set online. - * - * Copyright IBM Corp. 2015 - */ - -#define KMSG_COMPONENT "numa_emu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/cpumask.h> -#include <linux/memblock.h> -#include <linux/node.h> -#include <linux/memory.h> -#include <linux/slab.h> -#include <asm/smp.h> -#include <asm/topology.h> -#include "numa_mode.h" -#include "toptree.h" - -/* Distances between the different system components */ -#define DIST_EMPTY 0 -#define DIST_CORE 1 -#define DIST_MC 2 -#define DIST_BOOK 3 -#define DIST_DRAWER 4 -#define DIST_MAX 5 - -/* Node distance reported to common code */ -#define EMU_NODE_DIST 10 - -/* Node ID for free (not yet pinned) cores */ -#define NODE_ID_FREE -1 - -/* Different levels of toptree */ -enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY}; - -/* The two toptree IDs */ -enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA}; - -/* Number of NUMA nodes */ -static int emu_nodes = 1; -/* NUMA stripe size */ -static unsigned long emu_size; - -/* - * Node to core pinning information updates are protected by - * "sched_domains_mutex". 
- */ -static struct { - s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */ - int total; /* Total number of pinned cores */ - int per_node_target; /* Cores per node without extra cores */ - int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */ -} *emu_cores; - -/* - * Pin a core to a node - */ -static void pin_core_to_node(int core_id, int node_id) -{ - if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) { - emu_cores->per_node[node_id]++; - emu_cores->to_node_id[core_id] = node_id; - emu_cores->total++; - } else { - WARN_ON(emu_cores->to_node_id[core_id] != node_id); - } -} - -/* - * Number of pinned cores of a node - */ -static int cores_pinned(struct toptree *node) -{ - return emu_cores->per_node[node->id]; -} - -/* - * ID of the node where the core is pinned (or NODE_ID_FREE) - */ -static int core_pinned_to_node_id(struct toptree *core) -{ - return emu_cores->to_node_id[core->id]; -} - -/* - * Number of cores in the tree that are not yet pinned - */ -static int cores_free(struct toptree *tree) -{ - struct toptree *core; - int count = 0; - - toptree_for_each(core, tree, CORE) { - if (core_pinned_to_node_id(core) == NODE_ID_FREE) - count++; - } - return count; -} - -/* - * Return node of core - */ -static struct toptree *core_node(struct toptree *core) -{ - return core->parent->parent->parent->parent; -} - -/* - * Return drawer of core - */ -static struct toptree *core_drawer(struct toptree *core) -{ - return core->parent->parent->parent; -} - -/* - * Return book of core - */ -static struct toptree *core_book(struct toptree *core) -{ - return core->parent->parent; -} - -/* - * Return mc of core - */ -static struct toptree *core_mc(struct toptree *core) -{ - return core->parent; -} - -/* - * Distance between two cores - */ -static int dist_core_to_core(struct toptree *core1, struct toptree *core2) -{ - if (core_drawer(core1)->id != core_drawer(core2)->id) - return DIST_DRAWER; - if (core_book(core1)->id != core_book(core2)->id) - return DIST_BOOK; - if (core_mc(core1)->id != core_mc(core2)->id) - return DIST_MC; - /* Same core or sibling on same MC */ - return DIST_CORE; -} - -/* - * Distance of a node to a core - */ -static int dist_node_to_core(struct toptree *node, struct toptree *core) -{ - struct toptree *core_node; - int dist_min = DIST_MAX; - - toptree_for_each(core_node, node, CORE) - dist_min = min(dist_min, dist_core_to_core(core_node, core)); - return dist_min == DIST_MAX ? DIST_EMPTY : dist_min; -} - -/* - * Unify will delete empty nodes, therefore recreate nodes. - */ -static void toptree_unify_tree(struct toptree *tree) -{ - int nid; - - toptree_unify(tree); - for (nid = 0; nid < emu_nodes; nid++) - toptree_get_child(tree, nid); -} - -/* - * Find the best/nearest node for a given core and ensure that no node - * gets more than "emu_cores->per_node_target + extra" cores. 
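The "extra" slack explains why the caller of this selection (toptree_to_numa(), further down) runs the placement twice, first with extra = 0 and then with extra = 1: round one fills every node up to the even per-node target, round two distributes the leftover cores, at most one more per node. The arithmetic as a standalone example:

#include <stdio.h>

int main(void)
{
	int cores = 10, nodes = 4;
	int target = cores / nodes;		/* per_node_target = 2   */
	int leftover = cores - target * nodes;	/* 2 cores for round two */

	printf("round 1 cap %d, round 2 cap %d, leftover %d\n",
	       target, target + 1, leftover);
	return 0;
}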
- */ -static struct toptree *node_for_core(struct toptree *numa, struct toptree *core, - int extra) -{ - struct toptree *node, *node_best = NULL; - int dist_cur, dist_best, cores_target; - - cores_target = emu_cores->per_node_target + extra; - dist_best = DIST_MAX; - node_best = NULL; - toptree_for_each(node, numa, NODE) { - /* Already pinned cores must use their nodes */ - if (core_pinned_to_node_id(core) == node->id) { - node_best = node; - break; - } - /* Skip nodes that already have enough cores */ - if (cores_pinned(node) >= cores_target) - continue; - dist_cur = dist_node_to_core(node, core); - if (dist_cur < dist_best) { - dist_best = dist_cur; - node_best = node; - } - } - return node_best; -} - -/* - * Find the best node for each core with respect to "extra" core count - */ -static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys, - int extra) -{ - struct toptree *node, *core, *tmp; - - toptree_for_each_safe(core, tmp, phys, CORE) { - node = node_for_core(numa, core, extra); - if (!node) - return; - toptree_move(core, node); - pin_core_to_node(core->id, node->id); - } -} - -/* - * Move structures of given level to specified NUMA node - */ -static void move_level_to_numa_node(struct toptree *node, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - int cores_free, cores_target = emu_cores->per_node_target; - struct toptree *cur, *tmp; - - toptree_for_each_safe(cur, tmp, phys, level) { - cores_free = cores_target - toptree_count(node, CORE); - if (perfect) { - if (cores_free == toptree_count(cur, CORE)) - toptree_move(cur, node); - } else { - if (cores_free >= toptree_count(cur, CORE)) - toptree_move(cur, node); - } - } -} - -/* - * Move structures of a given level to NUMA nodes. If "perfect" is specified - * move only perfectly fitting structures. Otherwise move also smaller - * than needed structures. 
- */ -static void move_level_to_numa(struct toptree *numa, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - struct toptree *node; - - toptree_for_each(node, numa, NODE) - move_level_to_numa_node(node, phys, level, perfect); -} - -/* - * For the first run try to move the big structures - */ -static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys) -{ - struct toptree *core; - - /* Always try to move perfectly fitting structures first */ - move_level_to_numa(numa, phys, DRAWER, true); - move_level_to_numa(numa, phys, DRAWER, false); - move_level_to_numa(numa, phys, BOOK, true); - move_level_to_numa(numa, phys, BOOK, false); - move_level_to_numa(numa, phys, MC, true); - move_level_to_numa(numa, phys, MC, false); - /* Now pin all the moved cores */ - toptree_for_each(core, numa, CORE) - pin_core_to_node(core->id, core_node(core)->id); -} - -/* - * Allocate new topology and create required nodes - */ -static struct toptree *toptree_new(int id, int nodes) -{ - struct toptree *tree; - int nid; - - tree = toptree_alloc(TOPOLOGY, id); - if (!tree) - goto fail; - for (nid = 0; nid < nodes; nid++) { - if (!toptree_get_child(tree, nid)) - goto fail; - } - return tree; -fail: - panic("NUMA emulation could not allocate topology"); -} - -/* - * Allocate and initialize core to node mapping - */ -static void __ref create_core_to_node_map(void) -{ - int i; - - emu_cores = memblock_alloc(sizeof(*emu_cores), 8); - if (!emu_cores) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*emu_cores), 8); - for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) - emu_cores->to_node_id[i] = NODE_ID_FREE; -} - -/* - * Move cores from physical topology into NUMA target topology - * and try to keep as much of the physical topology as possible. 
- */ -static struct toptree *toptree_to_numa(struct toptree *phys) -{ - static int first = 1; - struct toptree *numa; - int cores_total; - - cores_total = emu_cores->total + cores_free(phys); - emu_cores->per_node_target = cores_total / emu_nodes; - numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes); - if (first) { - toptree_to_numa_first(numa, phys); - first = 0; - } - toptree_to_numa_single(numa, phys, 0); - toptree_to_numa_single(numa, phys, 1); - toptree_unify_tree(numa); - - WARN_ON(cpumask_weight(&phys->mask)); - return numa; -} - -/* - * Create a toptree out of the physical topology that we got from the hypervisor - */ -static struct toptree *toptree_from_topology(void) -{ - struct toptree *phys, *node, *drawer, *book, *mc, *core; - struct cpu_topology_s390 *top; - int cpu; - - phys = toptree_new(TOPTREE_ID_PHYS, 1); - - for_each_cpu(cpu, &cpus_with_topology) { - top = &cpu_topology[cpu]; - node = toptree_get_child(phys, 0); - drawer = toptree_get_child(node, top->drawer_id); - book = toptree_get_child(drawer, top->book_id); - mc = toptree_get_child(book, top->socket_id); - core = toptree_get_child(mc, smp_get_base_cpu(cpu)); - if (!drawer || !book || !mc || !core) - panic("NUMA emulation could not allocate memory"); - cpumask_set_cpu(cpu, &core->mask); - toptree_update_mask(mc); - } - return phys; -} - -/* - * Add toptree core to topology and create correct CPU masks - */ -static void topology_add_core(struct toptree *core) -{ - struct cpu_topology_s390 *top; - int cpu; - - for_each_cpu(cpu, &core->mask) { - top = &cpu_topology[cpu]; - cpumask_copy(&top->thread_mask, &core->mask); - cpumask_copy(&top->core_mask, &core_mc(core)->mask); - cpumask_copy(&top->book_mask, &core_book(core)->mask); - cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask); - cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]); - top->node_id = core_node(core)->id; - } -} - -/* - * Apply toptree to topology and create CPU masks - */ -static void toptree_to_topology(struct toptree *numa) -{ - struct toptree *core; - int i; - - /* Clear all node masks */ - for (i = 0; i < MAX_NUMNODES; i++) - cpumask_clear(&node_to_cpumask_map[i]); - - /* Rebuild all masks */ - toptree_for_each(core, numa, CORE) - topology_add_core(core); -} - -/* - * Show the node to core mapping - */ -static void print_node_to_core_map(void) -{ - int nid, cid; - - if (!numa_debug_enabled) - return; - printk(KERN_DEBUG "NUMA node to core mapping\n"); - for (nid = 0; nid < emu_nodes; nid++) { - printk(KERN_DEBUG " node %3d: ", nid); - for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) { - if (emu_cores->to_node_id[cid] == nid) - printk(KERN_CONT "%d ", cid); - } - printk(KERN_CONT "\n"); - } -} - -static void pin_all_possible_cpus(void) -{ - int core_id, node_id, cpu; - static int initialized; - - if (initialized) - return; - print_node_to_core_map(); - node_id = 0; - for_each_possible_cpu(cpu) { - core_id = smp_get_base_cpu(cpu); - if (emu_cores->to_node_id[core_id] != NODE_ID_FREE) - continue; - pin_core_to_node(core_id, node_id); - cpu_topology[cpu].node_id = node_id; - node_id = (node_id + 1) % emu_nodes; - } - print_node_to_core_map(); - initialized = 1; -} - -/* - * Transfer physical topology into a NUMA topology and modify CPU masks - * according to the NUMA topology. - * - * Must be called with "sched_domains_mutex" lock held. 
- */ -static void emu_update_cpu_topology(void) -{ - struct toptree *phys, *numa; - - if (emu_cores == NULL) - create_core_to_node_map(); - phys = toptree_from_topology(); - numa = toptree_to_numa(phys); - toptree_free(phys); - toptree_to_topology(numa); - toptree_free(numa); - pin_all_possible_cpus(); -} - -/* - * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum - * alignment (needed for memory hotplug). - */ -static unsigned long emu_setup_size_adjust(unsigned long size) -{ - unsigned long size_new; - - size = size ? : CONFIG_EMU_SIZE; - size_new = roundup(size, memory_block_size_bytes()); - if (size_new == size) - return size; - pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n", - size >> 20, size_new >> 20); - return size_new; -} - -/* - * If we have not enough memory for the specified nodes, reduce the node count. - */ -static int emu_setup_nodes_adjust(int nodes) -{ - int nodes_max; - - nodes_max = memblock.memory.total_size / emu_size; - nodes_max = max(nodes_max, 1); - if (nodes_max >= nodes) - return nodes; - pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes); - return nodes_max; -} - -/* - * Early emu setup - */ -static void emu_setup(void) -{ - int nid; - - emu_size = emu_setup_size_adjust(emu_size); - emu_nodes = emu_setup_nodes_adjust(emu_nodes); - for (nid = 0; nid < emu_nodes; nid++) - node_set(nid, node_possible_map); - pr_info("Creating %d nodes with memory stripe size %ld MB\n", - emu_nodes, emu_size >> 20); -} - -/* - * Return node id for given page number - */ -static int emu_pfn_to_nid(unsigned long pfn) -{ - return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes; -} - -/* - * Return stripe size - */ -static unsigned long emu_align(void) -{ - return emu_size; -} - -/* - * Return distance between two nodes - */ -static int emu_distance(int node1, int node2) -{ - return (node1 != node2) * EMU_NODE_DIST; -} - -/* - * Define callbacks for generic s390 NUMA infrastructure - */ -const struct numa_mode numa_mode_emu = { - .name = "emu", - .setup = emu_setup, - .update_cpu_topology = emu_update_cpu_topology, - .__pfn_to_nid = emu_pfn_to_nid, - .align = emu_align, - .distance = emu_distance, -}; - -/* - * Kernel parameter: emu_nodes=<n> - */ -static int __init early_parse_emu_nodes(char *p) -{ - int count; - - if (!p || kstrtoint(p, 0, &count) != 0 || count <= 0) - return 0; - emu_nodes = min(count, MAX_NUMNODES); - return 0; -} -early_param("emu_nodes", early_parse_emu_nodes); - -/* - * Kernel parameter: emu_size=[<n>[k|M|G|T]] - */ -static int __init early_parse_emu_size(char *p) -{ - if (p) - emu_size = memparse(p, NULL); - return 0; -} -early_param("emu_size", early_parse_emu_size); diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c deleted file mode 100644 index d2910fa834c8..000000000000 --- a/arch/s390/numa/numa.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * Implement NUMA core code. - * - * Copyright IBM Corp. 
2015 - */ - -#define KMSG_COMPONENT "numa" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/mmzone.h> -#include <linux/cpumask.h> -#include <linux/memblock.h> -#include <linux/slab.h> -#include <linux/node.h> - -#include <asm/numa.h> -#include "numa_mode.h" - -pg_data_t *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - -cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -EXPORT_SYMBOL(node_to_cpumask_map); - -static void plain_setup(void) -{ - node_set(0, node_possible_map); -} - -const struct numa_mode numa_mode_plain = { - .name = "plain", - .setup = plain_setup, -}; - -static const struct numa_mode *mode = &numa_mode_plain; - -int numa_pfn_to_nid(unsigned long pfn) -{ - return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; -} - -void numa_update_cpu_topology(void) -{ - if (mode->update_cpu_topology) - mode->update_cpu_topology(); -} - -int __node_distance(int a, int b) -{ - return mode->distance ? mode->distance(a, b) : 0; -} -EXPORT_SYMBOL(__node_distance); - -int numa_debug_enabled; - -/* - * numa_setup_memory() - Assign bootmem to nodes - * - * The memory is first added to memblock without any respect to nodes. - * This is fixed before remaining memblock memory is handed over to the - * buddy allocator. - * An important side effect is that large bootmem allocations might easily - * cross node boundaries, which can be needed for large allocations with - * smaller memory stripes in each node (i.e. when using NUMA emulation). - * - * Memory defines nodes: - * Therefore this routine also sets the nodes online with memory. - */ -static void __init numa_setup_memory(void) -{ - unsigned long cur_base, align, end_of_dram; - int nid = 0; - - end_of_dram = memblock_end_of_DRAM(); - align = mode->align ? mode->align() : ULONG_MAX; - - /* - * Step through all available memory and assign it to the nodes - * indicated by the mode implementation. - * All nodes which are seen here will be set online. - */ - cur_base = 0; - do { - nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); - node_set_online(nid); - memblock_set_node(cur_base, align, &memblock.memory, nid); - cur_base += align; - } while (cur_base < end_of_dram); - - /* Allocate and fill out node_data */ - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - unsigned long t_start, t_end; - int i; - - start_pfn = ULONG_MAX; - end_pfn = 0; - for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { - if (t_start < start_pfn) - start_pfn = t_start; - if (t_end > end_pfn) - end_pfn = t_end; - } - NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; - NODE_DATA(nid)->node_id = nid; - } -} - -/* - * numa_setup() - Earliest initialization - * - * Assign the mode and call the mode's setup routine. - */ -void __init numa_setup(void) -{ - pr_info("NUMA mode: %s\n", mode->name); - nodes_clear(node_possible_map); - /* Initially attach all possible CPUs to node 0. */ - cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask); - if (mode->setup) - mode->setup(); - numa_setup_memory(); - memblock_dump_all(); -} - -/* - * numa_init_late() - Initialization initcall - * - * Register NUMA nodes. 
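numa_setup_memory() above assigns memory to nodes in mode->align() sized steps; for the emulation mode being removed, numa_pfn_to_nid() resolved to (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes, a plain round-robin stripe. A standalone rerun with example parameters:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long emu_size = 512UL << 20;	/* one 512 MiB stripe */
	unsigned long stripe_pfns = emu_size >> PAGE_SHIFT;
	int emu_nodes = 4;

	for (unsigned long addr = 0; addr < 8 * emu_size; addr += emu_size) {
		unsigned long pfn = addr >> PAGE_SHIFT;

		printf("%#10lx -> node %lu\n", addr,
		       (pfn / stripe_pfns) % emu_nodes);
	}
	return 0;
}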
- */ -static int __init numa_init_late(void) -{ - int nid; - - for_each_online_node(nid) - register_one_node(nid); - return 0; -} -arch_initcall(numa_init_late); - -static int __init parse_debug(char *parm) -{ - numa_debug_enabled = 1; - return 0; -} -early_param("numa_debug", parse_debug); - -static int __init parse_numa(char *parm) -{ - if (!parm) - return 1; - if (strcmp(parm, numa_mode_plain.name) == 0) - mode = &numa_mode_plain; -#ifdef CONFIG_NUMA_EMU - if (strcmp(parm, numa_mode_emu.name) == 0) - mode = &numa_mode_emu; -#endif - return 0; -} -early_param("numa", parse_numa); diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h deleted file mode 100644 index dfd3e2784081..000000000000 --- a/arch/s390/numa/numa_mode.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * Define declarations used for communication between NUMA mode - * implementations and NUMA core functionality. - * - * Copyright IBM Corp. 2015 - */ -#ifndef __S390_NUMA_MODE_H -#define __S390_NUMA_MODE_H - -struct numa_mode { - char *name; /* Name of mode */ - void (*setup)(void); /* Initizalize mode */ - void (*update_cpu_topology)(void); /* Called by topology code */ - int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */ - unsigned long (*align)(void); /* Minimum node alignment */ - int (*distance)(int a, int b); /* Distance between two nodes */ -}; - -extern const struct numa_mode numa_mode_plain; -extern const struct numa_mode numa_mode_emu; - -#endif /* __S390_NUMA_MODE_H */ diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c deleted file mode 100644 index 71a608cd4f61..000000000000 --- a/arch/s390/numa/toptree.c +++ /dev/null @@ -1,351 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 2015 - */ - -#include <linux/kernel.h> -#include <linux/memblock.h> -#include <linux/cpumask.h> -#include <linux/list.h> -#include <linux/list_sort.h> -#include <linux/slab.h> -#include <asm/numa.h> - -#include "toptree.h" - -/** - * toptree_alloc - Allocate and initialize a new tree node. - * @level: The node's vertical level; level 0 contains the leaves. - * @id: ID number, explicitly not unique beyond scope of node's siblings - * - * Allocate a new tree node and initialize it. - * - * RETURNS: - * Pointer to the new tree node or NULL on error - */ -struct toptree __ref *toptree_alloc(int level, int id) -{ - struct toptree *res; - - if (slab_is_available()) - res = kzalloc(sizeof(*res), GFP_KERNEL); - else - res = memblock_alloc(sizeof(*res), 8); - if (!res) - return res; - - INIT_LIST_HEAD(&res->children); - INIT_LIST_HEAD(&res->sibling); - cpumask_clear(&res->mask); - res->level = level; - res->id = id; - return res; -} - -/** - * toptree_remove - Remove a tree node from a tree - * @cand: Pointer to the node to remove - * - * The node is detached from its parent node. The parent node's - * masks will be updated to reflect the loss of the child. - */ -static void toptree_remove(struct toptree *cand) -{ - struct toptree *oldparent; - - list_del_init(&cand->sibling); - oldparent = cand->parent; - cand->parent = NULL; - toptree_update_mask(oldparent); -} - -/** - * toptree_free - discard a tree node - * @cand: Pointer to the tree node to discard - * - * Checks if @cand is attached to a parent node. Detaches it - * cleanly using toptree_remove. Possible children are freed - * recursively. In the end @cand itself is freed. 
- */ -void __ref toptree_free(struct toptree *cand) -{ - struct toptree *child, *tmp; - - if (cand->parent) - toptree_remove(cand); - toptree_for_each_child_safe(child, tmp, cand) - toptree_free(child); - if (slab_is_available()) - kfree(cand); - else - memblock_free_early((unsigned long)cand, sizeof(*cand)); -} - -/** - * toptree_update_mask - Update node bitmasks - * @cand: Pointer to a tree node - * - * The node's cpumask will be updated by combining all children's - * masks. Then toptree_update_mask is called recursively for the - * parent if applicable. - * - * NOTE: - * This must not be called on leaves. If called on a leaf, its - * CPU mask is cleared and lost. - */ -void toptree_update_mask(struct toptree *cand) -{ - struct toptree *child; - - cpumask_clear(&cand->mask); - list_for_each_entry(child, &cand->children, sibling) - cpumask_or(&cand->mask, &cand->mask, &child->mask); - if (cand->parent) - toptree_update_mask(cand->parent); -} - -/** - * toptree_insert - Insert a tree node into tree - * @cand: Pointer to the node to insert - * @target: Pointer to the node to which @cand will added as a child - * - * Insert a tree node into a tree. Masks will be updated automatically. - * - * RETURNS: - * 0 on success, -1 if NULL is passed as argument or the node levels - * don't fit. - */ -static int toptree_insert(struct toptree *cand, struct toptree *target) -{ - if (!cand || !target) - return -1; - if (target->level != (cand->level + 1)) - return -1; - list_add_tail(&cand->sibling, &target->children); - cand->parent = target; - toptree_update_mask(target); - return 0; -} - -/** - * toptree_move_children - Move all child nodes of a node to a new place - * @cand: Pointer to the node whose children are to be moved - * @target: Pointer to the node to which @cand's children will be attached - * - * Take all child nodes of @cand and move them using toptree_move. - */ -static void toptree_move_children(struct toptree *cand, struct toptree *target) -{ - struct toptree *child, *tmp; - - toptree_for_each_child_safe(child, tmp, cand) - toptree_move(child, target); -} - -/** - * toptree_unify - Merge children with same ID - * @cand: Pointer to node whose direct children should be made unique - * - * When mangling the tree it is possible that a node has two or more children - * which have the same ID. This routine merges these children into one and - * moves all children of the merged nodes into the unified node. - */ -void toptree_unify(struct toptree *cand) -{ - struct toptree *child, *tmp, *cand_copy; - - /* Threads cannot be split, cores are not split */ - if (cand->level < 2) - return; - - cand_copy = toptree_alloc(cand->level, 0); - toptree_for_each_child_safe(child, tmp, cand) { - struct toptree *tmpchild; - - if (!cpumask_empty(&child->mask)) { - tmpchild = toptree_get_child(cand_copy, child->id); - toptree_move_children(child, tmpchild); - } - toptree_free(child); - } - toptree_move_children(cand_copy, cand); - toptree_free(cand_copy); - - toptree_for_each_child(child, cand) - toptree_unify(child); -} - -/** - * toptree_move - Move a node to another context - * @cand: Pointer to the node to move - * @target: Pointer to the node where @cand should go - * - * In the easiest case @cand is exactly on the level below @target - * and will be immediately moved to the target. - * - * If @target's level is not the direct parent level of @cand, - * nodes for the missing levels are created and put between - * @cand and @target. The "stacking" nodes' IDs are taken from - * @cand's parents. 
- * - * After this it is likely to have redundant nodes in the tree - * which are addressed by means of toptree_unify. - */ -void toptree_move(struct toptree *cand, struct toptree *target) -{ - struct toptree *stack_target, *real_insert_point, *ptr, *tmp; - - if (cand->level + 1 == target->level) { - toptree_remove(cand); - toptree_insert(cand, target); - return; - } - - real_insert_point = NULL; - ptr = cand; - stack_target = NULL; - - do { - tmp = stack_target; - stack_target = toptree_alloc(ptr->level + 1, - ptr->parent->id); - toptree_insert(tmp, stack_target); - if (!real_insert_point) - real_insert_point = stack_target; - ptr = ptr->parent; - } while (stack_target->level < (target->level - 1)); - - toptree_remove(cand); - toptree_insert(cand, real_insert_point); - toptree_insert(stack_target, target); -} - -/** - * toptree_get_child - Access a tree node's child by its ID - * @cand: Pointer to tree node whose child is to access - * @id: The desired child's ID - * - * @cand's children are searched for a child with matching ID. - * If no match can be found, a new child with the desired ID - * is created and returned. - */ -struct toptree *toptree_get_child(struct toptree *cand, int id) -{ - struct toptree *child; - - toptree_for_each_child(child, cand) - if (child->id == id) - return child; - child = toptree_alloc(cand->level-1, id); - toptree_insert(child, cand); - return child; -} - -/** - * toptree_first - Find the first descendant on specified level - * @context: Pointer to tree node whose descendants are to be used - * @level: The level of interest - * - * RETURNS: - * @context's first descendant on the specified level, or NULL - * if there is no matching descendant - */ -struct toptree *toptree_first(struct toptree *context, int level) -{ - struct toptree *child, *tmp; - - if (context->level == level) - return context; - - if (!list_empty(&context->children)) { - list_for_each_entry(child, &context->children, sibling) { - tmp = toptree_first(child, level); - if (tmp) - return tmp; - } - } - return NULL; -} - -/** - * toptree_next_sibling - Return next sibling - * @cur: Pointer to a tree node - * - * RETURNS: - * If @cur has a parent and is not the last in the parent's children list, - * the next sibling is returned. Or NULL when there are no siblings left. - */ -static struct toptree *toptree_next_sibling(struct toptree *cur) -{ - if (cur->parent == NULL) - return NULL; - - if (cur == list_last_entry(&cur->parent->children, - struct toptree, sibling)) - return NULL; - return (struct toptree *) list_next_entry(cur, sibling); -} - -/** - * toptree_next - Tree traversal function - * @cur: Pointer to current element - * @context: Pointer to the root node of the tree or subtree to - * be traversed. - * @level: The level of interest. - * - * RETURNS: - * Pointer to the next node on level @level - * or NULL when there is no next node. 
- */ -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level) -{ - struct toptree *cur_context, *tmp; - - if (!cur) - return NULL; - - if (context->level == level) - return NULL; - - tmp = toptree_next_sibling(cur); - if (tmp != NULL) - return tmp; - - cur_context = cur; - while (cur_context->level < context->level - 1) { - /* Step up */ - cur_context = cur_context->parent; - /* Step aside */ - tmp = toptree_next_sibling(cur_context); - if (tmp != NULL) { - /* Step down */ - tmp = toptree_first(tmp, level); - if (tmp != NULL) - return tmp; - } - } - return NULL; -} - -/** - * toptree_count - Count descendants on specified level - * @context: Pointer to node whose descendants are to be considered - * @level: Only descendants on the specified level will be counted - * - * RETURNS: - * Number of descendants on the specified level - */ -int toptree_count(struct toptree *context, int level) -{ - struct toptree *cur; - int cnt = 0; - - toptree_for_each(cur, context, level) - cnt++; - return cnt; -} diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h deleted file mode 100644 index 5246371ec713..000000000000 --- a/arch/s390/numa/toptree.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 2015 - */ -#ifndef S390_TOPTREE_H -#define S390_TOPTREE_H - -#include <linux/cpumask.h> -#include <linux/list.h> - -struct toptree { - int level; - int id; - cpumask_t mask; - struct toptree *parent; - struct list_head sibling; - struct list_head children; -}; - -struct toptree *toptree_alloc(int level, int id); -void toptree_free(struct toptree *cand); -void toptree_update_mask(struct toptree *cand); -void toptree_unify(struct toptree *cand); -struct toptree *toptree_get_child(struct toptree *cand, int id); -void toptree_move(struct toptree *cand, struct toptree *target); -int toptree_count(struct toptree *context, int level); - -struct toptree *toptree_first(struct toptree *context, int level); -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level); - -#define toptree_for_each_child(child, ptree) \ - list_for_each_entry(child, &ptree->children, sibling) - -#define toptree_for_each_child_safe(child, ptmp, ptree) \ - list_for_each_entry_safe(child, ptmp, &ptree->children, sibling) - -#define toptree_is_last(ptree) \ - ((ptree->parent == NULL) || \ - (ptree->parent->children.prev == &ptree->sibling)) - -#define toptree_for_each(ptree, cont, ttype) \ - for (ptree = toptree_first(cont, ttype); \ - ptree != NULL; \ - ptree = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_safe(ptree, tmp, cont, ttype) \ - for (ptree = toptree_first(cont, ttype), \ - tmp = toptree_next(ptree, cont, ttype); \ - ptree != NULL; \ - ptree = tmp, \ - tmp = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_sibling(ptree, start) \ - toptree_for_each(ptree, start->parent, start->level) - -#endif /* S390_TOPTREE_H */ diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile deleted file mode 100644 index 36261f9d360b..000000000000 --- a/arch/s390/oprofile/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_OPROFILE) += oprofile.o - -DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ - oprof.o cpu_buffer.o buffer_sync.o \ - event_buffer.o oprofile_files.o \ - oprofilefs.o oprofile_stats.o \ - timer_int.o ) - -oprofile-y := $(DRIVER_OBJS) init.o diff 
--git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c deleted file mode 100644 index 7441857df51b..000000000000 --- a/arch/s390/oprofile/init.c +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * S390 Version - * Copyright IBM Corp. 2002, 2011 - * Author(s): Thomas Spatzier (tspat@de.ibm.com) - * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com) - * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com) - * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com) - * - * @remark Copyright 2002-2011 OProfile authors - */ - -#include <linux/oprofile.h> -#include <linux/init.h> -#include <asm/processor.h> -#include <asm/unwind.h> - -static void s390_backtrace(struct pt_regs *regs, unsigned int depth) -{ - struct unwind_state state; - - unwind_for_each_frame(&state, current, regs, 0) { - if (depth-- == 0) - break; - oprofile_add_trace(state.ip); - } -} - -int __init oprofile_arch_init(struct oprofile_operations *ops) -{ - ops->backtrace = s390_backtrace; - return 0; -} - -void oprofile_arch_exit(void) -{ -} diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 748626a33028..0547a10406e7 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -3,5 +3,7 @@ # Makefile for the s390 PCI subsystem. # -obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ - pci_event.o pci_debug.o pci_insn.o pci_mmio.o +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_clp.o pci_sysfs.o \ + pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ + pci_bus.o pci_kvm_hook.o +obj-$(CONFIG_PCI_IOV) += pci_iov.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 8e872951c07b..676ac74026a8 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -36,17 +36,22 @@ #include <asm/pci_clp.h> #include <asm/pci_dma.h> +#include "pci_bus.h" +#include "pci_iov.h" + /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); -static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); +static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); static DEFINE_SPINLOCK(zpci_domain_lock); #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ ZPCI_IOMAP_MAX_ENTRIES) +unsigned int s390_pci_no_rid; + static DEFINE_SPINLOCK(zpci_iomap_lock); static unsigned long *zpci_iomap_bitmap; struct zpci_iomap_entry *zpci_iomap_start; @@ -56,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio); static struct kmem_cache *zdev_fmb_cache; +/* AEN structures that must be preserved over KVM module re-insertion */ +union zpci_sic_iib *zpci_aipb; +EXPORT_SYMBOL_GPL(zpci_aipb); +struct airq_iv *zpci_aif_sbv; +EXPORT_SYMBOL_GPL(zpci_aif_sbv); + struct zpci_dev *get_zdev_by_fid(u32 fid) { struct zpci_dev *tmp, *zdev = NULL; @@ -64,6 +75,7 @@ struct zpci_dev *get_zdev_by_fid(u32 fid) list_for_each_entry(tmp, &zpci_list, entry) { if (tmp->fid == fid) { zdev = tmp; + zpci_zdev_get(zdev); break; } } @@ -87,17 +99,12 @@ void zpci_remove_reserved_devices(void) spin_unlock(&zpci_list_lock); list_for_each_entry_safe(zdev, tmp, &remove, entry) - zpci_remove_device(zdev); -} - -static struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus) -{ - return (bus && bus->sysdata) ? 
(struct zpci_dev *) bus->sysdata : NULL; + zpci_device_reserved(zdev); } int pci_domain_nr(struct pci_bus *bus) { - return ((struct zpci_dev *) bus->sysdata)->domain; + return ((struct zpci_bus *) bus->sysdata)->domain_nr; } EXPORT_SYMBOL_GPL(pci_domain_nr); @@ -109,18 +116,27 @@ EXPORT_SYMBOL_GPL(pci_proc_domain); /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, - u64 base, u64 limit, u64 iota) + u64 base, u64 limit, u64 iota, u8 *status) { u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT); struct zpci_fib fib = {0}; - u8 status; + u8 cc; WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; - fib.pal = limit; + /* Work around off by one in ISM virt device */ + if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base) + fib.pal = limit + (1 << 12); + else + fib.pal = limit; fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; - return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, status); + if (cc) + zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, *status); + return cc; } +EXPORT_SYMBOL_GPL(zpci_register_ioat); /* Modify PCI: Unregister I/O address translation parameters */ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) @@ -129,16 +145,19 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); - if (cc == 3) /* Function already gone. */ - cc = 0; - return cc ? -EIO : 0; + if (cc) + zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); + return cc; } /* Modify PCI: Set PCI function measurement parameters */ int zpci_fmb_enable_device(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_iommu_ctrs *ctrs; struct zpci_fib fib = {0}; u8 cc, status; @@ -151,11 +170,18 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) WARN_ON((u64) zdev->fmb & 0xf); /* reset software counters */ - atomic64_set(&zdev->allocated_pages, 0); - atomic64_set(&zdev->mapped_pages, 0); - atomic64_set(&zdev->unmapped_pages, 0); + ctrs = zpci_get_iommu_ctrs(zdev); + if (ctrs) { + atomic64_set(&ctrs->mapped_pages, 0); + atomic64_set(&ctrs->unmapped_pages, 0); + atomic64_set(&ctrs->global_rpcits, 0); + atomic64_set(&ctrs->sync_map_rpcits, 0); + atomic64_set(&ctrs->sync_rpcits, 0); + } + fib.fmb_addr = virt_to_phys(zdev->fmb); + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc) { kmem_cache_free(zdev_fmb_cache, zdev->fmb); @@ -174,6 +200,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) if (!zdev->fmb) return -EINVAL; + fib.gd = zdev->gisa; + /* Function measurement is disabled if fmb address is zero */ cc = zpci_mod_fc(req, &fib, &status); if (cc == 3) /* Function already gone. */ @@ -227,38 +255,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -void __iomem *ioremap(unsigned long ioaddr, unsigned long size) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - struct vm_struct *area; - unsigned long offset; - - if (!size) - return NULL; - + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. 
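+ * + * A minimal sketch of the non-MIO consequence (illustrative; "res" is a + * hypothetical MMIO resource): + * + *	void __iomem *p = ioremap(res->start, resource_size(res)); + * + * simply yields p == (void __iomem *)res->start, with no page tables + * being set up.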
+ */ if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) ioaddr; + return (void __iomem *)phys_addr; - offset = ioaddr & ~PAGE_MASK; - ioaddr &= PAGE_MASK; - size = PAGE_ALIGN(size + offset); - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - if (ioremap_page_range((unsigned long) area->addr, - (unsigned long) area->addr + size, - ioaddr, PAGE_KERNEL)) { - vunmap(area->addr); - return NULL; - } - return (void __iomem *) ((unsigned long) area->addr + offset); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } -EXPORT_SYMBOL(ioremap); +EXPORT_SYMBOL(ioremap_prot); void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) - vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); @@ -372,29 +387,17 @@ EXPORT_SYMBOL(pci_iounmap); static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); - int ret; - - if (!zdev || devfn != ZPCI_DEVFN) - ret = -ENODEV; - else - ret = zpci_cfg_load(zdev, where, val, size); + struct zpci_dev *zdev = zdev_from_bus(bus, devfn); - return ret; + return (zdev) ? zpci_cfg_load(zdev, where, val, size) : -ENODEV; } static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); - int ret; + struct zpci_dev *zdev = zdev_from_bus(bus, devfn); - if (!zdev || devfn != ZPCI_DEVFN) - ret = -ENODEV; - else - ret = zpci_cfg_store(zdev, where, val, size); - - return ret; + return (zdev) ? zpci_cfg_store(zdev, where, val, size) : -ENODEV; } static struct pci_ops pci_root_ops = { @@ -402,15 +405,6 @@ static struct pci_ops pci_root_ops = { .write = pci_write, }; -#ifdef CONFIG_PCI_IOV -static struct resource iov_res = { - .name = "PCI IOV res", - .start = 0, - .end = -1, - .flags = IORESOURCE_MEM, -}; -#endif - static void zpci_map_resources(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); @@ -424,23 +418,14 @@ static void zpci_map_resources(struct pci_dev *pdev) if (zpci_use_mio(zdev)) pdev->resource[i].start = - (resource_size_t __force) zdev->bars[i].mio_wb; + (resource_size_t __force) zdev->bars[i].mio_wt; else pdev->resource[i].start = (resource_size_t __force) pci_iomap_range_fh(pdev, i, 0, 0); pdev->resource[i].end = pdev->resource[i].start + len - 1; } -#ifdef CONFIG_PCI_IOV - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - int bar = i + PCI_IOV_RESOURCES; - - len = pci_resource_len(pdev, bar); - if (!len) - continue; - pdev->resource[bar].parent = &iov_res; - } -#endif + zpci_iov_map_resources(pdev); } static void zpci_unmap_resources(struct pci_dev *pdev) @@ -484,6 +469,34 @@ static void zpci_free_iomap(struct zpci_dev *zdev, int entry) spin_unlock(&zpci_iomap_lock); } +static void zpci_do_update_iomap_fh(struct zpci_dev *zdev, u32 fh) +{ + int bar, idx; + + spin_lock(&zpci_iomap_lock); + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { + if (!zdev->bars[bar].size) + continue; + idx = zdev->bars[bar].map_idx; + if (!zpci_iomap_start[idx].count) + continue; + WRITE_ONCE(zpci_iomap_start[idx].fh, zdev->fh); + } + spin_unlock(&zpci_iomap_lock); +} + +void zpci_update_fh(struct zpci_dev *zdev, u32 fh) +{ + if (!fh || zdev->fh == fh) + return; + + zdev->fh = fh; + if (zpci_use_mio(zdev)) + return; + if (zdev->has_resources && zdev_enabled(zdev)) + zpci_do_update_iomap_fh(zdev, fh); +} + static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, unsigned long size, 
unsigned long flags) { @@ -505,15 +518,14 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, return r; } -static int zpci_setup_bus_resources(struct zpci_dev *zdev, - struct list_head *resources) +int zpci_setup_bus_resources(struct zpci_dev *zdev) { unsigned long addr, size, flags; struct resource *res; int i, entry; snprintf(zdev->res_name, sizeof(zdev->res_name), - "PCI Bus %04x:%02x", zdev->domain, ZPCI_BUS_NR); + "PCI Bus %04x:%02x", zdev->uid, ZPCI_BUS_NR); for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (!zdev->bars[i].size) @@ -531,7 +543,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, flags |= IORESOURCE_MEM_64; if (zpci_use_mio(zdev)) - addr = (unsigned long) zdev->bars[i].mio_wb; + addr = (unsigned long) zdev->bars[i].mio_wt; else addr = ZPCI_ADDR(entry); size = 1UL << zdev->bars[i].size; @@ -542,36 +554,45 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, return -ENOMEM; } zdev->bars[i].res = res; - pci_add_resource(resources, res); } + zdev->has_resources = 1; return 0; } static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) { + struct resource *res; int i; + pci_lock_rescan_remove(); for (i = 0; i < PCI_STD_NUM_BARS; i++) { - if (!zdev->bars[i].size || !zdev->bars[i].res) + res = zdev->bars[i].res; + if (!res) continue; + release_resource(res); + pci_bus_remove_resource(zdev->zbus->bus, res); zpci_free_iomap(zdev, zdev->bars[i].map_idx); - release_resource(zdev->bars[i].res); - kfree(zdev->bars[i].res); + zdev->bars[i].res = NULL; + kfree(res); } + zdev->has_resources = 0; + pci_unlock_rescan_remove(); } -int pcibios_add_device(struct pci_dev *pdev) +int pcibios_device_add(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); struct resource *res; int i; + /* The pdev has a reference to the zdev via its bus */ + zpci_zdev_get(zdev); if (pdev->is_physfn) pdev->no_vf_scan = 1; pdev->dev.groups = zpci_attr_groups; - pdev->dev.dma_ops = &s390_pci_dma_ops; zpci_map_resources(pdev); for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -586,7 +607,10 @@ int pcibios_add_device(struct pci_dev *pdev) void pcibios_release_device(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); + zpci_unmap_resources(pdev); + zpci_zdev_put(zdev); } int pcibios_enable_device(struct pci_dev *pdev, int mask) @@ -607,210 +631,323 @@ void pcibios_disable_device(struct pci_dev *pdev) zpci_debug_exit_device(zdev); } -#ifdef CONFIG_HIBERNATE_CALLBACKS -static int zpci_restore(struct device *dev) +static int __zpci_register_domain(int domain) { - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); - int ret = 0; - - if (zdev->state != ZPCI_FN_STATE_ONLINE) - goto out; - - ret = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES); - if (ret) - goto out; - - zpci_map_resources(pdev); - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - (u64) zdev->dma_table); - -out: - return ret; + spin_lock(&zpci_domain_lock); + if (test_bit(domain, zpci_domain)) { + spin_unlock(&zpci_domain_lock); + pr_err("Domain %04x is already assigned\n", domain); + return -EEXIST; + } + set_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); + return domain; } -static int zpci_freeze(struct device *dev) +static int __zpci_alloc_domain(void) { - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); + int domain; - if (zdev->state != ZPCI_FN_STATE_ONLINE) - return 0; - - zpci_unregister_ioat(zdev, 0); - zpci_unmap_resources(pdev); - return clp_disable_fh(zdev); + spin_lock(&zpci_domain_lock); + /* + 
* We can always auto allocate domains below ZPCI_NR_DEVICES. + * There is either a free domain or we have reached the maximum in + * which case we would have bailed earlier. + */ + domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); + set_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); + return domain; } -struct dev_pm_ops pcibios_pm_ops = { - .thaw_noirq = zpci_restore, - .freeze_noirq = zpci_freeze, - .restore_noirq = zpci_restore, - .poweroff_noirq = zpci_freeze, -}; -#endif /* CONFIG_HIBERNATE_CALLBACKS */ - -static int zpci_alloc_domain(struct zpci_dev *zdev) +int zpci_alloc_domain(int domain) { if (zpci_unique_uid) { - zdev->domain = (u16) zdev->uid; - if (zdev->domain >= ZPCI_NR_DEVICES) - return 0; - - spin_lock(&zpci_domain_lock); - if (test_bit(zdev->domain, zpci_domain)) { - spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n", - zdev->fid, zdev->domain); - return -EEXIST; - } - set_bit(zdev->domain, zpci_domain); - spin_unlock(&zpci_domain_lock); - return 0; - } - - spin_lock(&zpci_domain_lock); - zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); - if (zdev->domain == ZPCI_NR_DEVICES) { - spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n", - zdev->fid, ZPCI_NR_DEVICES); - return -ENOSPC; + if (domain) + return __zpci_register_domain(domain); + pr_warn("UID checking was active but no UID is provided: switching to automatic domain allocation\n"); + update_uid_checking(false); } - set_bit(zdev->domain, zpci_domain); - spin_unlock(&zpci_domain_lock); - return 0; + return __zpci_alloc_domain(); } -static void zpci_free_domain(struct zpci_dev *zdev) +void zpci_free_domain(int domain) { - if (zdev->domain >= ZPCI_NR_DEVICES) - return; - spin_lock(&zpci_domain_lock); - clear_bit(zdev->domain, zpci_domain); + clear_bit(domain, zpci_domain); spin_unlock(&zpci_domain_lock); } -void pcibios_remove_bus(struct pci_bus *bus) + +int zpci_enable_device(struct zpci_dev *zdev) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); + u32 fh = zdev->fh; + int rc = 0; - zpci_exit_slot(zdev); - zpci_cleanup_bus_resources(zdev); - zpci_destroy_iommu(zdev); - zpci_free_domain(zdev); + if (clp_enable_fh(zdev, &fh, ZPCI_NR_DMA_SPACES)) + rc = -EIO; + else + zpci_update_fh(zdev, fh); + return rc; +} +EXPORT_SYMBOL_GPL(zpci_enable_device); - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); +int zpci_disable_device(struct zpci_dev *zdev) +{ + u32 fh = zdev->fh; + int cc, rc = 0; - zpci_dbg(3, "rem fid:%x\n", zdev->fid); - kfree(zdev); + cc = clp_disable_fh(zdev, &fh); + if (!cc) { + zpci_update_fh(zdev, fh); + } else if (cc == CLP_RC_SETPCIFN_ALRDY) { + pr_info("Disabling PCI function %08x had no effect as it was already disabled\n", + zdev->fid); + /* Function is already disabled - update handle */ + rc = clp_refresh_fh(zdev->fid, &fh); + if (!rc) { + zpci_update_fh(zdev, fh); + rc = -EINVAL; + } + } else { + rc = -EIO; + } + return rc; } +EXPORT_SYMBOL_GPL(zpci_disable_device); -static int zpci_scan_bus(struct zpci_dev *zdev) +/** + * zpci_hot_reset_device - perform a reset of the given zPCI function + * @zdev: the slot which should be reset + * + * Performs a low level reset of the zPCI function. The reset is low level in + * the sense that the zPCI function can be reset without detaching it from the + * common PCI subsystem. 
The reset may be performed while under control of + * either DMA or IOMMU APIs in which case the existing DMA/IOMMU translation + * table is reinstated at the end of the reset. + * + * After the reset the function's internal state is reset to an initial state + * equivalent to its state during boot when first probing a driver. + * Consequently, after reset the PCI function requires re-initialization via the + * common PCI code including re-enabling IRQs via pci_alloc_irq_vectors() + * and enabling the function via e.g. pci_enable_device_flags(). The caller + * must guard against concurrent reset attempts. + * + * In most cases this function should not be called directly but through + * pci_reset_function() or pci_reset_bus() which handle the save/restore and + * locking. + * + * Return: 0 on success and an error value otherwise + */ +int zpci_hot_reset_device(struct zpci_dev *zdev) { - LIST_HEAD(resources); - int ret; + u8 status; + int rc; - ret = zpci_setup_bus_resources(zdev, &resources); - if (ret) - goto error; + zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh); + if (zdev_enabled(zdev)) { + /* Disables device access, DMAs and IRQs (reset state) */ + rc = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error state, the + * FH may indicate an enabled device while disable says the + * device is already disabled; don't treat this as an error here. + */ + if (rc == -EINVAL) + rc = 0; + if (rc) + return rc; + } - zdev->bus = pci_scan_root_bus(NULL, ZPCI_BUS_NR, &pci_root_ops, - zdev, &resources); - if (!zdev->bus) { - ret = -EIO; - goto error; + rc = zpci_enable_device(zdev); + if (rc) + return rc; + + if (zdev->dma_table) + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (rc) { + zpci_disable_device(zdev); + return rc; } - zdev->bus->max_bus_speed = zdev->max_bus_speed; - pci_bus_add_devices(zdev->bus); - return 0; -error: - zpci_cleanup_bus_resources(zdev); - pci_free_resource_list(&resources); - return ret; + return 0; } -int zpci_enable_device(struct zpci_dev *zdev) +/** + * zpci_create_device() - Create a new zpci_dev and add it to the zbus + * @fid: Function ID of the device to be created + * @fh: Current Function Handle of the device to be created + * @state: Initial state after creation, either Standby or Configured + * + * Creates a new zpci device and adds it to its, possibly newly created, zbus + * as well as zpci_list.
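+ * + * A minimal usage sketch (illustrative only; "entry" stands for a CLP function + * list entry, as in __clp_add() in pci_clp.c, with error handling elided): + * + *	zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state); + *	if (IS_ERR(zdev)) + *		return PTR_ERR(zdev);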
+ * + * Returns: the zdev on success or an error pointer otherwise + */ +struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) { + struct zpci_dev *zdev; int rc; - rc = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES); + zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state); + zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); + if (!zdev) + return ERR_PTR(-ENOMEM); + + /* FID and Function Handle are the static/dynamic identifiers */ + zdev->fid = fid; + zdev->fh = fh; + + /* Query function properties and update zdev */ + rc = clp_query_pci_fn(zdev); if (rc) - goto out; + goto error; + zdev->state = state; - rc = zpci_dma_init_device(zdev); + kref_init(&zdev->kref); + mutex_init(&zdev->lock); + mutex_init(&zdev->kzdev_lock); + + rc = zpci_init_iommu(zdev); if (rc) - goto out_dma; + goto error; - zdev->state = ZPCI_FN_STATE_ONLINE; - return 0; + rc = zpci_bus_device_register(zdev, &pci_root_ops); + if (rc) + goto error_destroy_iommu; -out_dma: - clp_disable_fh(zdev); -out: - return rc; + spin_lock(&zpci_list_lock); + list_add_tail(&zdev->entry, &zpci_list); + spin_unlock(&zpci_list_lock); + + return zdev; + +error_destroy_iommu: + zpci_destroy_iommu(zdev); +error: + zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); + kfree(zdev); + return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(zpci_enable_device); -int zpci_disable_device(struct zpci_dev *zdev) +bool zpci_is_device_configured(struct zpci_dev *zdev) { - zpci_dma_exit_device(zdev); - return clp_disable_fh(zdev); + enum zpci_state state = zdev->state; + + return state != ZPCI_FN_STATE_RESERVED && + state != ZPCI_FN_STATE_STANDBY; } -EXPORT_SYMBOL_GPL(zpci_disable_device); -int zpci_create_device(struct zpci_dev *zdev) +/** + * zpci_scan_configured_device() - Scan a freshly configured zpci_dev + * @zdev: The zpci_dev to be scanned + * @fh: The general function handle supplied by the platform + * + * Given a device in the configuration state Configured, enables, scans and + * adds it to the common code PCI subsystem if possible. If any failure occurs, + * the zpci_dev is left disabled. + * + * Return: 0 on success, or an error code otherwise + */ +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh) { - int rc; + zpci_update_fh(zdev, fh); + return zpci_bus_scan_device(zdev); +} - rc = zpci_alloc_domain(zdev); - if (rc) - goto out; +/** + * zpci_deconfigure_device() - Deconfigure a zpci_dev + * @zdev: The zpci_dev to deconfigure + * + * Deconfigure a zPCI function that is currently configured and possibly known + * to the common code PCI subsystem. + * If any failure occurs the device is left as is. + * + * Return: 0 on success, or an error code otherwise + */ +int zpci_deconfigure_device(struct zpci_dev *zdev) +{ + int rc; - rc = zpci_init_iommu(zdev); - if (rc) - goto out_free; + if (zdev->zbus->bus) + zpci_bus_remove_device(zdev, false); - mutex_init(&zdev->lock); - if (zdev->state == ZPCI_FN_STATE_CONFIGURED) { - rc = zpci_enable_device(zdev); + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); if (rc) - goto out_destroy_iommu; + return rc; } - rc = zpci_scan_bus(zdev); + + rc = sclp_pci_deconfigure(zdev->fid); + zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, rc); if (rc) - goto out_disable; + return rc; + zdev->state = ZPCI_FN_STATE_STANDBY; + + return 0; +} +/** + * zpci_device_reserved() - Mark device as reserved + * @zdev: the zpci_dev that was reserved + * + * Handle the case that a given zPCI function was reserved by another system.
+ * After a call to this function the zpci_dev can not be found via + * get_zdev_by_fid() anymore but may still be accessible via existing + * references though it will not be functional anymore. + */ +void zpci_device_reserved(struct zpci_dev *zdev) +{ + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + /* + * Remove device from zpci_list as it is going away. This also + * makes sure we ignore subsequent zPCI events for this device. + */ spin_lock(&zpci_list_lock); - list_add_tail(&zdev->entry, &zpci_list); + list_del(&zdev->entry); spin_unlock(&zpci_list_lock); + zdev->state = ZPCI_FN_STATE_RESERVED; + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + zpci_zdev_put(zdev); +} - zpci_init_slot(zdev); +void zpci_release_device(struct kref *kref) +{ + struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); + int ret; - return 0; + if (zdev->zbus->bus) + zpci_bus_remove_device(zdev, false); -out_disable: - if (zdev->state == ZPCI_FN_STATE_ONLINE) + if (zdev_enabled(zdev)) zpci_disable_device(zdev); -out_destroy_iommu: - zpci_destroy_iommu(zdev); -out_free: - zpci_free_domain(zdev); -out: - return rc; -} -void zpci_remove_device(struct zpci_dev *zdev) -{ - if (!zdev->bus) - return; - - pci_stop_root_bus(zdev->bus); - pci_remove_root_bus(zdev->bus); + switch (zdev->state) { + case ZPCI_FN_STATE_CONFIGURED: + ret = sclp_pci_deconfigure(zdev->fid); + zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret); + fallthrough; + case ZPCI_FN_STATE_STANDBY: + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + fallthrough; + case ZPCI_FN_STATE_RESERVED: + if (zdev->has_resources) + zpci_cleanup_bus_resources(zdev); + zpci_bus_device_unregister(zdev); + zpci_destroy_iommu(zdev); + fallthrough; + default: + break; + } + zpci_dbg(3, "rem fid:%x\n", zdev->fid); + kfree_rcu(zdev, rcu); } int zpci_report_error(struct pci_dev *pdev, @@ -822,6 +959,59 @@ int zpci_report_error(struct pci_dev *pdev, } EXPORT_SYMBOL(zpci_report_error); +/** + * zpci_clear_error_state() - Clears the zPCI error state of the device + * @zdev: The zdev for which the zPCI error state should be reset + * + * Clear the zPCI error state of the device. If clearing the zPCI error state + * fails the device is left in the error state. In this case it may make sense + * to call zpci_io_perm_failure() on the associated pdev if it exists. + * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_clear_error_state(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_ERROR); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "ces fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + +/** + * zpci_reset_load_store_blocked() - Re-enables L/S from error state + * @zdev: The zdev for which to unblock load/store access + * + * Re-enables load/store access for a PCI function in the error state while + * keeping DMA blocked. In this state drivers can poke MMIO space to determine + * if error recovery is possible while catching any rogue DMA access from the + * device. 
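+ * + * A possible recovery sketch (illustrative; driver_probe_mmio() is a + * hypothetical driver helper, and zpci_io_perm_failure() is the fallback + * suggested for zpci_clear_error_state() above): + * + *	rc = zpci_reset_load_store_blocked(zdev); + *	if (rc) + *		zpci_io_perm_failure(pdev); + *	else + *		recoverable = driver_probe_mmio(pdev);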
+ * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_reset_load_store_blocked(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_BLOCK); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "rls fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + static int zpci_mem_init(void) { BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) || @@ -842,6 +1032,9 @@ static int zpci_mem_init(void) if (!zpci_iomap_bitmap) goto error_iomap_bitmap; + if (static_branch_likely(&have_mio)) + clp_setup_writeback_mio(); + return 0; error_iomap_bitmap: kfree(zpci_iomap_start); @@ -859,7 +1052,6 @@ static void zpci_mem_exit(void) } static unsigned int s390_pci_probe __initdata = 1; -static unsigned int s390_pci_no_mio __initdata; unsigned int s390_pci_force_floating __initdata; static unsigned int s390_pci_initialized; @@ -870,13 +1062,17 @@ char * __init pcibios_setup(char *str) return NULL; } if (!strcmp(str, "nomio")) { - s390_pci_no_mio = 1; + S390_lowcore.machine_flags &= ~MACHINE_FLAG_PCI_MIO; return NULL; } if (!strcmp(str, "force_floating")) { s390_pci_force_floating = 1; return NULL; } + if (!strcmp(str, "norid")) { + s390_pci_no_rid = 1; + return NULL; + } return str; } @@ -892,12 +1088,14 @@ static int __init pci_base_init(void) if (!s390_pci_probe) return 0; - if (!test_facility(69) || !test_facility(71)) + if (!test_facility(69) || !test_facility(71)) { + pr_info("PCI is not supported because CPU facilities 69 or 71 are not available\n"); return 0; + } - if (test_facility(153) && !s390_pci_no_mio) { + if (MACHINE_HAS_PCI_MIO) { static_branch_enable(&have_mio); - ctl_set_bit(2, 5); + system_ctl_set_bit(2, CR2_MIO_ADDRESSING_BIT); } rc = zpci_debug_init(); @@ -912,20 +1110,15 @@ static int __init pci_base_init(void) if (rc) goto out_irq; - rc = zpci_dma_init(); - if (rc) - goto out_dma; - rc = clp_scan_pci_devices(); if (rc) goto out_find; + zpci_bus_scan_busses(); s390_pci_initialized = 1; return 0; out_find: - zpci_dma_exit(); -out_dma: zpci_irq_exit(); out_irq: zpci_mem_exit(); @@ -935,9 +1128,3 @@ out: return rc; } subsys_initcall_sync(pci_base_init); - -void zpci_rescan(void) -{ - if (zpci_is_enabled()) - clp_rescan_pci_devices_simple(); -} diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c new file mode 100644 index 000000000000..daa5d7450c7d --- /dev/null +++ b/arch/s390/pci/pci_bus.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel <pmorel@linux.ibm.com> + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/seq_file.h> +#include <linux/jump_label.h> +#include <linux/pci.h> +#include <linux/printk.h> + +#include <asm/pci_clp.h> +#include <asm/pci_dma.h> + +#include "pci_bus.h" +#include "pci_iov.h" + +static LIST_HEAD(zbus_list); +static DEFINE_MUTEX(zbus_list_lock); +static int zpci_nb_devices; + +/* zpci_bus_prepare_device - Prepare a zPCI function for scanning + * @zdev: the zPCI function to be prepared + * + * The PCI resources for the function are set up and added to its zbus and the + * function is enabled. The function must be added to a zbus which must have + * a PCI bus created. If an error occurs the zPCI function is not enabled. 
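+ * + * The scan paths below use it roughly as follows (sketch, error handling + * elided): + * + *	rc = zpci_bus_prepare_device(zdev); + *	if (!rc) + *		pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn);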
+ * + * Return: 0 on success, an error code otherwise + */ +static int zpci_bus_prepare_device(struct zpci_dev *zdev) +{ + int rc, i; + + if (!zdev_enabled(zdev)) { + rc = zpci_enable_device(zdev); + if (rc) + return rc; + } + + if (!zdev->has_resources) { + zpci_setup_bus_resources(zdev); + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (zdev->bars[i].res) + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0); + } + } + + return 0; +} + +/* zpci_bus_scan_device - Scan a single device adding it to the PCI core + * @zdev: the zdev to be scanned + * + * Scans the PCI function making it available to the common PCI code. + * + * Return: 0 on success, an error value otherwise + */ +int zpci_bus_scan_device(struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + int rc; + + rc = zpci_bus_prepare_device(zdev); + if (rc) + return rc; + + pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn); + if (!pdev) + return -ENODEV; + + pci_lock_rescan_remove(); + pci_bus_add_device(pdev); + pci_unlock_rescan_remove(); + + return 0; +} + +/* zpci_bus_remove_device - Removes the given zdev from the PCI core + * @zdev: the zdev to be removed from the PCI core + * @set_error: if true the device's error state is set to permanent failure + * + * Sets a zPCI device to a configured but offline state; the zPCI + * device is still accessible through its hotplug slot and the zPCI + * API but is removed from the common code PCI bus, making it + * no longer available to drivers. + */ +void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error) +{ + struct zpci_bus *zbus = zdev->zbus; + struct pci_dev *pdev; + + if (!zdev->zbus->bus) + return; + + pdev = pci_get_slot(zbus->bus, zdev->devfn); + if (pdev) { + if (set_error) + pdev->error_state = pci_channel_io_perm_failure; + if (pdev->is_virtfn) { + zpci_iov_remove_virtfn(pdev, zdev->vfn); + /* balance pci_get_slot */ + pci_dev_put(pdev); + return; + } + pci_stop_and_remove_bus_device_locked(pdev); + /* balance pci_get_slot */ + pci_dev_put(pdev); + } +} + +/* zpci_bus_scan_bus - Scan all configured zPCI functions on the bus + * @zbus: the zbus to be scanned + * + * Enables and scans all PCI functions on the bus making them available to the + * common PCI code. If a PCI function fails to be initialized an error will be + * returned but attempts will still be made for all other functions on the bus. 
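+ * + * Usage sketch, mirroring zpci_bus_scan_busses() below: + * + *	mutex_lock(&zbus_list_lock); + *	list_for_each_entry(zbus, &zbus_list, bus_next) + *		zpci_bus_scan_bus(zbus); + *	mutex_unlock(&zbus_list_lock);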
+ * + * Return: 0 on success, an error value otherwise + */ +int zpci_bus_scan_bus(struct zpci_bus *zbus) +{ + struct zpci_dev *zdev; + int devfn, rc, ret = 0; + + for (devfn = 0; devfn < ZPCI_FUNCTIONS_PER_BUS; devfn++) { + zdev = zbus->function[devfn]; + if (zdev && zdev->state == ZPCI_FN_STATE_CONFIGURED) { + rc = zpci_bus_prepare_device(zdev); + if (rc) + ret = -EIO; + } + } + + pci_lock_rescan_remove(); + pci_scan_child_bus(zbus->bus); + pci_bus_add_devices(zbus->bus); + pci_unlock_rescan_remove(); + + return ret; +} + +/* zpci_bus_scan_busses - Scan all registered busses + * + * Scan all available zbusses + * + */ +void zpci_bus_scan_busses(void) +{ + struct zpci_bus *zbus = NULL; + + mutex_lock(&zbus_list_lock); + list_for_each_entry(zbus, &zbus_list, bus_next) { + zpci_bus_scan_bus(zbus); + cond_resched(); + } + mutex_unlock(&zbus_list_lock); +} + +/* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus + * @zbus: the zbus holding the zdevices + * @fr: PCI root function that will determine the bus's domain, and bus speed + * @ops: the pci operations + * + * The PCI function @fr determines the domain (its UID), multifunction property + * and maximum bus speed of the entire bus. + * + * Return: 0 on success, an error code otherwise + */ +static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, struct pci_ops *ops) +{ + struct pci_bus *bus; + int domain; + + domain = zpci_alloc_domain((u16)fr->uid); + if (domain < 0) + return domain; + + zbus->domain_nr = domain; + zbus->multifunction = fr->rid_available; + zbus->max_bus_speed = fr->max_bus_speed; + + /* + * Note that zbus->resources is taken over by pci_create_root_bus() and + * is empty after a successful call. + */ + bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources); + if (!bus) { + zpci_free_domain(zbus->domain_nr); + return -EFAULT; + } + + zbus->bus = bus; + + return 0; +} + +static void zpci_bus_release(struct kref *kref) +{ + struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); + + if (zbus->bus) { + pci_lock_rescan_remove(); + pci_stop_root_bus(zbus->bus); + + zpci_free_domain(zbus->domain_nr); + pci_free_resource_list(&zbus->resources); + + pci_remove_root_bus(zbus->bus); + pci_unlock_rescan_remove(); + } + + mutex_lock(&zbus_list_lock); + list_del(&zbus->bus_next); + mutex_unlock(&zbus_list_lock); + kfree(zbus); +} + +static void zpci_bus_put(struct zpci_bus *zbus) +{ + kref_put(&zbus->kref, zpci_bus_release); +} + +static struct zpci_bus *zpci_bus_get(int pchid) +{ + struct zpci_bus *zbus; + + mutex_lock(&zbus_list_lock); + list_for_each_entry(zbus, &zbus_list, bus_next) { + if (pchid == zbus->pchid) { + kref_get(&zbus->kref); + goto out_unlock; + } + } + zbus = NULL; +out_unlock: + mutex_unlock(&zbus_list_lock); + return zbus; +} + +static struct zpci_bus *zpci_bus_alloc(int pchid) +{ + struct zpci_bus *zbus; + + zbus = kzalloc(sizeof(*zbus), GFP_KERNEL); + if (!zbus) + return NULL; + + zbus->pchid = pchid; + INIT_LIST_HEAD(&zbus->bus_next); + mutex_lock(&zbus_list_lock); + list_add_tail(&zbus->bus_next, &zbus_list); + mutex_unlock(&zbus_list_lock); + + kref_init(&zbus->kref); + INIT_LIST_HEAD(&zbus->resources); + + zbus->bus_resource.start = 0; + zbus->bus_resource.end = ZPCI_BUS_NR; + zbus->bus_resource.flags = IORESOURCE_BUS; + pci_add_resource(&zbus->resources, &zbus->bus_resource); + + return zbus; +} + +void pcibios_bus_add_device(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + /* + * With pdev->no_vf_scan the common PCI
probing code does not + * perform PF/VF linking. + */ + if (zdev->vfn) { + zpci_iov_setup_virtfn(zdev->zbus, pdev, zdev->vfn); + pdev->no_command_memory = 1; + } +} + +static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + int rc = -EINVAL; + + if (zbus->function[zdev->devfn]) { + pr_err("devfn %04x is already assigned\n", zdev->devfn); + return rc; + } + + zdev->zbus = zbus; + zbus->function[zdev->devfn] = zdev; + zpci_nb_devices++; + + if (zbus->multifunction && !zdev->rid_available) { + WARN_ONCE(1, "rid_available not set for multifunction\n"); + goto error; + } + rc = zpci_init_slot(zdev); + if (rc) + goto error; + zdev->has_hp_slot = 1; + + return 0; + +error: + zbus->function[zdev->devfn] = NULL; + zdev->zbus = NULL; + zpci_nb_devices--; + return rc; +} + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) +{ + struct zpci_bus *zbus = NULL; + int rc = -EBADF; + + if (zpci_nb_devices == ZPCI_NR_DEVICES) { + pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n", + zdev->fid, ZPCI_NR_DEVICES); + return -ENOSPC; + } + + if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS) + return -EINVAL; + + if (!s390_pci_no_rid && zdev->rid_available) + zbus = zpci_bus_get(zdev->pchid); + + if (!zbus) { + zbus = zpci_bus_alloc(zdev->pchid); + if (!zbus) + return -ENOMEM; + } + + if (!zbus->bus) { + /* The UID of the first PCI function registered with a zpci_bus + * is used as the domain number for that bus. Currently there + * is exactly one zpci_bus per domain. + */ + rc = zpci_bus_create_pci_bus(zbus, zdev, ops); + if (rc) + goto error; + } + + rc = zpci_bus_add_device(zbus, zdev); + if (rc) + goto error; + + return 0; + +error: + pr_err("Adding PCI function %08x failed\n", zdev->fid); + zpci_bus_put(zbus); + return rc; +} + +void zpci_bus_device_unregister(struct zpci_dev *zdev) +{ + struct zpci_bus *zbus = zdev->zbus; + + zpci_nb_devices--; + zbus->function[zdev->devfn] = NULL; + zpci_bus_put(zbus); +} diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h new file mode 100644 index 000000000000..af9f0ac79a1b --- /dev/null +++ b/arch/s390/pci/pci_bus.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel <pmorel@linux.ibm.com> + * + */ + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); +void zpci_bus_device_unregister(struct zpci_dev *zdev); + +int zpci_bus_scan_bus(struct zpci_bus *zbus); +void zpci_bus_scan_busses(void); + +int zpci_bus_scan_device(struct zpci_dev *zdev); +void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error); + +void zpci_release_device(struct kref *kref); +static inline void zpci_zdev_put(struct zpci_dev *zdev) +{ + if (zdev) + kref_put(&zdev->kref, zpci_release_device); +} + +static inline void zpci_zdev_get(struct zpci_dev *zdev) +{ + kref_get(&zdev->kref); +} + +int zpci_alloc_domain(int domain); +void zpci_free_domain(int domain); +int zpci_setup_bus_resources(struct zpci_dev *zdev); + +static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus, + unsigned int devfn) +{ + struct zpci_bus *zbus = bus->sysdata; + + return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? 
NULL : zbus->function[devfn]; +} + diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 4c613e569fe0..ee90a91ed888 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -17,17 +17,20 @@ #include <linux/delay.h> #include <linux/pci.h> #include <linux/uaccess.h> +#include <asm/asm-extable.h> #include <asm/pci_debug.h> #include <asm/pci_clp.h> #include <asm/clp.h> #include <uapi/asm/clp.h> +#include "pci_bus.h" + bool zpci_unique_uid; -static void update_uid_checking(bool new) +void update_uid_checking(bool new) { if (zpci_unique_uid != new) - zpci_dbg(1, "uid checking:%d\n", new); + zpci_dbg(3, "uid checking:%d\n", new); zpci_unique_uid = new; } @@ -102,6 +105,9 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->msi_addr = response->msia; zdev->max_msi = response->noi; zdev->fmb_update = response->mui; + zdev->version = response->version; + zdev->maxstbl = response->maxstbl; + zdev->dtsm = response->dtsm; switch (response->version) { case 1: @@ -155,13 +161,19 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->pfgid = response->pfgid; zdev->pft = response->pft; zdev->vfn = response->vfn; + zdev->port = response->port; zdev->uid = response->uid; zdev->fmb_length = sizeof(u32) * response->fmb_len; + zdev->rid_available = response->rid_avail; + zdev->is_physfn = response->is_physfn; + if (!s390_pci_no_rid && zdev->rid_available) + zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip)); if (response->util_str_avail) { memcpy(zdev->util_str, response->util_str, sizeof(zdev->util_str)); + zdev->util_str_avail = 1; } zdev->mio_capable = response->mio_addr_avail; for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -174,7 +186,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, return 0; } -static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh) +int clp_query_pci_fn(struct zpci_dev *zdev) { struct clp_req_rsp_query_pci *rrb; int rc; @@ -187,7 +199,7 @@ static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh) rrb->request.hdr.len = sizeof(rrb->request); rrb->request.hdr.cmd = CLP_QUERY_PCI_FN; rrb->response.hdr.len = sizeof(rrb->response); - rrb->request.fh = fh; + rrb->request.fh = zdev->fh; rc = clp_req(rrb, CLP_LPS_PCI); if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { @@ -205,60 +217,39 @@ out: return rc; } -int clp_add_pci_device(u32 fid, u32 fh, int configured) -{ - struct zpci_dev *zdev; - int rc = -ENOMEM; - - zpci_dbg(3, "add fid:%x, fh:%x, c:%d\n", fid, fh, configured); - zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); - if (!zdev) - goto error; - - zdev->fh = fh; - zdev->fid = fid; - - /* Query function properties and update zdev */ - rc = clp_query_pci_fn(zdev, fh); - if (rc) - goto error; - - if (configured) - zdev->state = ZPCI_FN_STATE_CONFIGURED; - else - zdev->state = ZPCI_FN_STATE_STANDBY; - - rc = zpci_create_device(zdev); - if (rc) - goto error; - return 0; - -error: - zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); - kfree(zdev); - return rc; -} - -/* - * Enable/Disable a given PCI function defined by its function handle. +/** + * clp_set_pci_fn() - Execute a command on a PCI function + * @zdev: Function that will be affected + * @fh: Out parameter for updated function handle + * @nr_dma_as: DMA address space number + * @command: The command code to execute + * + * Returns: 0 on success, < 0 for Linux errors (e.g. 
-ENOMEM), and + * > 0 for non-success platform responses */ -static int clp_set_pci_fn(u32 *fh, u8 nr_dma_as, u8 command) +static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 command) { struct clp_req_rsp_set_pci *rrb; int rc, retries = 100; + u32 gisa = 0; + *fh = 0; rrb = clp_alloc_block(GFP_KERNEL); if (!rrb) return -ENOMEM; + if (command != CLP_SET_DISABLE_PCI_FN) + gisa = zdev->gisa; + do { memset(rrb, 0, sizeof(*rrb)); rrb->request.hdr.len = sizeof(rrb->request); rrb->request.hdr.cmd = CLP_SET_PCI_FN; rrb->response.hdr.len = sizeof(rrb->response); - rrb->request.fh = *fh; + rrb->request.fh = zdev->fh; rrb->request.oc = command; rrb->request.ndas = nr_dma_as; + rrb->request.gisa = gisa; rc = clp_req(rrb, CLP_LPS_PCI); if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) { @@ -269,50 +260,107 @@ static int clp_set_pci_fn(u32 *fh, u8 nr_dma_as, u8 command) } } while (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY); - if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { *fh = rrb->response.fh; - else { + } else { zpci_err("Set PCI FN:\n"); zpci_err_clp(rrb->response.hdr.rsp, rc); - rc = -EIO; + if (!rc) + rc = rrb->response.hdr.rsp; } clp_free_block(rrb); return rc; } -int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as) +int clp_setup_writeback_mio(void) { - u32 fh = zdev->fh; + struct clp_req_rsp_slpc_pci *rrb; + u8 wb_bit_pos; int rc; - rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); - zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); - if (rc) - goto out; + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_SLPC; + rrb->response.hdr.len = sizeof(rrb->response); + + rc = clp_req(rrb, CLP_LPS_PCI); + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { + if (rrb->response.vwb) { + wb_bit_pos = rrb->response.mio_wb; + set_bit_inv(wb_bit_pos, &mio_wb_bit_mask); + zpci_dbg(3, "wb bit: %d\n", wb_bit_pos); + } else { + zpci_dbg(3, "wb bit: n.a.\n"); + } + + } else { + zpci_err("SLPC PCI:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + rc = -EIO; + } + clp_free_block(rrb); + return rc; +} + +int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as) +{ + int rc; - zdev->fh = fh; - if (zpci_use_mio(zdev)) { - rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO); - zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); + zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc); + if (!rc && zpci_use_mio(zdev)) { + rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_MIO); + zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", + zdev->fid, *fh, rc); if (rc) - clp_disable_fh(zdev); + clp_disable_fh(zdev, fh); } -out: return rc; } -int clp_disable_fh(struct zpci_dev *zdev) +int clp_disable_fh(struct zpci_dev *zdev, u32 *fh) { - u32 fh = zdev->fh; int rc; if (!zdev_enabled(zdev)) return 0; - rc = clp_set_pci_fn(&fh, 0, CLP_SET_DISABLE_PCI_FN); - zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); - if (!rc) - zdev->fh = fh; + rc = clp_set_pci_fn(zdev, fh, 0, CLP_SET_DISABLE_PCI_FN); + zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc); + return rc; +} + +static int clp_list_pci_req(struct clp_req_rsp_list_pci *rrb, + u64 *resume_token, int *nentries) +{ + int rc; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_LIST_PCI; + 
/* store as many entries as possible */ + rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN; + rrb->request.resume_token = *resume_token; + + /* Get PCI function handle list */ + rc = clp_req(rrb, CLP_LPS_PCI); + if (rc || rrb->response.hdr.rsp != CLP_RC_OK) { + zpci_err("List PCI FN:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + return -EIO; + } + + update_uid_checking(rrb->response.uid_checking); + WARN_ON_ONCE(rrb->response.entry_size != + sizeof(struct clp_fh_list_entry)); + + *nentries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) / + rrb->response.entry_size; + *resume_token = rrb->response.resume_token; return rc; } @@ -321,53 +369,43 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, void (*cb)(struct clp_fh_list_entry *, void *)) { u64 resume_token = 0; - int entries, i, rc; + int nentries, i, rc; do { - memset(rrb, 0, sizeof(*rrb)); - rrb->request.hdr.len = sizeof(rrb->request); - rrb->request.hdr.cmd = CLP_LIST_PCI; - /* store as many entries as possible */ - rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN; - rrb->request.resume_token = resume_token; - - /* Get PCI function handle list */ - rc = clp_req(rrb, CLP_LPS_PCI); - if (rc || rrb->response.hdr.rsp != CLP_RC_OK) { - zpci_err("List PCI FN:\n"); - zpci_err_clp(rrb->response.hdr.rsp, rc); - rc = -EIO; - goto out; - } - - update_uid_checking(rrb->response.uid_checking); - WARN_ON_ONCE(rrb->response.entry_size != - sizeof(struct clp_fh_list_entry)); - - entries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) / - rrb->response.entry_size; - - resume_token = rrb->response.resume_token; - for (i = 0; i < entries; i++) + rc = clp_list_pci_req(rrb, &resume_token, &nentries); + if (rc) + return rc; + for (i = 0; i < nentries; i++) cb(&rrb->response.fh_list[i], data); } while (resume_token); -out: + return rc; } -static void __clp_add(struct clp_fh_list_entry *entry, void *data) +static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid, + struct clp_fh_list_entry *entry) { - struct zpci_dev *zdev; + struct clp_fh_list_entry *fh_list; + u64 resume_token = 0; + int nentries, i, rc; - if (!entry->vendor_id) - return; + do { + rc = clp_list_pci_req(rrb, &resume_token, &nentries); + if (rc) + return rc; + fh_list = rrb->response.fh_list; + for (i = 0; i < nentries; i++) { + if (fh_list[i].fid == fid) { + *entry = fh_list[i]; + return 0; + } + } + } while (resume_token); - zdev = get_zdev_by_fid(entry->fid); - if (!zdev) - clp_add_pci_device(entry->fid, entry->fh, entry->config_state); + return -ENODEV; } -static void __clp_update(struct clp_fh_list_entry *entry, void *data) +static void __clp_add(struct clp_fh_list_entry *entry, void *data) { struct zpci_dev *zdev; @@ -375,10 +413,11 @@ static void __clp_update(struct clp_fh_list_entry *entry, void *data) return; zdev = get_zdev_by_fid(entry->fid); - if (!zdev) + if (zdev) { + zpci_zdev_put(zdev); return; - - zdev->fh = entry->fh; + } + zpci_create_device(entry->fid, entry->fh, entry->config_state); } int clp_scan_pci_devices(void) @@ -396,66 +435,44 @@ int clp_scan_pci_devices(void) return rc; } -int clp_rescan_pci_devices(void) -{ - struct clp_req_rsp_list_pci *rrb; - int rc; - - zpci_remove_reserved_devices(); - - rrb = clp_alloc_block(GFP_KERNEL); - if (!rrb) - return -ENOMEM; - - rc = clp_list_pci(rrb, NULL, __clp_add); - - clp_free_block(rrb); - return rc; -} - -int clp_rescan_pci_devices_simple(void) +/* + * Get the current function handle of the function matching @fid + */ +int clp_refresh_fh(u32 fid, u32 *fh) { struct 
clp_req_rsp_list_pci *rrb; + struct clp_fh_list_entry entry; int rc; rrb = clp_alloc_block(GFP_NOWAIT); if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, NULL, __clp_update); + rc = clp_find_pci(rrb, fid, &entry); + if (!rc) + *fh = entry.fh; clp_free_block(rrb); return rc; } -struct clp_state_data { - u32 fid; - enum zpci_state state; -}; - -static void __clp_get_state(struct clp_fh_list_entry *entry, void *data) -{ - struct clp_state_data *sd = data; - - if (entry->fid != sd->fid) - return; - - sd->state = entry->config_state; -} - int clp_get_state(u32 fid, enum zpci_state *state) { struct clp_req_rsp_list_pci *rrb; - struct clp_state_data sd = {fid, ZPCI_FN_STATE_RESERVED}; + struct clp_fh_list_entry entry; int rc; rrb = clp_alloc_block(GFP_ATOMIC); if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, &sd, __clp_get_state); - if (!rc) - *state = sd.state; + rc = clp_find_pci(rrb, fid, &entry); + if (!rc) { + *state = entry.config_state; + } else if (rc == -ENODEV) { + *state = ZPCI_FN_STATE_RESERVED; + rc = 0; + } clp_free_block(rrb); return rc; @@ -481,7 +498,7 @@ static int clp_base_command(struct clp_req *req, struct clp_req_hdr *lpcb) } } -static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb) +static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc_pci *lpcb) { unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); @@ -649,9 +666,4 @@ static struct miscdevice clp_misc_device = { .fops = &clp_misc_fops, }; -static int __init clp_misc_init(void) -{ - return misc_register(&clp_misc_device); -} - -device_initcall(clp_misc_init); +builtin_misc_device(clp_misc_device); diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 3408c0df3ebf..6dde2263c79d 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -53,9 +53,11 @@ static char *pci_fmt3_names[] = { }; static char *pci_sw_names[] = { - "Allocated pages", "Mapped pages", "Unmapped pages", + "Global RPCITs", + "Sync Map RPCITs", + "Sync RPCITs", }; static void pci_fmb_show(struct seq_file *m, char *name[], int length, @@ -69,10 +71,14 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length, static void pci_sw_counter_show(struct seq_file *m) { - struct zpci_dev *zdev = m->private; - atomic64_t *counter = &zdev->allocated_pages; + struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(m->private); + atomic64_t *counter; int i; + if (!ctrs) + return; + + counter = &ctrs->mapped_pages; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], atomic64_read(counter)); @@ -196,7 +202,7 @@ int __init zpci_debug_init(void) if (!pci_debug_err_id) return -EINVAL; debug_register_view(pci_debug_err_id, &debug_hex_ascii_view); - debug_set_level(pci_debug_err_id, 6); + debug_set_level(pci_debug_err_id, 3); debugfs_root = debugfs_create_dir("pci", NULL); return 0; diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c deleted file mode 100644 index 64b1399a73f0..000000000000 --- a/arch/s390/pci/pci_dma.c +++ /dev/null @@ -1,684 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 
2012 - * - * Author(s): - * Jan Glauber <jang@linux.vnet.ibm.com> - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/iommu-helper.h> -#include <linux/dma-mapping.h> -#include <linux/vmalloc.h> -#include <linux/pci.h> -#include <asm/pci_dma.h> - -static struct kmem_cache *dma_region_table_cache; -static struct kmem_cache *dma_page_table_cache; -static int s390_iommu_strict; - -static int zpci_refresh_global(struct zpci_dev *zdev) -{ - return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma, - zdev->iommu_pages * PAGE_SIZE); -} - -unsigned long *dma_alloc_cpu_table(void) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_region_table_cache, GFP_ATOMIC); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++) - *entry = ZPCI_TABLE_INVALID; - return table; -} - -static void dma_free_cpu_table(void *table) -{ - kmem_cache_free(dma_region_table_cache, table); -} - -static unsigned long *dma_alloc_page_table(void) -{ - unsigned long *table, *entry; - - table = kmem_cache_alloc(dma_page_table_cache, GFP_ATOMIC); - if (!table) - return NULL; - - for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++) - *entry = ZPCI_PTE_INVALID; - return table; -} - -static void dma_free_page_table(void *table) -{ - kmem_cache_free(dma_page_table_cache, table); -} - -static unsigned long *dma_get_seg_table_origin(unsigned long *entry) -{ - unsigned long *sto; - - if (reg_entry_isvalid(*entry)) - sto = get_rt_sto(*entry); - else { - sto = dma_alloc_cpu_table(); - if (!sto) - return NULL; - - set_rt_sto(entry, sto); - validate_rt_entry(entry); - entry_clr_protected(entry); - } - return sto; -} - -static unsigned long *dma_get_page_table_origin(unsigned long *entry) -{ - unsigned long *pto; - - if (reg_entry_isvalid(*entry)) - pto = get_st_pto(*entry); - else { - pto = dma_alloc_page_table(); - if (!pto) - return NULL; - set_st_pto(entry, pto); - validate_st_entry(entry); - entry_clr_protected(entry); - } - return pto; -} - -unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr) -{ - unsigned long *sto, *pto; - unsigned int rtx, sx, px; - - rtx = calc_rtx(dma_addr); - sto = dma_get_seg_table_origin(&rto[rtx]); - if (!sto) - return NULL; - - sx = calc_sx(dma_addr); - pto = dma_get_page_table_origin(&sto[sx]); - if (!pto) - return NULL; - - px = calc_px(dma_addr); - return &pto[px]; -} - -void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags) -{ - if (flags & ZPCI_PTE_INVALID) { - invalidate_pt_entry(entry); - } else { - set_pt_pfaa(entry, page_addr); - validate_pt_entry(entry); - } - - if (flags & ZPCI_TABLE_PROTECTED) - entry_set_protected(entry); - else - entry_clr_protected(entry); -} - -static int __dma_update_trans(struct zpci_dev *zdev, unsigned long pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - u8 *page_addr = (u8 *) (pa & PAGE_MASK); - unsigned long irq_flags; - unsigned long *entry; - int i, rc = 0; - - if (!nr_pages) - return -EINVAL; - - spin_lock_irqsave(&zdev->dma_table_lock, irq_flags); - if (!zdev->dma_table) { - rc = -EINVAL; - goto out_unlock; - } - - for (i = 0; i < nr_pages; i++) { - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); - if (!entry) { - rc = -ENOMEM; - goto undo_cpu_trans; - } - dma_update_cpu_trans(entry, page_addr, flags); - page_addr += PAGE_SIZE; - dma_addr += PAGE_SIZE; - } - -undo_cpu_trans: - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == 
ZPCI_PTE_VALID)) { - flags = ZPCI_PTE_INVALID; - while (i-- > 0) { - page_addr -= PAGE_SIZE; - dma_addr -= PAGE_SIZE; - entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); - if (!entry) - break; - dma_update_cpu_trans(entry, page_addr, flags); - } - } -out_unlock: - spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags); - return rc; -} - -static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, - size_t size, int flags) -{ - unsigned long irqflags; - int ret; - - /* - * With zdev->tlb_refresh == 0, rpcit is not required to establish new - * translations when previously invalid translation-table entries are - * validated. With lazy unmap, rpcit is skipped for previously valid - * entries, but a global rpcit is then required before any address can - * be re-used, i.e. after each iommu bitmap wrap-around. - */ - if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) { - if (!zdev->tlb_refresh) - return 0; - } else { - if (!s390_iommu_strict) - return 0; - } - - ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, - PAGE_ALIGN(size)); - if (ret == -ENOMEM && !s390_iommu_strict) { - /* enable the hypervisor to free some resources */ - if (zpci_refresh_global(zdev)) - goto out; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags); - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags); - ret = 0; - } -out: - return ret; -} - -static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, - dma_addr_t dma_addr, size_t size, int flags) -{ - int rc; - - rc = __dma_update_trans(zdev, pa, dma_addr, size, flags); - if (rc) - return rc; - - rc = __dma_purge_tlb(zdev, dma_addr, size, flags); - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) - __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID); - - return rc; -} - -void dma_free_seg_table(unsigned long entry) -{ - unsigned long *sto = get_rt_sto(entry); - int sx; - - for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++) - if (reg_entry_isvalid(sto[sx])) - dma_free_page_table(get_st_pto(sto[sx])); - - dma_free_cpu_table(sto); -} - -void dma_cleanup_tables(unsigned long *table) -{ - int rtx; - - if (!table) - return; - - for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++) - if (reg_entry_isvalid(table[rtx])) - dma_free_seg_table(table[rtx]); - - dma_free_cpu_table(table); -} - -static unsigned long __dma_alloc_iommu(struct device *dev, - unsigned long start, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long boundary_size; - - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages, - start, size, zdev->start_dma >> PAGE_SHIFT, - boundary_size, 0); -} - -static dma_addr_t dma_alloc_address(struct device *dev, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long offset, flags; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - offset = __dma_alloc_iommu(dev, zdev->next_bit, size); - if (offset == -1) { - if (!s390_iommu_strict) { - /* global flush before DMA addresses are reused */ - if (zpci_refresh_global(zdev)) - goto out_error; - - bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, - zdev->lazy_bitmap, zdev->iommu_pages); - bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); - } - /* wrap-around */ - offset = __dma_alloc_iommu(dev, 0, size); - if (offset == -1) - goto out_error; - } - zdev->next_bit = 
offset + size; - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - - return zdev->start_dma + offset * PAGE_SIZE; - -out_error: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - return DMA_MAPPING_ERROR; -} - -static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long flags, offset; - - offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT; - - spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); - if (!zdev->iommu_bitmap) - goto out; - - if (s390_iommu_strict) - bitmap_clear(zdev->iommu_bitmap, offset, size); - else - bitmap_set(zdev->lazy_bitmap, offset, size); - -out: - spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); -} - -static inline void zpci_err_dma(unsigned long rc, unsigned long addr) -{ - struct { - unsigned long rc; - unsigned long addr; - } __packed data = {rc, addr}; - - zpci_err_hex(&data, sizeof(data)); -} - -static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long pa = page_to_phys(page) + offset; - int flags = ZPCI_PTE_VALID; - unsigned long nr_pages; - dma_addr_t dma_addr; - int ret; - - /* This rounds up number of pages based on size and offset */ - nr_pages = iommu_num_pages(pa, size, PAGE_SIZE); - dma_addr = dma_alloc_address(dev, nr_pages); - if (dma_addr == DMA_MAPPING_ERROR) { - ret = -ENOSPC; - goto out_err; - } - - /* Use rounded up size */ - size = nr_pages * PAGE_SIZE; - - if (direction == DMA_NONE || direction == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - ret = dma_update_trans(zdev, pa, dma_addr, size, flags); - if (ret) - goto out_free; - - atomic64_add(nr_pages, &zdev->mapped_pages); - return dma_addr + (offset & ~PAGE_MASK); - -out_free: - dma_free_address(dev, dma_addr, nr_pages); -out_err: - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return DMA_MAPPING_ERROR; -} - -static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - int npages, ret; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - dma_addr = dma_addr & PAGE_MASK; - ret = dma_update_trans(zdev, 0, dma_addr, npages * PAGE_SIZE, - ZPCI_PTE_INVALID); - if (ret) { - zpci_err("unmap error:\n"); - zpci_err_dma(ret, dma_addr); - return; - } - - atomic64_add(npages, &zdev->unmapped_pages); - dma_free_address(dev, dma_addr, npages); -} - -static void *s390_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - struct page *page; - unsigned long pa; - dma_addr_t map; - - size = PAGE_ALIGN(size); - page = alloc_pages(flag | __GFP_ZERO, get_order(size)); - if (!page) - return NULL; - - pa = page_to_phys(page); - map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0); - if (dma_mapping_error(dev, map)) { - free_pages(pa, get_order(size)); - return NULL; - } - - atomic64_add(size / PAGE_SIZE, &zdev->allocated_pages); - if (dma_handle) - *dma_handle = map; - return (void *) pa; -} - -static void s390_dma_free(struct device *dev, size_t size, - void *pa, dma_addr_t dma_handle, - unsigned long attrs) -{ - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - - size = PAGE_ALIGN(size); - atomic64_sub(size / PAGE_SIZE, 
&zdev->allocated_pages); - s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0); - free_pages((unsigned long) pa, get_order(size)); -} - -/* Map a segment into a contiguous dma address area */ -static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - size_t size, dma_addr_t *handle, - enum dma_data_direction dir) -{ - unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - dma_addr_t dma_addr_base, dma_addr; - int flags = ZPCI_PTE_VALID; - struct scatterlist *s; - unsigned long pa = 0; - int ret; - - dma_addr_base = dma_alloc_address(dev, nr_pages); - if (dma_addr_base == DMA_MAPPING_ERROR) - return -ENOMEM; - - dma_addr = dma_addr_base; - if (dir == DMA_NONE || dir == DMA_TO_DEVICE) - flags |= ZPCI_TABLE_PROTECTED; - - for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { - pa = page_to_phys(sg_page(s)); - ret = __dma_update_trans(zdev, pa, dma_addr, - s->offset + s->length, flags); - if (ret) - goto unmap; - - dma_addr += s->offset + s->length; - } - ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); - if (ret) - goto unmap; - - *handle = dma_addr_base; - atomic64_add(nr_pages, &zdev->mapped_pages); - - return ret; - -unmap: - dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, - ZPCI_PTE_INVALID); - dma_free_address(dev, dma_addr_base, nr_pages); - zpci_err("map error:\n"); - zpci_err_dma(ret, pa); - return ret; -} - -static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s = sg, *start = sg, *dma = sg; - unsigned int max = dma_get_max_seg_size(dev); - unsigned int size = s->offset + s->length; - unsigned int offset = s->offset; - int count = 0, i; - - for (i = 1; i < nr_elements; i++) { - s = sg_next(s); - - s->dma_address = DMA_MAPPING_ERROR; - s->dma_length = 0; - - if (s->offset || (size & ~PAGE_MASK) || - size + s->length > max) { - if (__s390_dma_map_sg(dev, start, size, - &dma->dma_address, dir)) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - size = offset = s->offset; - start = s; - dma = sg_next(dma); - count++; - } - size += s->length; - } - if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir)) - goto unmap; - - dma->dma_address += offset; - dma->dma_length = size - offset; - - return count + 1; -unmap: - for_each_sg(sg, s, count, i) - s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s), - dir, attrs); - - return 0; -} - -static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nr_elements, i) { - if (s->dma_length) - s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, - dir, attrs); - s->dma_address = 0; - s->dma_length = 0; - } -} - -int zpci_dma_init_device(struct zpci_dev *zdev) -{ - int rc; - - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. 
- */ - WARN_ON(zdev->s390_domain); - - spin_lock_init(&zdev->iommu_bitmap_lock); - spin_lock_init(&zdev->dma_table_lock); - - zdev->dma_table = dma_alloc_cpu_table(); - if (!zdev->dma_table) { - rc = -ENOMEM; - goto out; - } - - /* - * Restrict the iommu bitmap size to the minimum of the following: - * - main memory size - * - 3-level pagetable address limit minus start_dma offset - * - DMA address range allowed by the hardware (clp query pci fn) - * - * Also set zdev->end_dma to the actual end address of the usable - * range, instead of the theoretical maximum as reported by hardware. - */ - zdev->start_dma = PAGE_ALIGN(zdev->start_dma); - zdev->iommu_size = min3((u64) high_memory, - ZPCI_TABLE_SIZE_RT - zdev->start_dma, - zdev->end_dma - zdev->start_dma + 1); - zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; - zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT; - zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8); - if (!zdev->iommu_bitmap) { - rc = -ENOMEM; - goto free_dma_table; - } - if (!s390_iommu_strict) { - zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8); - if (!zdev->lazy_bitmap) { - rc = -ENOMEM; - goto free_bitmap; - } - - } - rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - (u64) zdev->dma_table); - if (rc) - goto free_bitmap; - - return 0; -free_bitmap: - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; -free_dma_table: - dma_free_cpu_table(zdev->dma_table); - zdev->dma_table = NULL; -out: - return rc; -} - -void zpci_dma_exit_device(struct zpci_dev *zdev) -{ - /* - * At this point, if the device is part of an IOMMU domain, this would - * be a strong hint towards a bug in the IOMMU API (common) code and/or - * simultaneous access via IOMMU and DMA API. So let's issue a warning. 
- */ - WARN_ON(zdev->s390_domain); - - if (zpci_unregister_ioat(zdev, 0)) - return; - - dma_cleanup_tables(zdev->dma_table); - zdev->dma_table = NULL; - vfree(zdev->iommu_bitmap); - zdev->iommu_bitmap = NULL; - vfree(zdev->lazy_bitmap); - zdev->lazy_bitmap = NULL; - - zdev->next_bit = 0; -} - -static int __init dma_alloc_cpu_table_caches(void) -{ - dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables", - ZPCI_TABLE_SIZE, ZPCI_TABLE_ALIGN, - 0, NULL); - if (!dma_region_table_cache) - return -ENOMEM; - - dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables", - ZPCI_PT_SIZE, ZPCI_PT_ALIGN, - 0, NULL); - if (!dma_page_table_cache) { - kmem_cache_destroy(dma_region_table_cache); - return -ENOMEM; - } - return 0; -} - -int __init zpci_dma_init(void) -{ - return dma_alloc_cpu_table_caches(); -} - -void zpci_dma_exit(void) -{ - kmem_cache_destroy(dma_page_table_cache); - kmem_cache_destroy(dma_region_table_cache); -} - -const struct dma_map_ops s390_pci_dma_ops = { - .alloc = s390_dma_alloc, - .free = s390_dma_free, - .map_sg = s390_dma_map_sg, - .unmap_sg = s390_dma_unmap_sg, - .map_page = s390_dma_map_pages, - .unmap_page = s390_dma_unmap_pages, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, - /* dma_supported is unconditionally true without a callback */ -}; -EXPORT_SYMBOL_GPL(s390_pci_dma_ops); - -static int __init s390_iommu_setup(char *str) -{ - if (!strcmp(str, "strict")) - s390_iommu_strict = 1; - return 1; -} - -__setup("s390_iommu=", s390_iommu_setup); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 8d6ee4af4230..4d9773ef9e0a 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -12,8 +12,11 @@ #include <linux/kernel.h> #include <linux/pci.h> #include <asm/pci_debug.h> +#include <asm/pci_dma.h> #include <asm/sclp.h> +#include "pci_bus.h" + /* Content Code Description for PCI Function Error */ struct zpci_ccdf_err { u32 reserved1; @@ -44,25 +47,254 @@ struct zpci_ccdf_avail { u16 pec; /* PCI event code */ } __packed; +static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) +{ + switch (ers_res) { + case PCI_ERS_RESULT_CAN_RECOVER: + case PCI_ERS_RESULT_RECOVERED: + case PCI_ERS_RESULT_NEED_RESET: + return false; + default: + return true; + } +} + +static bool is_passed_through(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + bool ret; + + mutex_lock(&zdev->kzdev_lock); + ret = !!zdev->kzdev; + mutex_unlock(&zdev->kzdev_lock); + + return ret; +} + +static bool is_driver_supported(struct pci_driver *driver) +{ + if (!driver || !driver->err_handler) + return false; + if (!driver->err_handler->error_detected) + return false; + if (!driver->err_handler->slot_reset) + return false; + if (!driver->err_handler->resume) + return false; + return true; +} + +static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + ers_res = driver->err_handler->error_detected(pdev, pdev->error_state); + if (ers_result_indicates_abort(ers_res)) + pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev)); + else if (ers_res == PCI_ERS_RESULT_NEED_RESET) + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct zpci_dev *zdev = to_zpci(pdev); + int rc; + + 
pr_info("%s: Unblocking device access for examination\n", pci_name(pdev)); + rc = zpci_reset_load_store_blocked(zdev); + if (rc) { + pr_err("%s: Unblocking device access failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + if (driver->err_handler->mmio_enabled) { + ers_res = driver->err_handler->mmio_enabled(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after MMIO re-enable\n", + pci_name(pdev)); + return ers_res; + } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) { + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + return ers_res; + } + } + + pr_debug("%s: Unblocking DMA\n", pci_name(pdev)); + rc = zpci_clear_error_state(zdev); + if (!rc) { + pdev->error_state = pci_channel_io_normal; + } else { + pr_err("%s: Unblocking DMA failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + pr_info("%s: Initiating reset\n", pci_name(pdev)); + if (zpci_hot_reset_device(to_zpci(pdev))) { + pr_err("%s: The reset request failed\n", pci_name(pdev)); + return ers_res; + } + pdev->error_state = pci_channel_io_normal; + ers_res = driver->err_handler->slot_reset(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev)); + return ers_res; + } + + return ers_res; +} + +/* zpci_event_attempt_error_recovery - Try to recover the given PCI function + * @pdev: PCI function to recover currently in the error state + * + * We follow the scheme outlined in Documentation/PCI/pci-error-recovery.rst. + * With the simplification that recovery always happens per function + * and the platform determines which functions are affected for + * multi-function devices. + */ +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct pci_driver *driver; + + /* + * Ensure that the PCI function is not removed concurrently, no driver + * is unbound or probed and that userspace can't access its + * configuration space while we perform recovery. 
+ */ + pci_dev_lock(pdev); + if (pdev->error_state == pci_channel_io_perm_failure) { + ers_res = PCI_ERS_RESULT_DISCONNECT; + goto out_unlock; + } + pdev->error_state = pci_channel_io_frozen; + + if (is_passed_through(pdev)) { + pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", + pci_name(pdev)); + goto out_unlock; + } + + driver = to_pci_driver(pdev->dev.driver); + if (!is_driver_supported(driver)) { + if (!driver) + pr_info("%s: Cannot be recovered because no driver is bound to the device\n", + pci_name(pdev)); + else + pr_info("%s: The %s driver bound to the device does not support error recovery\n", + pci_name(pdev), + driver->name); + goto out_unlock; + } + + ers_res = zpci_event_notify_error_detected(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + + if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) { + ers_res = zpci_event_do_error_state_clear(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + } + + if (ers_res == PCI_ERS_RESULT_NEED_RESET) + ers_res = zpci_event_do_reset(pdev, driver); + + if (ers_res != PCI_ERS_RESULT_RECOVERED) { + pr_err("%s: Automatic recovery failed; operator intervention is required\n", + pci_name(pdev)); + goto out_unlock; + } + + pr_info("%s: The device is ready to resume operations\n", pci_name(pdev)); + if (driver->err_handler->resume) + driver->err_handler->resume(pdev); +out_unlock: + pci_dev_unlock(pdev); + + return ers_res; +} + +/* zpci_event_io_failure - Report PCI channel failure state to driver + * @pdev: PCI function for which to report + * @es: PCI channel failure state to report + */ +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) +{ + struct pci_driver *driver; + + pci_dev_lock(pdev); + pdev->error_state = es; + /** + * While vfio-pci's error_detected callback notifies user-space QEMU + * reacts to this by freezing the guest. In an s390 environment PCI + * errors are rarely fatal so this is overkill. Instead in the future + * we will inject the error event and let the guest recover the device + * itself. + */ + if (is_passed_through(pdev)) + goto out; + driver = to_pci_driver(pdev->dev.driver); + if (driver && driver->err_handler && driver->err_handler->error_detected) + driver->err_handler->error_detected(pdev, pdev->error_state); +out: + pci_dev_unlock(pdev); +} + static void __zpci_event_error(struct zpci_ccdf_err *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); struct pci_dev *pdev = NULL; + pci_ers_result_t ers_res; + zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); zpci_err("error CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); - if (zdev) - pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN); + if (zdev) { + zpci_update_fh(zdev, ccdf->fh); + if (zdev->zbus->bus) + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + } pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n", pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); if (!pdev) - return; + goto no_pdev; - pdev->error_state = pci_channel_io_perm_failure; + switch (ccdf->pec) { + case 0x003a: /* Service Action or Error Recovery Successful */ + ers_res = zpci_event_attempt_error_recovery(pdev); + if (ers_res != PCI_ERS_RESULT_RECOVERED) + zpci_event_io_failure(pdev, pci_channel_io_perm_failure); + break; + default: + /* + * Mark as frozen not permanently failed because the device + * could be subsequently recovered by the platform. 
+ */ + zpci_event_io_failure(pdev, pci_channel_io_frozen); + break; + } pci_dev_put(pdev); +no_pdev: + zpci_zdev_put(zdev); } void zpci_event_error(void *data) @@ -71,90 +303,88 @@ void zpci_event_error(void *data) __zpci_event_error(data); } +static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) +{ + zpci_update_fh(zdev, fh); + /* Give the driver a hint that the function is + * already unusable. + */ + zpci_bus_remove_device(zdev, true); + /* Even though the device is already gone we still + * need to free zPCI resources as part of the disable. + */ + if (zdev_enabled(zdev)) + zpci_disable_device(zdev); + zdev->state = ZPCI_FN_STATE_STANDBY; +} + static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); - struct pci_dev *pdev = NULL; + bool existing_zdev = !!zdev; enum zpci_state state; - int ret; - - if (zdev) - pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN); - - pr_info("%s: Event 0x%x reconfigured PCI function 0x%x\n", - pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); - zpci_err("avail CCDF:\n"); - zpci_err_hex(ccdf, sizeof(*ccdf)); + zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); switch (ccdf->pec) { case 0x0301: /* Reserved|Standby -> Configured */ if (!zdev) { - ret = clp_add_pci_device(ccdf->fid, ccdf->fh, 0); - if (ret) + zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED); + if (IS_ERR(zdev)) break; - zdev = get_zdev_by_fid(ccdf->fid); + } else { + /* the configuration request may be stale */ + if (zdev->state != ZPCI_FN_STATE_STANDBY) + break; + zdev->state = ZPCI_FN_STATE_CONFIGURED; } - if (!zdev || zdev->state != ZPCI_FN_STATE_STANDBY) - break; - zdev->state = ZPCI_FN_STATE_CONFIGURED; - zdev->fh = ccdf->fh; - ret = zpci_enable_device(zdev); - if (ret) - break; - pci_lock_rescan_remove(); - pci_rescan_bus(zdev->bus); - pci_unlock_rescan_remove(); + zpci_scan_configured_device(zdev, ccdf->fh); break; case 0x0302: /* Reserved -> Standby */ if (!zdev) - clp_add_pci_device(ccdf->fid, ccdf->fh, 0); + zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); + else + zpci_update_fh(zdev, ccdf->fh); break; case 0x0303: /* Deconfiguration requested */ - if (!zdev) - break; - if (pdev) - pci_stop_and_remove_bus_device_locked(pdev); - - ret = zpci_disable_device(zdev); - if (ret) - break; - - ret = sclp_pci_deconfigure(zdev->fid); - zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret); - if (!ret) - zdev->state = ZPCI_FN_STATE_STANDBY; - + if (zdev) { + /* The event may have been queued before we confirgured + * the device. + */ + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + break; + zpci_update_fh(zdev, ccdf->fh); + zpci_deconfigure_device(zdev); + } break; case 0x0304: /* Configured -> Standby|Reserved */ - if (!zdev) - break; - if (pdev) { - /* Give the driver a hint that the function is - * already unusable. 
*/ - pdev->error_state = pci_channel_io_perm_failure; - pci_stop_and_remove_bus_device_locked(pdev); - } - - zdev->fh = ccdf->fh; - zpci_disable_device(zdev); - zdev->state = ZPCI_FN_STATE_STANDBY; - if (!clp_get_state(ccdf->fid, &state) && - state == ZPCI_FN_STATE_RESERVED) { - zpci_remove_device(zdev); + if (zdev) { + /* The event may have been queued before we confirgured + * the device.: + */ + if (zdev->state == ZPCI_FN_STATE_CONFIGURED) + zpci_event_hard_deconfigured(zdev, ccdf->fh); + /* The 0x0304 event may immediately reserve the device */ + if (!clp_get_state(zdev->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) { + zpci_device_reserved(zdev); + } } break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ - clp_rescan_pci_devices(); + zpci_remove_reserved_devices(); + clp_scan_pci_devices(); break; case 0x0308: /* Standby -> Reserved */ if (!zdev) break; - zpci_remove_device(zdev); + zpci_device_reserved(zdev); break; default: break; } - pci_dev_put(pdev); + if (existing_zdev) + zpci_zdev_put(zdev); } void zpci_event_availability(void *data) diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 02f9505c99a8..56480be48244 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -9,6 +9,7 @@ #include <linux/errno.h> #include <linux/delay.h> #include <linux/jump_label.h> +#include <asm/asm-extable.h> #include <asm/facility.h> #include <asm/pci_insn.h> #include <asm/pci_debug.h> @@ -17,16 +18,40 @@ #define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ -static inline void zpci_err_insn(u8 cc, u8 status, u64 req, u64 offset) +struct zpci_err_insn_data { + u8 insn; + u8 cc; + u8 status; + union { + struct { + u64 req; + u64 offset; + }; + struct { + u64 addr; + u64 len; + }; + }; +} __packed; + +static inline void zpci_err_insn_req(int lvl, u8 insn, u8 cc, u8 status, + u64 req, u64 offset) { - struct { - u64 req; - u64 offset; - u8 cc; - u8 status; - } __packed data = {req, offset, cc, status}; - - zpci_err_hex(&data, sizeof(data)); + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .req = req, .offset = offset}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); +} + +static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status, + u64 addr, u64 len) +{ + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .addr = addr, .len = len}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); } /* Modify PCI Function Controls */ @@ -46,33 +71,41 @@ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) { + bool retried = false; u8 cc; do { cc = __mpcifc(req, fib, status); - if (cc == 2) + if (cc == 2) { msleep(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'M', cc, *status, req, 0); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, *status, req, 0); + zpci_err_insn_req(0, 'M', cc, *status, req, 0); + else if (retried) + zpci_err_insn_req(1, 'M', cc, *status, req, 0); return cc; } +EXPORT_SYMBOL_GPL(zpci_mod_fc); /* Refresh PCI Translations */ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) { - register u64 __addr asm("2") = addr; - register u64 __range asm("3") = range; + union register_pair addr_range = {.even = addr, .odd = range}; u8 cc; asm volatile ( - " .insn rre,0xb9d30000,%[fn],%[addr]\n" + " .insn rre,0xb9d30000,%[fn],%[addr_range]\n" " ipm %[cc]\n" " srl %[cc],28\n" : [cc] "=d" (cc), [fn] "+d" (fn) - : [addr] "d" (__addr), "d" (__range) + : [addr_range] 
"d" (addr_range.pair) : "cc"); *status = fn >> 24 & 0xff; return cc; @@ -80,16 +113,24 @@ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) int zpci_refresh_trans(u64 fn, u64 addr, u64 range) { + bool retried = false; u8 cc, status; do { cc = __rpcit(fn, addr, range, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_addr(1, 'R', cc, status, addr, range); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, addr, range); + zpci_err_insn_addr(0, 'R', cc, status, addr, range); + else if (retried) + zpci_err_insn_addr(1, 'R', cc, status, addr, range); if (cc == 1 && (status == 4 || status == 16)) return -ENOMEM; @@ -98,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; @@ -109,25 +150,24 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) return 0; } +EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl); /* PCI Load */ static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) { - register u64 __req asm("2") = req; - register u64 __offset asm("3") = offset; + union register_pair req_off = {.even = req, .odd = offset}; int cc = -ENXIO; u64 __data; asm volatile ( - " .insn rre,0xb9d20000,%[data],%[req]\n" + " .insn rre,0xb9d20000,%[data],%[req_off]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [data] "=d" (__data), [req] "+d" (__req) - : "d" (__offset) - : "cc"); - *status = __req >> 24 & 0xff; + : [cc] "+d" (cc), [data] "=d" (__data), + [req_off] "+&d" (req_off.pair) :: "cc"); + *status = req_off.even >> 24 & 0xff; *data = __data; return cc; } @@ -146,17 +186,25 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) int __zpci_load(u64 *data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcilg(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'l', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 'l', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'l', cc, status, req, offset); return (cc > 0) ? 
-EIO : cc; } @@ -166,28 +214,26 @@ static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); return __zpci_load(data, req, ZPCI_OFFSET(addr)); } static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) { - register u64 addr asm("2") = ioaddr; - register u64 r3 asm("3") = len; + union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; int cc = -ENXIO; u64 __data; asm volatile ( - " .insn rre,0xb9d60000,%[data],%[ioaddr]\n" + " .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [data] "=d" (__data), "+d" (r3) - : [ioaddr] "d" (addr) - : "cc"); - *status = r3 >> 24 & 0xff; + : [cc] "+d" (cc), [data] "=d" (__data), + [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc"); + *status = ioaddr_len.odd >> 24 & 0xff; *data = __data; return cc; } @@ -202,7 +248,7 @@ int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) cc = __pcilg_mio(data, (__force u64) addr, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) addr); + zpci_err_insn_addr(0, 'L', cc, status, (__force u64) addr, len); return (cc > 0) ? -EIO : cc; } @@ -211,36 +257,43 @@ EXPORT_SYMBOL_GPL(zpci_load); /* PCI Store */ static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) { - register u64 __req asm("2") = req; - register u64 __offset asm("3") = offset; + union register_pair req_off = {.even = req, .odd = offset}; int cc = -ENXIO; asm volatile ( - " .insn rre,0xb9d00000,%[data],%[req]\n" + " .insn rre,0xb9d00000,%[data],%[req_off]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), [req] "+d" (__req) - : "d" (__offset), [data] "d" (data) + : [cc] "+d" (cc), [req_off] "+&d" (req_off.pair) + : [data] "d" (data) : "cc"); - *status = __req >> 24 & 0xff; + *status = req_off.even >> 24 & 0xff; return cc; } int __zpci_store(u64 data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcistg(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 's', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 's', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 's', cc, status, req, offset); return (cc > 0) ? 
-EIO : cc; } @@ -250,27 +303,26 @@ static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); return __zpci_store(data, req, ZPCI_OFFSET(addr)); } static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) { - register u64 addr asm("2") = ioaddr; - register u64 r3 asm("3") = len; + union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; int cc = -ENXIO; asm volatile ( - " .insn rre,0xb9d40000,%[data],%[ioaddr]\n" + " .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n" "0: ipm %[cc]\n" " srl %[cc],28\n" "1:\n" EX_TABLE(0b, 1b) - : [cc] "+d" (cc), "+d" (r3) - : [data] "d" (data), [ioaddr] "d" (addr) - : "cc"); - *status = r3 >> 24 & 0xff; + : [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) + : [data] "d" (data) + : "cc", "memory"); + *status = ioaddr_len.odd >> 24 & 0xff; return cc; } @@ -284,7 +336,7 @@ int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) cc = __pcistg_mio(data, (__force u64) addr, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) addr); + zpci_err_insn_addr(0, 'S', cc, status, (__force u64) addr, len); return (cc > 0) ? -EIO : cc; } @@ -310,17 +362,25 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) int __zpci_store_block(const u64 *data, u64 req, u64 offset) { + bool retried = false; u8 status; int cc; do { cc = __pcistb(data, req, offset, &status); - if (cc == 2) + if (cc == 2) { udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(0, 'b', cc, status, req, offset); + retried = true; + } + } } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, offset); + zpci_err_insn_req(0, 'b', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'b', cc, status, req, offset); return (cc > 0) ? -EIO : cc; } @@ -364,7 +424,7 @@ int zpci_write_block(volatile void __iomem *dst, cc = __pcistb_mio(src, (__force u64) dst, len, &status); if (cc) - zpci_err_insn(cc, status, 0, (__force u64) dst); + zpci_err_insn_addr(0, 'B', cc, status, (__force u64) dst, len); return (cc > 0) ? -EIO : cc; } @@ -372,10 +432,7 @@ EXPORT_SYMBOL_GPL(zpci_write_block); static inline void __pciwb_mio(void) { - unsigned long unused = 0; - - asm volatile (".insn rre,0xb9d50000,%[op],%[op]\n" - : [op] "+d" (unused)); + asm volatile (".insn rre,0xb9d50000,0,0\n"); } void zpci_barrier(void) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c new file mode 100644 index 000000000000..ead062bf2b41 --- /dev/null +++ b/arch/s390/pci/pci_iov.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 
2020 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "pci_iov.h" + +static struct resource iov_res = { + .name = "PCI IOV res", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, +}; + +void zpci_iov_map_resources(struct pci_dev *pdev) +{ + resource_size_t len; + int i; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + int bar = i + PCI_IOV_RESOURCES; + + len = pci_resource_len(pdev, bar); + if (!len) + continue; + pdev->resource[bar].parent = &iov_res; + } +} + +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) +{ + pci_lock_rescan_remove(); + /* Linux' vfid's start at 0 vfn at 1 */ + pci_iov_remove_virtfn(pdev->physfn, vfn - 1); + pci_unlock_rescan_remove(); +} + +static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, int vfid) +{ + int rc; + + rc = pci_iov_sysfs_link(pdev, virtfn, vfid); + if (rc) + return rc; + + virtfn->is_virtfn = 1; + virtfn->multifunction = 0; + virtfn->physfn = pci_dev_get(pdev); + + return 0; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + int i, cand_devfn; + struct zpci_dev *zdev; + struct pci_dev *pdev; + int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ + int rc = 0; + + if (!zbus->multifunction) + return 0; + + /* If the parent PF for the given VF is also configured in the + * instance, it must be on the same zbus. + * We can then identify the parent PF by checking what + * devfn the VF would have if it belonged to that PF using the PF's + * stride and offset. Only if this candidate devfn matches the + * actual devfn will we link both functions. + */ + for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { + zdev = zbus->function[i]; + if (zdev && zdev->is_physfn) { + pdev = pci_get_slot(zbus->bus, zdev->devfn); + if (!pdev) + continue; + cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); + if (cand_devfn == virtfn->devfn) { + rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); + /* balance pci_get_slot() */ + pci_dev_put(pdev); + break; + } + /* balance pci_get_slot() */ + pci_dev_put(pdev); + } + } + return rc; +} diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h new file mode 100644 index 000000000000..b2c828003bad --- /dev/null +++ b/arch/s390/pci/pci_iov.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 
2020 + * + * Author(s): + * Niklas Schnelle <schnelle@linux.ibm.com> + * + */ + +#ifndef __S390_PCI_IOV_H +#define __S390_PCI_IOV_H + +#ifdef CONFIG_PCI_IOV +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn); + +void zpci_iov_map_resources(struct pci_dev *pdev); + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); + +#else /* CONFIG_PCI_IOV */ +static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} + +static inline void zpci_iov_map_resources(struct pci_dev *pdev) {} + +static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + return 0; +} +#endif /* CONFIG_PCI_IOV */ +#endif /* __S390_PCI_IOV_h */ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index fbe97ab2e228..ff8f24854c64 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -11,16 +11,10 @@ #include <asm/isc.h> #include <asm/airq.h> +#include <asm/tpi.h> static enum {FLOATING, DIRECTED} irq_delivery; -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 -#define SIC_IRQ_MODE_DIRECT 4 -#define SIC_IRQ_MODE_D_ALL 16 -#define SIC_IRQ_MODE_D_SINGLE 17 -#define SIC_IRQ_MODE_SET_CPU 18 - /* * summary bit vector * FLOATING - summary bit per function @@ -35,7 +29,7 @@ static struct airq_iv *zpci_sbv; */ static struct airq_iv **zpci_ibv; -/* Modify PCI: Register adapter interruptions */ +/* Modify PCI: Register floating adapter interruptions */ static int zpci_set_airq(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); @@ -45,21 +39,24 @@ static int zpci_set_airq(struct zpci_dev *zdev) fib.fmt0.isc = PCI_ISC; fib.fmt0.sum = 1; /* enable summary notifications */ fib.fmt0.noi = airq_iv_end(zdev->aibv); - fib.fmt0.aibv = (unsigned long) zdev->aibv->vector; + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ - fib.fmt0.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8; + fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } -/* Modify PCI: Unregister adapter interruptions */ +/* Modify PCI: Unregister floating adapter interruptions */ static int zpci_clear_airq(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev) fib.fmt = 1; fib.fmt1.noi = zdev->msi_nr_irqs; fib.fmt1.dibvo = zdev->msi_first_bit; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) u8 cc, status; fib.fmt = 1; + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -98,14 +97,47 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) return cc ? 
-EIO : 0; } +/* Register adapter interruptions */ +static int zpci_set_irq(struct zpci_dev *zdev) +{ + int rc; + + if (irq_delivery == DIRECTED) + rc = zpci_set_directed_irq(zdev); + else + rc = zpci_set_airq(zdev); + + if (!rc) + zdev->irqs_registered = 1; + + return rc; +} + +/* Clear adapter interruptions */ +static int zpci_clear_irq(struct zpci_dev *zdev) +{ + int rc; + + if (irq_delivery == DIRECTED) + rc = zpci_clear_directed_irq(zdev); + else + rc = zpci_clear_airq(zdev); + + if (!rc) + zdev->irqs_registered = 0; + + return rc; +} + static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { - struct msi_desc *entry = irq_get_msi_desc(data->irq); + struct msi_desc *entry = irq_data_get_msi_desc(data); struct msi_msg msg = entry->msg; + int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); msg.address_lo &= 0xff0000ff; - msg.address_lo |= (cpumask_first(dest) << 8); + msg.address_lo |= (cpu_addr << 8); pci_write_msi_msg(data->irq, &msg); return IRQ_SET_MASK_OK; @@ -115,12 +147,12 @@ static struct irq_chip zpci_irq_chip = { .name = "PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_set_affinity = zpci_set_irq_affinity, }; static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + union zpci_sic_iib iib = {{0}}; unsigned long bit; int irqs_on = 0; @@ -131,8 +163,8 @@ static void zpci_handle_cpu_local_irq(bool rescan) if (!rescan || irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib)) break; bit = 0; continue; @@ -160,6 +192,7 @@ static void zpci_handle_remote_irq(void *data) static void zpci_handle_fallback_irq(void) { struct cpu_irq_data *cpu_data; + union zpci_sic_iib iib = {{0}}; unsigned long cpu; int irqs_on = 0; @@ -169,8 +202,8 @@ static void zpci_handle_fallback_irq(void) if (irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. 
*/ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; cpu = 0; continue; @@ -179,15 +212,16 @@ static void zpci_handle_fallback_irq(void) if (atomic_inc_return(&cpu_data->scheduled) > 1) continue; - cpu_data->csd.func = zpci_handle_remote_irq; - cpu_data->csd.info = &cpu_data->scheduled; - cpu_data->csd.flags = 0; + INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled); smp_call_function_single_async(cpu, &cpu_data->csd); } } -static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_directed_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + bool floating = !tpi_info->directed_irq; + if (floating) { inc_irq_stat(IRQIO_PCF); zpci_handle_fallback_irq(); @@ -197,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) } } -static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_floating_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + union zpci_sic_iib iib = {{0}}; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -211,8 +247,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) if (irqs_on++) /* End of second scan with interrupts on. */ break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; si = 0; continue; @@ -239,6 +275,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) unsigned long bit; struct msi_desc *msi; struct msi_msg msg; + int cpu_addr; int rc, irq; zdev->aisb = -1UL; @@ -260,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) zdev->aisb = bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); if (!zdev->aibv) return -ENOMEM; @@ -272,11 +309,13 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) /* Request MSI interrupts */ hwirq = bit; - for_each_pci_msi_entry(msi, pdev) { + msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { rc = -EIO; if (hwirq - bit >= msi_vecs) break; - irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, msi->affinity); + irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, + (irq_delivery == DIRECTED) ? + msi->affinity : NULL); if (irq < 0) return -ENOMEM; rc = irq_set_msi_desc(irq, msi); @@ -286,9 +325,15 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) handle_percpu_irq); msg.data = hwirq - bit; if (irq_delivery == DIRECTED) { + if (msi->affinity) + cpu = cpumask_first(&msi->affinity->mask); + else + cpu = 0; + cpu_addr = smp_cpu_get_cpu_address(cpu); + msg.address_lo = zdev->msi_addr & 0xff0000ff; - msg.address_lo |= msi->affinity ? 
- (cpumask_first(&msi->affinity->mask) << 8) : 0; + msg.address_lo |= (cpu_addr << 8); + for_each_possible_cpu(cpu) { airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); } @@ -304,10 +349,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) zdev->msi_first_bit = bit; zdev->msi_nr_irqs = msi_vecs; - if (irq_delivery == DIRECTED) - rc = zpci_set_directed_irq(zdev); - else - rc = zpci_set_airq(zdev); + rc = zpci_set_irq(zdev); if (rc) return rc; @@ -321,21 +363,12 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) int rc; /* Disable interrupts */ - if (irq_delivery == DIRECTED) - rc = zpci_clear_directed_irq(zdev); - else - rc = zpci_clear_airq(zdev); + rc = zpci_clear_irq(zdev); if (rc) return; /* Release MSI interrupts */ - for_each_pci_msi_entry(msi, pdev) { - if (!msi->irq) - continue; - if (msi->msi_attrib.is_msix) - __pci_msix_desc_mask_irq(msi, 1); - else - __pci_msi_desc_mask_irq(msi, 1, 1); + msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { irq_set_msi_desc(msi->irq, NULL); irq_free_desc(msi->irq); msi->msg.address_lo = 0; @@ -358,6 +391,15 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); } +bool arch_restore_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + if (!zdev->irqs_registered) + zpci_set_irq(zdev); + return true; +} + static struct airq_struct zpci_airq = { .handler = zpci_floating_irq_handler, .isc = PCI_ISC, @@ -366,11 +408,12 @@ static struct airq_struct zpci_airq = { static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; + union zpci_sic_iib ziib = {{0}}; iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; - __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); - zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib); } static int __init zpci_directed_irq_init(void) @@ -378,14 +421,14 @@ static int __init zpci_directed_irq_init(void) union zpci_sic_iib iib = {{0}}; unsigned int cpu; - zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL); if (!zpci_sbv) return -ENOMEM; iib.diib.isc = PCI_ISC; iib.diib.nr_cpus = num_possible_cpus(); - iib.diib.disb_addr = (u64) zpci_sbv->vector; - __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector); + zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), GFP_KERNEL); @@ -400,7 +443,7 @@ static int __init zpci_directed_irq_init(void) zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, AIRQ_IV_DATA | AIRQ_IV_CACHELINE | - (!cpu ? AIRQ_IV_ALLOC : 0)); + (!cpu ? AIRQ_IV_ALLOC : 0), NULL); if (!zpci_ibv[cpu]) return -ENOMEM; } @@ -417,7 +460,7 @@ static int __init zpci_floating_irq_init(void) if (!zpci_ibv) return -ENOMEM; - zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); if (!zpci_sbv) goto out_free; @@ -430,6 +473,7 @@ out_free: int __init zpci_irq_init(void) { + union zpci_sic_iib iib = {{0}}; int rc; irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; @@ -461,7 +505,7 @@ int __init zpci_irq_init(void) * Enable floating IRQs (with suppression after one IRQ). When using * directed IRQs this enables the fallback path. 
*/ - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib); return 0; out_airq: diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c new file mode 100644 index 000000000000..ff34baf50a3e --- /dev/null +++ b/arch/s390/pci/pci_kvm_hook.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO ZPCI devices support + * + * Copyright (C) IBM Corp. 2022. All rights reserved. + * Author(s): Pierre Morel <pmorel@linux.ibm.com> + */ +#include <linux/kvm_host.h> + +struct zpci_kvm_hook zpci_kvm_hook; +EXPORT_SYMBOL_GPL(zpci_kvm_hook); diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 7d42a8794f10..a90499c087f0 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -11,25 +11,108 @@ #include <linux/mm.h> #include <linux/errno.h> #include <linux/pci.h> +#include <asm/asm-extable.h> +#include <asm/pci_io.h> +#include <asm/pci_debug.h> -static long get_pfn(unsigned long user_addr, unsigned long access, - unsigned long *pfn) +static inline void zpci_err_mmio(u8 cc, u8 status, u64 offset) { - struct vm_area_struct *vma; - long ret; + struct { + u64 offset; + u8 cc; + u8 status; + } data = {offset, cc, status}; - down_read(¤t->mm->mmap_sem); - ret = -EINVAL; - vma = find_vma(current->mm, user_addr); - if (!vma) - goto out; - ret = -EACCES; - if (!(vma->vm_flags & access)) - goto out; - ret = follow_pfn(vma, user_addr, pfn); -out: - up_read(¤t->mm->mmap_sem); - return ret; + zpci_err_hex(&data, sizeof(data)); +} + +static inline int __pcistb_mio_inuser( + void __iomem *ioaddr, const void __user *src, + u64 len, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " sacf 256\n" + "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n" + "1: ipm %[cc]\n" + " srl %[cc],28\n" + "2: sacf 768\n" + EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) + : [cc] "+d" (cc), [len] "+d" (len) + : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src)) + : "cc", "memory"); + *status = len >> 24 & 0xff; + return cc; +} + +static inline int __pcistg_mio_inuser( + void __iomem *ioaddr, const void __user *src, + u64 ulen, u8 *status) +{ + union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + int cc = -ENXIO; + u64 val = 0; + u64 cnt = ulen; + u8 tmp; + + /* + * copy 0 < @len <= 8 bytes from @src into the right most bytes of + * a register, then store it to PCI at @ioaddr while in secondary + * address space. pcistg then uses the user mappings. + */ + asm volatile ( + " sacf 256\n" + "0: llgc %[tmp],0(%[src])\n" + "4: sllg %[val],%[val],8\n" + " aghi %[src],1\n" + " ogr %[val],%[tmp]\n" + " brctg %[cnt],0b\n" + "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n" + "2: ipm %[cc]\n" + " srl %[cc],28\n" + "3: sacf 768\n" + EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b) + : + [src] "+a" (src), [cnt] "+d" (cnt), + [val] "+d" (val), [tmp] "=d" (tmp), + [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) + :: "cc", "memory"); + *status = ioaddr_len.odd >> 24 & 0xff; + + /* did we read everything from user memory? 
*/ + if (!cc && cnt != 0) + cc = -EFAULT; + + return cc; +} + +static inline int __memcpy_toio_inuser(void __iomem *dst, + const void __user *src, size_t n) +{ + int size, rc = 0; + u8 status = 0; + + if (!src) + return -EINVAL; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) dst, + (u64 __force) src, n, + ZPCI_MAX_WRITE_SIZE); + if (size > 8) /* main path */ + rc = __pcistb_mio_inuser(dst, src, size, &status); + else + rc = __pcistg_mio_inuser(dst, src, size, &status); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + if (rc) + zpci_err_mmio(rc, status, (__force u64) dst); + return rc; } SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, @@ -38,7 +121,9 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, u8 local_buf[64]; void __iomem *io_addr; void *buf; - unsigned long pfn; + struct vm_area_struct *vma; + pte_t *ptep; + spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -46,6 +131,22 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length) return -EINVAL; + + /* + * We only support write access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a pfn lookup which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. + */ + if (static_branch_likely(&have_mio)) { + ret = __memcpy_toio_inuser((void __iomem *) mmio_addr, + user_buffer, + length); + return ret; + } + if (length > 64) { buf = kmalloc(length, GFP_KERNEL); if (!buf) @@ -53,32 +154,118 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, } else buf = local_buf; - ret = get_pfn(mmio_addr, VM_WRITE, &pfn); + ret = -EFAULT; + if (copy_from_user(buf, user_buffer, length)) + goto out_free; + + mmap_read_lock(current->mm); + ret = -EINVAL; + vma = vma_lookup(current->mm, mmio_addr); + if (!vma) + goto out_unlock_mmap; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out_unlock_mmap; + ret = -EACCES; + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); if (ret) - goto out; - io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); + goto out_unlock_mmap; - ret = -EFAULT; - if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) - goto out; + io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + (mmio_addr & ~PAGE_MASK)); - if (copy_from_user(buf, user_buffer, length)) - goto out; + if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) + goto out_unlock_pt; ret = zpci_memcpy_toio(io_addr, buf, length); -out: +out_unlock_pt: + pte_unmap_unlock(ptep, ptl); +out_unlock_mmap: + mmap_read_unlock(current->mm); +out_free: if (buf != local_buf) kfree(buf); return ret; } +static inline int __pcilg_mio_inuser( + void __user *dst, const void __iomem *ioaddr, + u64 ulen, u8 *status) +{ + union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + u64 cnt = ulen; + int shift = ulen * 8; + int cc = -ENXIO; + u64 val, tmp; + + /* + * read 0 < @len <= 8 bytes from the PCI memory mapped at @ioaddr (in + * user space) into a register using pcilg then store these bytes at + * user address @dst + */ + asm volatile ( + " sacf 256\n" + "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n" + "1: ipm %[cc]\n" + " srl %[cc],28\n" + " ltr %[cc],%[cc]\n" + " jne 4f\n" + "2: ahi 
%[shift],-8\n" + " srlg %[tmp],%[val],0(%[shift])\n" + "3: stc %[tmp],0(%[dst])\n" + "5: aghi %[dst],1\n" + " brctg %[cnt],2b\n" + "4: sacf 768\n" + EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b) + : + [ioaddr_len] "+&d" (ioaddr_len.pair), + [cc] "+d" (cc), [val] "=d" (val), + [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp), + [shift] "+d" (shift) + :: "cc", "memory"); + + /* did we write everything to the user space buffer? */ + if (!cc && cnt != 0) + cc = -EFAULT; + + *status = ioaddr_len.odd >> 24 & 0xff; + return cc; +} + +static inline int __memcpy_fromio_inuser(void __user *dst, + const void __iomem *src, + unsigned long n) +{ + int size, rc = 0; + u8 status; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) src, + (u64 __force) dst, n, + ZPCI_MAX_READ_SIZE); + rc = __pcilg_mio_inuser(dst, src, size, &status); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + if (rc) + zpci_err_mmio(rc, status, (__force u64) dst); + return rc; +} + SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, void __user *, user_buffer, size_t, length) { u8 local_buf[64]; void __iomem *io_addr; void *buf; - unsigned long pfn; + struct vm_area_struct *vma; + pte_t *ptep; + spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -86,29 +273,62 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length) return -EINVAL; + + /* + * We only support read access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a pfn lookup which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. 
+ */ + if (static_branch_likely(&have_mio)) { + ret = __memcpy_fromio_inuser( + user_buffer, (const void __iomem *)mmio_addr, + length); + return ret; + } + if (length > 64) { buf = kmalloc(length, GFP_KERNEL); if (!buf) return -ENOMEM; - } else + } else { buf = local_buf; + } - ret = get_pfn(mmio_addr, VM_READ, &pfn); + mmap_read_lock(current->mm); + ret = -EINVAL; + vma = vma_lookup(current->mm, mmio_addr); + if (!vma) + goto out_unlock_mmap; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out_unlock_mmap; + ret = -EACCES; + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); if (ret) - goto out; - io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); + goto out_unlock_mmap; + + io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { ret = -EFAULT; - goto out; + goto out_unlock_pt; } ret = zpci_memcpy_fromio(buf, io_addr, length); - if (ret) - goto out; - if (copy_to_user(user_buffer, buf, length)) + +out_unlock_pt: + pte_unmap_unlock(ptep, ptl); +out_unlock_mmap: + mmap_read_unlock(current->mm); + + if (!ret && copy_to_user(user_buffer, buf, length)) ret = -EFAULT; -out: if (buf != local_buf) kfree(buf); return ret; diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index a433ba01a317..8a7abac51816 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -13,6 +13,8 @@ #include <linux/stat.h> #include <linux/pci.h> +#include "../../../drivers/pci/pci.h" + #include <asm/sclp.h> #define zpci_attr(name, fmt, member) \ @@ -31,6 +33,7 @@ zpci_attr(pchid, "0x%04x\n", pchid); zpci_attr(pfgid, "0x%02x\n", pfgid); zpci_attr(vfn, "0x%04x\n", vfn); zpci_attr(pft, "0x%02x\n", pft); +zpci_attr(port, "%d\n", port); zpci_attr(uid, "0x%x\n", uid); zpci_attr(segment0, "0x%02x\n", pfip[0]); zpci_attr(segment1, "0x%02x\n", pfip[1]); @@ -49,31 +52,68 @@ static DEVICE_ATTR_RO(mio_enabled); static ssize_t recover_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + struct kernfs_node *kn; struct pci_dev *pdev = to_pci_dev(dev); struct zpci_dev *zdev = to_zpci(pdev); - int ret; - - if (!device_remove_file_self(dev, attr)) - return count; - + int ret = 0; + u8 status; + + /* Can't use device_remove_self() here as that would lead us to lock + * the pci_rescan_remove_lock while holding the device' kernfs lock. + * This would create a possible deadlock with disable_slot() which is + * not directly protected by the device' kernfs lock but takes it + * during the device removal which happens under + * pci_rescan_remove_lock. + * + * This is analogous to sdev_store_delete() in + * drivers/scsi/scsi_sysfs.c + */ + kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); + WARN_ON_ONCE(!kn); + /* device_remove_file() serializes concurrent calls ignoring all but + * the first + */ + device_remove_file(dev, attr); + + /* A concurrent call to recover_store() may slip between + * sysfs_break_active_protection() and the sysfs file removal. + * Once it unblocks from pci_lock_rescan_remove() the original pdev + * will already be removed. 
+ */ pci_lock_rescan_remove(); - pci_stop_and_remove_bus_device(pdev); - ret = zpci_disable_device(zdev); - if (ret) - goto error; - - ret = zpci_enable_device(zdev); - if (ret) - goto error; - - pci_rescan_bus(zdev->bus); - pci_unlock_rescan_remove(); - - return count; - -error: + if (pci_dev_is_added(pdev)) { + pci_stop_and_remove_bus_device(pdev); + if (zdev_enabled(zdev)) { + ret = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error + * state the FH may indicate an enabled device but + * disable says the device is already disabled don't + * treat it as an error here. + */ + if (ret == -EINVAL) + ret = 0; + if (ret) + goto out; + } + + ret = zpci_enable_device(zdev); + if (ret) + goto out; + + if (zdev->dma_table) { + ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (ret) + zpci_disable_device(zdev); + } + } +out: + pci_rescan_bus(zdev->zbus->bus); pci_unlock_rescan_remove(); - return ret; + if (kn) + sysfs_unbreak_active_protection(kn); + return ret ? ret : count; } static DEVICE_ATTR_WO(recover); @@ -109,6 +149,45 @@ static ssize_t report_error_write(struct file *filp, struct kobject *kobj, } static BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE); +static ssize_t uid_is_unique_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", zpci_unique_uid ? 1 : 0); +} +static DEVICE_ATTR_RO(uid_is_unique); + +#ifndef CONFIG_DMI +/* analogous to smbios index */ +static ssize_t index_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + u32 index = ~0; + + if (zpci_unique_uid) + index = zdev->uid; + + return sysfs_emit(buf, "%u\n", index); +} +static DEVICE_ATTR_RO(index); + +static umode_t zpci_index_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + return zpci_unique_uid ? 
attr->mode : 0; +} + +static struct attribute *zpci_ident_attrs[] = { + &dev_attr_index.attr, + NULL, +}; + +static struct attribute_group zpci_ident_attr_group = { + .attrs = zpci_ident_attrs, + .is_visible = zpci_index_is_visible, +}; +#endif + static struct bin_attribute *zpci_bin_attrs[] = { &bin_attr_util_string, &bin_attr_report_error, @@ -121,12 +200,15 @@ static struct attribute *zpci_dev_attrs[] = { &dev_attr_pchid.attr, &dev_attr_pfgid.attr, &dev_attr_pft.attr, + &dev_attr_port.attr, &dev_attr_vfn.attr, &dev_attr_uid.attr, &dev_attr_recover.attr, &dev_attr_mio_enabled.attr, + &dev_attr_uid_is_unique.attr, NULL, }; + static struct attribute_group zpci_attr_group = { .attrs = zpci_dev_attrs, .bin_attrs = zpci_bin_attrs, @@ -147,5 +229,8 @@ static struct attribute_group pfip_attr_group = { const struct attribute_group *zpci_attr_groups[] = { &zpci_attr_group, &pfip_attr_group, +#ifndef CONFIG_DMI + &zpci_ident_attr_group, +#endif NULL, }; diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore index 04a03433c720..97ca52779457 100644 --- a/arch/s390/purgatory/.gitignore +++ b/arch/s390/purgatory/.gitignore @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only purgatory +purgatory.chk purgatory.lds purgatory.ro diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bc0d7a0d0394..4e930f566878 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -4,40 +4,51 @@ OBJECT_FILES_NON_STANDARD := y purgatory-y := head.o purgatory.o string.o sha256.o mem.o -targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro +targets += $(purgatory-y) purgatory.lds purgatory purgatory.chk purgatory.ro PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS +CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) -$(obj)/string.o: $(srctree)/arch/s390/lib/string.c FORCE - $(call if_changed_rule,cc_o_c) +KCOV_INSTRUMENT := n +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += -fno-stack-protector +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) -LDFLAGS_purgatory := -r --no-undefined -nostdlib -z nodefaultlib -T +# Since we link purgatory with -r unresolved symbols are not checked, so we +# also link a purgatory.chk binary without -r to check for unresolved symbols. 
diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S
index 5a10ce34b95d..0f93f2e72eba 100644
--- a/arch/s390/purgatory/head.S
+++ b/arch/s390/purgatory/head.S
@@ -44,11 +44,14 @@ .endm
 
 .macro MEMSWAP dst,src,buf,len
-10:	cghi	\len,bufsz
+10:	larl	%r0,purgatory_end
+	larl	%r1,stack
+	slgr	%r0,%r1
+	cgr	\len,%r0
 	jh	11f
 	lgr	%r4,\len
 	j	12f
 
-11:	lghi	%r4,bufsz
+11:	lgr	%r4,%r0
 
 12:	MEMCPY	\buf,\dst,%r4
 	MEMCPY	\dst,\src,%r4
@@ -62,19 +65,20 @@
 	jh	10b
 .endm
 
-.macro START_NEXT_KERNEL base
+.macro START_NEXT_KERNEL base subcode
 	lg	%r4,kernel_entry-\base(%r13)
 	lg	%r5,load_psw_mask-\base(%r13)
 	ogr	%r4,%r5
 	stg	%r4,0(%r0)
 
 	xgr	%r0,%r0
-	diag	%r0,%r0,0x308
+	lghi	%r1,\subcode
+	diag	%r0,%r1,0x308
 .endm
 
-.text
-.align PAGE_SIZE
-ENTRY(purgatory_start)
+	.text
+	.balign PAGE_SIZE
+SYM_CODE_START(purgatory_start)
 	/* The purgatory might be called after a diag308 so better set
 	 * architecture and addressing mode.
 	 */
@@ -96,7 +100,7 @@ ENTRY(purgatory_start)
 	 * checksum verification only (%r2 = 0 -> verification only).
 	 *
 	 * Check now and preserve over C function call by storing in
-	 * %r10 whith
+	 * %r10 with
 	 *	1 -> checksum verification only
 	 *	0 -> load new kernel
 	 */
@@ -123,7 +127,7 @@ ENTRY(purgatory_start)
 	je	.start_crash_kernel
 
 	/* start normal kernel */
-	START_NEXT_KERNEL .base_crash
+	START_NEXT_KERNEL .base_crash 0
 
 .return_old_kernel:
 	lmg	%r6,%r15,gprregs-.base_crash(%r13)
@@ -134,12 +138,18 @@ ENTRY(purgatory_start)
 
 .start_crash_kernel:
 	/* Location of purgatory_start in crash memory */
+	larl	%r0,.base_crash
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
 	lgr	%r8,%r13
-	aghi	%r8,-(.base_crash-purgatory_start)
+	sgr	%r8,%r0
 
 	/* Destination for this code i.e. end of memory to be swapped. */
+	larl	%r0,purgatory_end
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
 	lg	%r9,crash_size-.base_crash(%r13)
-	aghi	%r9,-(purgatory_end-purgatory_start)
+	sgr	%r9,%r0
 
 	/* Destination in crash memory, i.e. same as r9 but in crash memory. */
 	lg	%r10,crash_start-.base_crash(%r13)
@@ -148,15 +158,19 @@ ENTRY(purgatory_start)
 	/* Buffer location (in crash memory) and size. As the purgatory is
 	 * behind the point of no return it can re-use the stack as buffer.
 	 */
-	lghi	%r11,bufsz
+	larl	%r11,purgatory_end
 	larl	%r12,stack
+	slgr	%r11,%r12
 
 	MEMCPY	%r12,%r9,%r11	/* dst -> (crash) buf */
 	MEMCPY	%r9,%r8,%r11	/* self -> dst */
 
 	/* Jump to new location. */
 	lgr	%r7,%r9
-	aghi	%r7,.jump_to_dst-purgatory_start
+	larl	%r0,.jump_to_dst
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
+	agr	%r7,%r0
 	br	%r7
 
 .jump_to_dst:
@@ -168,7 +182,10 @@ ENTRY(purgatory_start)
 
 	/* Load new buffer location after jump */
 	larl	%r7,stack
-	aghi	%r10,stack-purgatory_start
+	lgr	%r0,%r7
+	larl	%r1,purgatory_start
+	slgr	%r0,%r1
+	agr	%r10,%r0
 	MEMCPY	%r10,%r7,%r11	/* (new) buf -> (crash) buf */
 
 	/* Now the code is set up to run from its designated location. Start
@@ -227,46 +244,22 @@ ENTRY(purgatory_start)
 	MEMCPY	%r9,%r10,%r11
 
 	/* start crash kernel */
-	START_NEXT_KERNEL .base_dst
-
-
-load_psw_mask:
-	.long	0x00080000,0x80000000
-
-	.align	8
-disabled_wait_psw:
-	.quad	0x0002000180000000
-	.quad	0x0000000000000000 + .do_checksum_verification
-
-gprregs:
-	.rept	10
-	.quad	0
-	.endr
-
-/* Macro to define a global variable with name and size (in bytes) to be
- * shared with C code.
- *
- * Add the .size and .type attribute to satisfy checks on the Elf_Sym during
- * purgatory load.
- */
-.macro GLOBAL_VARIABLE name,size
-\name:
-	.global \name
-	.size	\name,\size
-	.type	\name,object
-	.skip	\size,0
-.endm
-
-GLOBAL_VARIABLE purgatory_sha256_digest,32
-GLOBAL_VARIABLE purgatory_sha_regions,16*__KEXEC_SHA_REGION_SIZE
-GLOBAL_VARIABLE kernel_entry,8
-GLOBAL_VARIABLE kernel_type,8
-GLOBAL_VARIABLE crash_start,8
-GLOBAL_VARIABLE crash_size,8
-
-	.align	PAGE_SIZE
-stack:
+	START_NEXT_KERNEL .base_dst 1
+SYM_CODE_END(purgatory_start)
+
+SYM_DATA_LOCAL(load_psw_mask, .long 0x00080000,0x80000000)
+	.balign 8
+SYM_DATA_LOCAL(disabled_wait_psw, .quad 0x0002000180000000,.do_checksum_verification)
+SYM_DATA_LOCAL(gprregs, .fill 10,8,0)
+SYM_DATA(purgatory_sha256_digest, .skip 32)
+SYM_DATA(purgatory_sha_regions, .skip 16*__KEXEC_SHA_REGION_SIZE)
+SYM_DATA(kernel_entry, .skip 8)
+SYM_DATA(kernel_type, .skip 8)
+SYM_DATA(crash_start, .skip 8)
+SYM_DATA(crash_size, .skip 8)
+	.balign PAGE_SIZE
+SYM_DATA_START_LOCAL(stack)
 	/* The buffer to move this code must be as big as the code. */
 	.skip stack-purgatory_start
-	.align	PAGE_SIZE
-purgatory_end:
+	.balign PAGE_SIZE
+SYM_DATA_END_LABEL(stack, SYM_L_LOCAL, purgatory_end)
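Restated in C for clarity, MEMSWAP exchanges two memory ranges through a fixed-size bounce buffer; the new larl sequences in the patch merely compute the buffer size (purgatory_end - stack) at run time, so the code stays position-independent instead of baking in an assembly-time bufsz constant. A sketch only, not the kernel implementation:

#include <stddef.h>
#include <string.h>

/* Swap len bytes between dst and src using buf (bufsz bytes) as scratch. */
static void memswap(void *dst, void *src, void *buf, size_t bufsz, size_t len)
{
	while (len) {
		size_t n = len > bufsz ? bufsz : len;

		memcpy(buf, dst, n);	/* dst -> buf */
		memcpy(dst, src, n);	/* src -> dst */
		memcpy(src, buf, n);	/* buf -> src */
		dst = (char *)dst + n;
		src = (char *)src + n;
		len -= n;
	}
}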
diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S
index 8293753100ae..25f512b1de12 100644
--- a/arch/s390/purgatory/kexec-purgatory.S
+++ b/arch/s390/purgatory/kexec-purgatory.S
@@ -1,14 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
 
 	.section .rodata, "a"
 
-	.align	8
-kexec_purgatory:
-	.globl	kexec_purgatory
+	.balign	8
+SYM_DATA_START(kexec_purgatory)
 	.incbin	"arch/s390/purgatory/purgatory.ro"
-.Lkexec_purgatroy_end:
+SYM_DATA_END_LABEL(kexec_purgatory, SYM_L_LOCAL, kexec_purgatory_end)
 
-	.align	8
-kexec_purgatory_size:
-	.globl	kexec_purgatory_size
-	.quad	.Lkexec_purgatroy_end - kexec_purgatory
+	.balign	8
+SYM_DATA(kexec_purgatory_size, .quad kexec_purgatory_end-kexec_purgatory)
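The .incbin blob and its size end up as ordinary linker symbols, which is how the generic kexec_file code picks up the purgatory image from C. Roughly as below; a sketch, since the kernel's own declarations live in its kexec headers:

#include <stddef.h>

extern const char kexec_purgatory[];		/* start of purgatory.ro */
extern const size_t kexec_purgatory_size;	/* the .quad emitted above */

/* typical use: size a buffer from the symbol and copy the image in,
 * e.g. memcpy(buf, kexec_purgatory, kexec_purgatory_size);
 */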
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
index 0a423bcf6746..030efda05dbe 100644
--- a/arch/s390/purgatory/purgatory.c
+++ b/arch/s390/purgatory/purgatory.c
@@ -9,7 +9,7 @@
 
 #include <linux/kexec.h>
 #include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
 #include <asm/purgatory.h>
 
 int verify_sha256_digest(void)
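verify_sha256_digest(), whose include line moves to <crypto/sha2.h> here, hashes the regions the kexec loader recorded and compares the result with the digest stored at load time. In outline, lightly simplified from purgatory.c:

int verify_sha256_digest(void)
{
	struct kexec_sha_region *ptr, *end;
	u8 digest[SHA256_DIGEST_SIZE];
	struct sha256_state sctx;

	sha256_init(&sctx);
	end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
	for (ptr = purgatory_sha_regions; ptr < end; ptr++)
		sha256_update(&sctx, (u8 *)ptr->start, ptr->len);
	sha256_final(&sctx, digest);

	/* non-zero means the loaded kernel image was corrupted */
	return memcmp(digest, purgatory_sha256_digest, sizeof(digest)) ? 1 : 0;
}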
diff --git a/arch/s390/purgatory/string.c b/arch/s390/purgatory/string.c
new file mode 100644
index 000000000000..c98c22a72db7
--- /dev/null
+++ b/arch/s390/purgatory/string.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define __HAVE_ARCH_MEMCMP	/* arch function */
+#include "../lib/string.c"
diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss
deleted file mode 100644
index f4f4c2c6dee9..000000000000
--- a/arch/s390/scripts/Makefile.chkbss
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-chkbss-target ?= built-in.a
-$(obj)/$(chkbss-target): chkbss
-
-chkbss-files := $(addsuffix .chkbss, $(chkbss))
-clean-files += $(chkbss-files)
-
-PHONY += chkbss
-chkbss: $(addprefix $(obj)/, $(chkbss-files))
-
-quiet_cmd_chkbss = CHKBSS  $<
-      cmd_chkbss = \
-	if ! $(OBJSIZE) --common $< | $(AWK) 'END { if ($$3) exit 1 }'; then \
-		echo "error: $< .bss section is not empty" >&2; exit 1; \
-	fi; \
-	touch $@;
-
-$(obj)/%.o.chkbss: $(obj)/%.o
-	$(call cmd,chkbss)
diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore
index 71bd6f8eebaf..ea62f37b79ef 100644
--- a/arch/s390/tools/.gitignore
+++ b/arch/s390/tools/.gitignore
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
 gen_facilities
 gen_opcode_table
diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile
index b5e35e8f999a..f9dd47ff9ac4 100644
--- a/arch/s390/tools/Makefile
+++ b/arch/s390/tools/Makefile
@@ -10,8 +10,8 @@ PHONY += kapi
 
 kapi: $(kapi-hdrs-y)
 
-hostprogs-y	+= gen_facilities
-hostprogs-y	+= gen_opcode_table
+hostprogs	+= gen_facilities
+hostprogs	+= gen_opcode_table
 
 HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE)
diff --git a/arch/s390/tools/gcc-thunk-extern.sh b/arch/s390/tools/gcc-thunk-extern.sh
new file mode 100755
index 000000000000..20bcbf6dd7ab
--- /dev/null
+++ b/arch/s390/tools/gcc-thunk-extern.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Borrowed from gcc: gcc/testsuite/gcc.target/s390/nobp-section-type-conflict.c
+# Checks that we don't get error: section type conflict with ‘put_page’.
+
+cat << "END" | $@ -x c - -fno-PIE -march=z10 -mindirect-branch=thunk-extern -mfunction-return=thunk-extern -mindirect-branch-table -O2 -c -o /dev/null
+int a;
+int b (void);
+void c (int);
+
+static void
+put_page (void)
+{
+  if (b ())
+    c (a);
+}
+
+__attribute__ ((__section__ (".init.text"), __cold__)) void
+d (void)
+{
+  put_page ();
+  put_page ();
+}
+END
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 61ce5b59b828..68580cbea4e6 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -27,24 +27,16 @@ static struct facility_def facility_defs[] = {
 	 */
 	.name = "FACILITIES_ALS",
 	.bits = (int[]){
-#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES
 		0, /* N3 instructions */
 		1, /* z/Arch mode installed */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
 		18, /* long displacement facility */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
 		21, /* extended-immediate facility */
 		25, /* store clock fast */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
 		27, /* mvcos */
 		32, /* compare and swap and store */
 		33, /* compare and swap and store 2 */
 		34, /* general instructions extension */
 		35, /* execute extensions */
-#endif
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 		45, /* fast-BCR, etc. */
 #endif
@@ -54,6 +46,7 @@ static struct facility_def facility_defs[] = {
 #endif
 #ifdef CONFIG_HAVE_MARCH_Z13_FEATURES
 		53, /* load-and-zero-rightmost-byte, etc. */
+		129, /* vector */
 #endif
 #ifdef CONFIG_HAVE_MARCH_Z14_FEATURES
 		58, /* miscellaneous-instruction-extension 2 */
@@ -115,6 +108,11 @@ static struct facility_def facility_defs[] = {
 		12, /* AP Query Configuration Information */
 		15, /* AP Facilities Test */
 		156, /* etoken facility */
+		165, /* nnpa facility */
+		193, /* bear enhancement facility */
+		194, /* rdp enhancement facility */
+		196, /* processor activity instrumentation facility */
+		197, /* processor activity instrumentation extension 1 */
 		-1 /* END */
 	}
 },
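The numbers in these .bits arrays are positions in the stfle facility-list bit string, counted from the most significant bit of byte 0; testing one at run time is a byte index plus a mask, the same convention the kernel's test_facility() follows. A sketch:

/* Check facility bit nr in a buffer filled by stfl/stfle. */
static inline int facility_bit_set(const unsigned char *faclist, unsigned int nr)
{
	return (faclist[nr / 8] & (0x80 >> (nr % 8))) != 0;
}

/* e.g. facility_bit_set(list, 129) tests the vector facility that the
 * z13 section above now includes in FACILITIES_ALS.
 */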
diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt
index 46d8ed96cf06..5f008e794898 100644
--- a/arch/s390/tools/opcodes.txt
+++ b/arch/s390/tools/opcodes.txt
@@ -189,6 +189,8 @@ ad stosm SI_URD
 ae sigp RS_RRRD
 af mc SI_URD
 b1 lra RX_RRRD
+b200 lbear S_RD
+b201 stbear S_RD
 b202 stidp S_RD
 b204 sck S_RD
 b205 stck S_RD
@@ -274,6 +276,7 @@ b285 lpctl S_RD
 b286 qsi S_RD
 b287 lsctl S_RD
 b28e qctri S_RD
+b28f qpaci S_RD
 b299 srnm S_RD
 b29c stfpc S_RD
 b29d lfpc S_RD
@@ -523,6 +526,7 @@ b931 clgfr RRE_RR
 b938 sortl RRE_RR
 b939 dfltcc RRF_R0RR2
 b93a kdsa RRE_RR
+b93b nnpa RRE_00
 b93c ppno RRE_RR
 b93e kimd RRE_RR
 b93f klmd RRE_RR
@@ -562,6 +566,7 @@ b987 dlgr RRE_RR
 b988 alcgr RRE_RR
 b989 slbgr RRE_RR
 b98a cspg RRE_RR
+b98b rdp RRF_RURR2
 b98d epsw RRE_RR
 b98e idte RRF_RURR2
 b98f crdte RRF_RURR2
@@ -597,7 +602,7 @@ b9b3 cu42 RRE_RR
 b9bd trtre RRF_U0RR
 b9be srstu RRE_RR
 b9bf trte RRF_U0RR
-b9c0 selhhhr RRF_RURR
+b9c0 selfhr RRF_RURR
 b9c8 ahhhr RRF_R0RR2
 b9c9 shhhr RRF_R0RR2
 b9ca alhhhr RRF_R0RR2
@@ -876,19 +881,32 @@ e63d vstrl VSI_URDV
 e63f vstrlr VRS_RRDV
 e649 vlip VRI_V0UU2
 e650 vcvb VRR_RV0UU
+e651 vclzdp VRR_VV0U2
 e652 vcvbg VRR_RV0UU
+e654 vupkzh VRR_VV0U2
+e655 vcnf VRR_VV0UU2
+e656 vclfnh VRR_VV0UU2
 e658 vcvd VRI_VR0UU
 e659 vsrp VRI_VVUUU2
 e65a vcvdg VRI_VR0UU
 e65b vpsop VRI_VVUUU2
+e65c vupkzl VRR_VV0U2
+e65d vcfn VRR_VV0UU2
+e65e vclfnl VRR_VV0UU2
 e65f vtp VRR_0V
+e670 vpkzr VRI_VVV0UU2
 e671 vap VRI_VVV0UU2
+e672 vsrpr VRI_VVV0UU2
 e673 vsp VRI_VVV0UU2
+e674 vschp VRR_VVV0U0U
+e675 vcrnf VRR_VVV0UU
 e677 vcp VRR_0VV0U
 e678 vmp VRI_VVV0UU2
 e679 vmsp VRI_VVV0UU2
 e67a vdp VRI_VVV0UU2
 e67b vrp VRI_VVV0UU2
+e67c vscshp VRR_VVV
+e67d vcsph VRR_VVV0U0
 e67e vsdp VRI_VVV0UU2
 e700 vleb VRX_VRRDU
 e701 vleh VRX_VRRDU
@@ -1081,6 +1099,7 @@ eb61 stric RSY_RDRU
 eb62 mric RSY_RDRU
 eb6a asi SIY_IRD
 eb6e alsi SIY_IRD
+eb71 lpswey SIY_RD
 eb7a agsi SIY_IRD
 eb7e algsi SIY_IRD
 eb80 icmh RSY_RURD
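Each opcodes.txt line is "<opcode> <mnemonic> <format>" (for example, the new "b200 lbear S_RD"), and gen_opcode_table compiles the file into the lookup tables used by the in-kernel disassembler. A hypothetical minimal reader of the format, not the real generator:

#include <stdio.h>

int main(void)
{
	char opcode[16], mnemonic[16], format[32];

	/* feed arch/s390/tools/opcodes.txt on stdin */
	while (scanf("%15s %15s %31s", opcode, mnemonic, format) == 3)
		printf("%-6s %-8s %s\n", opcode, mnemonic, format);
	return 0;
}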